From 694abe2004c299dae81ed97ebcbf75e79d605092 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Wed, 29 Apr 2026 13:00:39 +0100 Subject: [PATCH 1/3] docs(telemetry): add thread-safety comments to stop() and sdkProvider_.reset() Co-Authored-By: Claude Opus 4.6 --- src/libxrpl/telemetry/Telemetry.cpp | 7 +++++++ src/xrpld/app/main/Application.cpp | 4 ++++ 2 files changed, 11 insertions(+) diff --git a/src/libxrpl/telemetry/Telemetry.cpp b/src/libxrpl/telemetry/Telemetry.cpp index f7fb64d5dd..3b212dc4fa 100644 --- a/src/libxrpl/telemetry/Telemetry.cpp +++ b/src/libxrpl/telemetry/Telemetry.cpp @@ -315,6 +315,13 @@ public: // Force flush with timeout to avoid blocking indefinitely // when the OTLP endpoint is unreachable. sdkProvider_->ForceFlush(std::chrono::milliseconds(5000)); + // TODO: sdkProvider_ is not thread-safe. This reset() races with + // getTracer() if any thread is still calling startSpan(). + // Currently safe because Application::stop() shuts down + // serverHandler_, overlay_, and jobQueue_ before calling + // telemetry_->stop() — so no callers should remain. If the + // shutdown order ever changes, add an std::atomic stopped_ + // flag checked in getTracer() to make this robust. sdkProvider_.reset(); trace_api::Provider::SetTracerProvider( opentelemetry::nostd::shared_ptr( diff --git a/src/xrpld/app/main/Application.cpp b/src/xrpld/app/main/Application.cpp index e222660c39..cad96f382b 100644 --- a/src/xrpld/app/main/Application.cpp +++ b/src/xrpld/app/main/Application.cpp @@ -1661,6 +1661,10 @@ ApplicationImp::run() ledgerCleaner_->stop(); m_nodeStore->stop(); perfLog_->stop(); + // Telemetry must stop last among trace-producing components. + // serverHandler_, overlay_, and jobQueue_ are already stopped above, + // so no threads should be calling startSpan() at this point. + // See TODO in TelemetryImpl::stop() re: thread-safety of sdkProvider_. telemetry_->stop(); JLOG(m_journal.info()) << "Done."; From b7c9e5775e5f8d58d26d3df51c3c6ebde3dc900b Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Wed, 29 Apr 2026 13:00:39 +0100 Subject: [PATCH 2/3] feat(telemetry): add toDisplayString() and use Title Case in consensus attributes Co-Authored-By: Claude Opus 4.6 --- src/xrpld/app/consensus/RCLConsensus.cpp | 8 ++++---- src/xrpld/consensus/ConsensusTypes.h | 20 ++++++++++++++++++++ 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/src/xrpld/app/consensus/RCLConsensus.cpp b/src/xrpld/app/consensus/RCLConsensus.cpp index 9d386d2602..3901fab87e 100644 --- a/src/xrpld/app/consensus/RCLConsensus.cpp +++ b/src/xrpld/app/consensus/RCLConsensus.cpp @@ -342,7 +342,7 @@ RCLConsensus::Adaptor::onClose( span.setAttribute( telemetry::cons_span::attr::ledgerSeq, static_cast(ledger.ledger_->header().seq + 1)); - span.setAttribute(telemetry::cons_span::attr::mode, to_string(mode).c_str()); + span.setAttribute(telemetry::cons_span::attr::mode, toDisplayString(mode).c_str()); bool const wrongLCL = mode == ConsensusMode::wrongLedger; bool const proposing = mode == ConsensusMode::proposing; @@ -995,8 +995,8 @@ RCLConsensus::Adaptor::onModeChange(ConsensusMode before, ConsensusMode after) telemetry::TraceCategory::Consensus, telemetry::seg::consensus, telemetry::cons_span::op::modeChange); - span.setAttribute(telemetry::cons_span::attr::modeOld, to_string(before).c_str()); - span.setAttribute(telemetry::cons_span::attr::modeNew, to_string(after).c_str()); + span.setAttribute(telemetry::cons_span::attr::modeOld, toDisplayString(before).c_str()); + span.setAttribute(telemetry::cons_span::attr::modeNew, toDisplayString(after).c_str()); JLOG(j_.info()) << "Consensus mode change before=" << to_string(before) << ", after=" << to_string(after); @@ -1195,7 +1195,7 @@ RCLConsensus::Adaptor::startRoundTracing(RCLCxLedger const& prevLgr) roundSpan_->setAttribute(cons_span::attr::ledgerId, to_string(prevLgr.id()).c_str()); roundSpan_->setAttribute(cons_span::attr::ledgerSeq, static_cast(prevLgr.seq() + 1)); - roundSpan_->setAttribute(cons_span::attr::mode, to_string(mode_.load()).c_str()); + roundSpan_->setAttribute(cons_span::attr::mode, toDisplayString(mode_.load()).c_str()); roundSpan_->setAttribute(cons_span::attr::traceStrategy, strategy.c_str()); roundSpan_->setAttribute(cons_span::attr::roundId, static_cast(prevLgr.seq() + 1)); diff --git a/src/xrpld/consensus/ConsensusTypes.h b/src/xrpld/consensus/ConsensusTypes.h index 8a81211722..bfbcddcb42 100644 --- a/src/xrpld/consensus/ConsensusTypes.h +++ b/src/xrpld/consensus/ConsensusTypes.h @@ -66,6 +66,26 @@ to_string(ConsensusMode m) } } +/// Title Case display name for telemetry attributes and dashboards. +/// Separate from to_string() which is used in logs and must remain stable. +inline std::string +toDisplayString(ConsensusMode m) +{ + switch (m) + { + case ConsensusMode::proposing: + return "Proposing"; + case ConsensusMode::observing: + return "Observing"; + case ConsensusMode::wrongLedger: + return "Wrong Ledger"; + case ConsensusMode::switchedLedger: + return "Switched Ledger"; + default: + return "Unknown"; + } +} + /** Phases of consensus for a single ledger round. @code From 0dec657c61d88c8c107f8e58764fffd0481c2926 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Wed, 29 Apr 2026 13:00:40 +0100 Subject: [PATCH 3/3] fix(telemetry): rename dashboard provider to xrpld, replace Jaeger with Tempo troubleshooting Co-Authored-By: Claude Opus 4.6 --- .../grafana/provisioning/dashboards/dashboards.yaml | 4 ++-- docs/telemetry-runbook.md | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/docker/telemetry/grafana/provisioning/dashboards/dashboards.yaml b/docker/telemetry/grafana/provisioning/dashboards/dashboards.yaml index 6aeaff31e6..dec8dc87c0 100644 --- a/docker/telemetry/grafana/provisioning/dashboards/dashboards.yaml +++ b/docker/telemetry/grafana/provisioning/dashboards/dashboards.yaml @@ -1,9 +1,9 @@ apiVersion: 1 providers: - - name: rippled-telemetry + - name: xrpld-telemetry orgId: 1 - folder: rippled + folder: xrpld type: file disableDeletion: false editable: true diff --git a/docs/telemetry-runbook.md b/docs/telemetry-runbook.md index 1ff7c4a51b..1772e4bb43 100644 --- a/docs/telemetry-runbook.md +++ b/docs/telemetry-runbook.md @@ -241,12 +241,14 @@ Three dashboards are pre-provisioned in `docker/telemetry/grafana/dashboards/`: ## Troubleshooting -### No traces appearing in Jaeger +### No traces appearing in Tempo 1. Check xrpld logs for `Telemetry starting` message 2. Verify `enabled=1` in the `[telemetry]` config section 3. Test collector connectivity: `curl -v http://localhost:4318/v1/traces` -4. Check collector logs: `docker compose logs otel-collector` +4. Check collector logs: `docker compose -f docker/telemetry/docker-compose.yml logs otel-collector` +5. Verify Tempo is receiving data: open Grafana → Explore → select Tempo datasource → search by `service.name = xrpld` +6. Check Tempo logs: `docker compose -f docker/telemetry/docker-compose.yml logs tempo` ### High memory usage