From dc5bb4b35cea3a5663f4c1d65189ea2a1387b23a Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Fri, 5 Jun 2026 18:29:29 +0100 Subject: [PATCH 01/10] feat(telemetry): emit xrpld_validation_{agreements,missed}_total counters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wire the two previously-registered-but-never-incremented validation counters to ValidationTracker's gross lifetime tallies, exported as monotonic ObservableCounters. New gross atomics count each ledger once at first classification and are never adjusted on late repair, keeping the _total counters monotonic and additive (agreements_total + missed_total == ledgers reconciled); the repair-aware windowed view stays on the existing xrpld_validation_agreement gauge. The validator-health dashboard panels that already query these names now render data instead of "No data". Also de-stale 09-data-collection-reference.md: §5b documented flat metric names (xrpld_cache_SLE_hit_rate, ...) that the code never emits — it emits labeled gauges (xrpld_cache_metrics{metric="SLE_hit_rate"}). Replace the stale flat-name tables with a pointer to the canonical labeled section, reconcile the contradictory headline counts, and correct xrpld_job_count to its real exported name xrpld_jobq_job_count. Adds two GTests asserting gross tallies stay frozen on repair while net totals move, plus the additive invariant. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../09-data-collection-reference.md | 150 ++++++------------ .../libxrpl/telemetry/ValidationTracker.cpp | 90 +++++++++++ src/xrpld/telemetry/MetricsRegistry.cpp | 74 +++++++-- src/xrpld/telemetry/MetricsRegistry.h | 19 ++- src/xrpld/telemetry/ValidationTracker.h | 45 +++++- .../telemetry/detail/ValidationTracker.cpp | 29 +++- 6 files changed, 288 insertions(+), 119 deletions(-) diff --git a/OpenTelemetryPlan/09-data-collection-reference.md b/OpenTelemetryPlan/09-data-collection-reference.md index fc59db1024..b4221857ab 100644 --- a/OpenTelemetryPlan/09-data-collection-reference.md +++ b/OpenTelemetryPlan/09-data-collection-reference.md @@ -337,7 +337,7 @@ prefix=xrpld | `xrpld_Peer_Finder_Active_Outbound_Peers` | PeerfinderManager.cpp | Active outbound peer connections | 10–21 | | `xrpld_Overlay_Peer_Disconnects` | OverlayImpl.cpp | Cumulative peer disconnection count | Low growth | | `xrpld_Overlay_Peer_Disconnects_Charges` | OverlayImpl.cpp | Disconnects due to resource limit charges | Low growth (subset of above) | -| `xrpld_job_count` | JobQueue.cpp | Current job queue depth | 0–100 (healthy) | +| `xrpld_jobq_job_count` | JobQueue.cpp | Current job queue depth (group `jobq`) | 0–100 (healthy) | **Grafana dashboard**: _Node Health (System Metrics)_ (`xrpld-system-node-health`) @@ -592,90 +592,22 @@ count_over_time({job="xrpld"} |= "trace_id=" [5m]) --- -## 5b. Future: Internal Metric Gap Fill (Phase 9) +## 5b. Internal Metric Gap Fill (Phase 9) -> **Status**: Planned, not yet implemented. +> **Status**: Implemented. > **Plan details**: [06-implementation-phases.md §6.8.2](./06-implementation-phases.md) — motivation, architecture, third-party context > **Task breakdown**: [Phase9_taskList.md](./Phase9_taskList.md) — per-task implementation details -Phase 9 fills ~50+ metrics that exist inside xrpld but currently lack time-series export. 
Uses a hybrid approach: `beast::insight` extensions for NodeStore I/O, OTel `ObservableGauge` async callbacks for new categories. +Phase 9 fills the metrics that exist inside xrpld but previously lacked time-series export. It +uses a hybrid approach: `beast::insight` extensions for NodeStore I/O plus OTel `ObservableGauge` +async callbacks for new categories. -### New Metric Categories - -#### NodeStore I/O (via beast::insight) - -| Prometheus Metric | Type | Description | -| ---------------------------------- | ----- | ----------------------------------- | -| `xrpld_nodestore_reads_total` | Gauge | Cumulative read operations | -| `xrpld_nodestore_reads_hit` | Gauge | Cache-served reads | -| `xrpld_nodestore_writes` | Gauge | Cumulative write operations | -| `xrpld_nodestore_written_bytes` | Gauge | Cumulative bytes written | -| `xrpld_nodestore_read_bytes` | Gauge | Cumulative bytes read | -| `xrpld_nodestore_read_duration_us` | Gauge | Cumulative read time (microseconds) | -| `xrpld_nodestore_write_load` | Gauge | Current write load score | -| `xrpld_nodestore_read_queue` | Gauge | Items in read queue | - -#### Cache Hit Rates (via OTel MetricsRegistry) - -| Prometheus Metric | Type | Description | -| ----------------------------- | ----- | ------------------------------------ | -| `xrpld_cache_SLE_hit_rate` | Gauge | SLE cache hit rate (0.0-1.0) | -| `xrpld_cache_ledger_hit_rate` | Gauge | Ledger object cache hit rate | -| `xrpld_cache_AL_hit_rate` | Gauge | AcceptedLedger cache hit rate | -| `xrpld_cache_treenode_size` | Gauge | SHAMap TreeNode cache size (entries) | -| `xrpld_cache_fullbelow_size` | Gauge | FullBelow cache size | - -#### Transaction Queue (via OTel MetricsRegistry) - -| Prometheus Metric | Type | Description | -| ------------------------------------ | ----- | -------------------------------- | -| `xrpld_txq_count` | Gauge | Current transactions in queue | -| `xrpld_txq_max_size` | Gauge | Maximum queue capacity | -| `xrpld_txq_in_ledger` | 
Gauge | Transactions in open ledger | -| `xrpld_txq_per_ledger` | Gauge | Expected transactions per ledger | -| `xrpld_txq_open_ledger_fee_level` | Gauge | Open ledger fee escalation level | -| `xrpld_txq_med_fee_level` | Gauge | Median fee level in queue | -| `xrpld_txq_reference_fee_level` | Gauge | Reference fee level | -| `xrpld_txq_min_processing_fee_level` | Gauge | Minimum fee to get processed | - -#### PerfLog Per-RPC Method (via OTel Metrics SDK) - -| Prometheus Metric | Type | Labels | Description | -| ------------------------------------- | --------- | ----------------- | --------------------------- | -| `xrpld_rpc_method_started_total` | Counter | `method=""` | RPC calls started | -| `xrpld_rpc_method_finished_total` | Counter | `method=""` | RPC calls completed | -| `xrpld_rpc_method_errored_total` | Counter | `method=""` | RPC calls errored | -| `xrpld_rpc_method_duration_us_bucket` | Histogram | `method=""` | Execution time distribution | - -#### PerfLog Per-Job Type (via OTel Metrics SDK) - -| Prometheus Metric | Type | Labels | Description | -| -------------------------------------- | --------- | ------------------- | --------------- | -| `xrpld_job_queued_total` | Counter | `job_type=""` | Jobs queued | -| `xrpld_job_started_total` | Counter | `job_type=""` | Jobs started | -| `xrpld_job_finished_total` | Counter | `job_type=""` | Jobs completed | -| `xrpld_job_queued_duration_us_bucket` | Histogram | `job_type=""` | Queue wait time | -| `xrpld_job_running_duration_us_bucket` | Histogram | `job_type=""` | Execution time | - -#### Counted Object Instances (via OTel MetricsRegistry) - -| Prometheus Metric | Type | Labels | Description | -| -------------------- | ----- | --------------- | ------------------------------- | -| `xrpld_object_count` | Gauge | `type=""` | Live instances of internal type | - -Tracked types: `Transaction`, `Ledger`, `NodeObject`, `STTx`, `STLedgerEntry`, `InboundLedger`, `Pathfinder`, `PathRequest`, `HashRouterEntry` - 
-#### Fee Escalation & Load Factors (via OTel MetricsRegistry) - -| Prometheus Metric | Type | Description | -| ---------------------------------- | ----- | ------------------------------------ | -| `xrpld_load_factor` | Gauge | Combined transaction cost multiplier | -| `xrpld_load_factor_server` | Gauge | Server + cluster + network load | -| `xrpld_load_factor_local` | Gauge | Local server load only | -| `xrpld_load_factor_net` | Gauge | Network-wide load estimate | -| `xrpld_load_factor_cluster` | Gauge | Cluster peer load | -| `xrpld_load_factor_fee_escalation` | Gauge | Open ledger fee escalation | -| `xrpld_load_factor_fee_queue` | Gauge | Queue entry fee level | +> **Authoritative metric names live in [§ Phase 9: OTel SDK-Exported Metrics](#phase-9-otel-sdk-exported-metrics-metricsregistry) below.** +> Most internal metrics are emitted as **labeled** gauges — one instrument carrying many logical +> values via a `metric` label (e.g. `xrpld_cache_metrics{metric="SLE_hit_rate"}`, +> `xrpld_txq_metrics{metric="txq_count"}`, `xrpld_load_factor_metrics{metric="load_factor"}`, +> `xrpld_nodestore_state{metric="node_reads_total"}`) — not the flat per-name form. Query the +> labeled names; the flat names (`xrpld_cache_SLE_hit_rate`, `xrpld_txq_count`, …) are **not** emitted. 
#### Server Info (via OTel MetricsRegistry) @@ -746,15 +678,23 @@ Phase 10 builds a 5-node validator docker-compose harness with RPC load generato ### Validated Telemetry Inventory -| Category | Expected Count | Validation Method | -| ------------------ | -------------- | -------------------------------- | -| Trace spans | 16 | Jaeger/Tempo API query | -| Span attributes | 22 | Per-span attribute assertion | -| StatsD metrics | 255+ | Prometheus query | -| Phase 9 metrics | 68+ | Prometheus query | -| SpanMetrics RED | 4 per span | Prometheus query | -| Grafana dashboards | 10 | Dashboard API "no data" check | -| Log-trace links | Present | Loki query + Tempo reverse check | +> **Counting note — families vs series.** A _metric family_ is one distinct Prometheus `__name__` +> (histogram `_bucket`/`_count`/`_sum` collapsed to one). A _series_ is a family × its label +> combinations. The legacy overlay-traffic block is the bulk of the count: ~56 message categories × +> 4 (`_Bytes_In/_Out`, `_Messages_In/_Out`) ≈ 224 families on its own. The labeled gauges +> (`xrpld_cache_metrics{metric}`, …) are few families but many series. Validate against the figures +> below as **families currently emitting** (idle nodes under-report — workload-gated metrics such as +> per-RPC/error counters appear only once exercised, which is Phase 10's purpose). 
+ +| Category | Expected Count | Validation Method | +| ------------------------- | ------------------- | -------------------------------- | +| Trace spans | 16 | Jaeger/Tempo API query | +| Span attributes | 22 | Per-span attribute assertion | +| Legacy `xrpld_*` families | ~270 (≈224 traffic) | Prometheus `__name__` query | +| Native MetricsRegistry | 35 instruments | Prometheus query | +| SpanMetrics RED | 4 per span | Prometheus query | +| Grafana dashboards | 10 | Dashboard API "no data" check | +| Log-trace links | Present | Loki query + Tempo reverse check | --- @@ -998,15 +938,27 @@ State value encoding: 0=disconnected, 1=connected, 2=syncing, 3=tracking, 4=full #### Synchronous Counters (Phase 7+) -| Prometheus Metric | Type | Description | Increment Site | -| ----------------------------------- | ------- | -------------------------------- | --------------------- | -| `xrpld_ledgers_closed_total` | Counter | Ledgers closed by consensus | RCLConsensus.cpp | -| `xrpld_validations_sent_total` | Counter | Validations sent | RCLConsensus.cpp | -| `xrpld_validations_checked_total` | Counter | Network validations observed | LedgerMaster.cpp | -| `xrpld_validation_agreements_total` | Counter | Cumulative validation agreements | ValidationTracker.cpp | -| `xrpld_validation_missed_total` | Counter | Cumulative validation misses | ValidationTracker.cpp | -| `xrpld_state_changes_total` | Counter | Operating mode transitions | NetworkOPs.cpp | -| `xrpld_jq_trans_overflow_total` | Counter | Job queue transaction overflows | JobQueue.cpp | +| Prometheus Metric | Type | Description | Increment Site | +| --------------------------------- | ------- | ------------------------------- | ---------------- | +| `xrpld_ledgers_closed_total` | Counter | Ledgers closed by consensus | RCLConsensus.cpp | +| `xrpld_validations_sent_total` | Counter | Validations sent | RCLConsensus.cpp | +| `xrpld_validations_checked_total` | Counter | Network validations observed | LedgerMaster.cpp | 
+| `xrpld_state_changes_total` | Counter | Operating mode transitions | NetworkOPs.cpp | +| `xrpld_jq_trans_overflow_total` | Counter | Job queue transaction overflows | JobQueue.cpp | + +Lifetime validation agreement/miss tallies are exported as monotonic **ObservableCounters** +(not synchronous counters) observed from `ValidationTracker`'s gross lifetime totals: + +| Prometheus Metric | Type | Description | Source | +| ----------------------------------- | ----------------- | ------------------------------------------ | --------------------- | +| `xrpld_validation_agreements_total` | ObservableCounter | Lifetime validations that initially agreed | ValidationTracker.cpp | +| `xrpld_validation_missed_total` | ObservableCounter | Lifetime validations that initially missed | ValidationTracker.cpp | + +> **Counting semantics (initial-classification only):** each reconciled ledger increments exactly +> one of these two counters, at first classification. A late repair (miss → agreement) does +> **not** move either counter — keeping both strictly monotonic (a Prometheus `_total` must never +> decrease) and additive (`agreements_total + missed_total` = ledgers reconciled). The +> repair-aware, windowed view remains on `xrpld_validation_agreement{metric="…"}`. 
#### Span Attribute Enrichments (Phases 2-4) @@ -1071,7 +1023,7 @@ State value encoding: 0=disconnected, 1=connected, 2=syncing, 3=tracking, 4=full | Issue | Impact | Status | | ------------------------------------------------------------------ | ------------------------------------------------ | -------------------------------------------------------------------- | | `warn` and `drop` metrics use non-standard StatsD `\|m` meter type | Metrics silently dropped by OTel StatsD receiver | Phase 6 Task 6.1 — needs `\|m` → `\|c` change in StatsDCollector.cpp | -| `xrpld_job_count` may not emit in standalone mode | Missing from Prometheus in some test configs | Requires active job queue activity | +| `xrpld_jobq_job_count` may not emit in standalone mode | Missing from Prometheus in some test configs | Requires active job queue activity | | `xrpld_rpc_requests` depends on `[insight]` config | Zero series if StatsD not configured | Requires `[insight] server=statsd` in xrpld.cfg | | Peer tracing disabled by default | No `peer.*` spans unless `trace_peer=1` | Intentional — high volume on mainnet | diff --git a/src/tests/libxrpl/telemetry/ValidationTracker.cpp b/src/tests/libxrpl/telemetry/ValidationTracker.cpp index 7a5179c871..d2b96aa616 100644 --- a/src/tests/libxrpl/telemetry/ValidationTracker.cpp +++ b/src/tests/libxrpl/telemetry/ValidationTracker.cpp @@ -132,6 +132,8 @@ TEST_F(ValidationTrackerTest, EmptyWindowReturnsZero) EXPECT_EQ(tracker_.missed24h(), 0u); EXPECT_EQ(tracker_.totalAgreements(), 0u); EXPECT_EQ(tracker_.totalMissed(), 0u); + EXPECT_EQ(tracker_.totalAgreementsEver(), 0u); + EXPECT_EQ(tracker_.totalMissedEver(), 0u); EXPECT_EQ(tracker_.totalValidationsSent(), 0u); EXPECT_EQ(tracker_.totalValidationsChecked(), 0u); } @@ -282,3 +284,91 @@ TEST_F(ValidationTrackerTest, OnlyWeValidated) EXPECT_EQ(tracker_.missed1h(), 1u); EXPECT_DOUBLE_EQ(tracker_.agreementPct1h(), 0.0); } + +// --------------------------------------------------------------- +// 10. 
Gross miss tally is monotonic across a late repair +// The gross lifetime tallies (totalAgreementsEver/totalMissedEver) +// back the monotonic Prometheus _total counters. A late repair must +// move the NET totals (miss -> agreement) but must NOT move the gross +// tallies: a miss already counted stays counted, and the repair does +// not add a second (agreement) count for the same ledger. +// --------------------------------------------------------------- +TEST_F(ValidationTrackerTest, GrossMissedNeverDecrementsOnRepair) +{ + auto const hash = makeHash(10); + LedgerIndex const seq = 1000; + + // Network validates, we do not (yet). + tracker_.recordNetworkValidation(hash, seq); + + // Grace period elapses -- reconciled as a miss. + std::this_thread::sleep_for(std::chrono::seconds(9)); + tracker_.reconcile(); + + // Net and gross both show exactly one initial miss, zero agreements. + EXPECT_EQ(tracker_.totalMissed(), 1u); + EXPECT_EQ(tracker_.totalMissedEver(), 1u); + EXPECT_EQ(tracker_.totalAgreements(), 0u); + EXPECT_EQ(tracker_.totalAgreementsEver(), 0u); + + // Late arrival of our validation repairs the miss to an agreement. + tracker_.recordOurValidation(hash, seq); + tracker_.reconcile(); + + // Net totals reflect the repair... + EXPECT_EQ(tracker_.totalMissed(), 0u); + EXPECT_EQ(tracker_.totalAgreements(), 1u); + // ...but the gross tallies are frozen at first classification: the miss + // stays counted and no agreement was added (repair path excluded). + EXPECT_EQ(tracker_.totalMissedEver(), 1u); + EXPECT_EQ(tracker_.totalAgreementsEver(), 0u); +} + +// --------------------------------------------------------------- +// 11. Gross tallies count initial classification only (additive) +// With a mix of initial agreements and misses the gross tallies equal +// the net totals. 
A subsequent repair shifts the net totals but leaves +// the gross tallies unchanged, and the gross sum equals the number of +// reconciled ledgers (the additive invariant the _total counters rely on). +// --------------------------------------------------------------- +TEST_F(ValidationTrackerTest, GrossAgreementsCountInitialOnly) +{ + // 3 initial agreements: both sides validate. + for (int i = 1; i <= 3; ++i) + { + auto const h = makeHash(static_cast(i)); + tracker_.recordOurValidation(h, static_cast(i)); + tracker_.recordNetworkValidation(h, static_cast(i)); + } + + // 2 initial misses: only network validates. + for (int i = 4; i <= 5; ++i) + { + auto const h = makeHash(static_cast(i)); + tracker_.recordNetworkValidation(h, static_cast(i)); + } + + // Grace period elapses -- all five reconciled at first classification. + std::this_thread::sleep_for(std::chrono::seconds(9)); + tracker_.reconcile(); + + // Before any repair, gross equals net. + EXPECT_EQ(tracker_.totalAgreements(), 3u); + EXPECT_EQ(tracker_.totalAgreementsEver(), 3u); + EXPECT_EQ(tracker_.totalMissed(), 2u); + EXPECT_EQ(tracker_.totalMissedEver(), 2u); + + // Repair one of the misses (hash 4) within the repair window. + tracker_.recordOurValidation(makeHash(4), 4); + tracker_.reconcile(); + + // Net totals shift by the repair... + EXPECT_EQ(tracker_.totalAgreements(), 4u); + EXPECT_EQ(tracker_.totalMissed(), 1u); + // ...gross tallies stay at the initial classification. + EXPECT_EQ(tracker_.totalAgreementsEver(), 3u); + EXPECT_EQ(tracker_.totalMissedEver(), 2u); + + // Additive invariant: gross agree + gross miss == ledgers reconciled. 
+ EXPECT_EQ(tracker_.totalAgreementsEver() + tracker_.totalMissedEver(), 5u); +} diff --git a/src/xrpld/telemetry/MetricsRegistry.cpp b/src/xrpld/telemetry/MetricsRegistry.cpp index 8ca0c15889..bd51db3b51 100644 --- a/src/xrpld/telemetry/MetricsRegistry.cpp +++ b/src/xrpld/telemetry/MetricsRegistry.cpp @@ -244,10 +244,9 @@ MetricsRegistry::start(std::string const& endpoint, std::string const& instanceI "xrpld_txq_expired_total", "Total transactions expired out of the transaction queue"); txqDroppedCounter_ = meter_->CreateUInt64Counter( "xrpld_txq_dropped_total", "Total transactions refused admission to the queue by reason"); - validationAgreementsCounter_ = meter_->CreateUInt64Counter( - "xrpld_validation_agreements_total", "Total validation agreements"); - validationMissedCounter_ = - meter_->CreateUInt64Counter("xrpld_validation_missed_total", "Total validation misses"); + // Note: xrpld_validation_agreements_total / xrpld_validation_missed_total + // are monotonic ObservableCounters created in registerValidationTotalsCounters() + // (below), observed from ValidationTracker's gross lifetime tallies. // Register all observable (async) gauges. registerAsyncGauges(); @@ -441,6 +440,7 @@ MetricsRegistry::registerAsyncGauges() registerStateTrackingGauge(); registerStorageDetailGauge(); registerValidationAgreementGauge(); + registerValidationTotalsCounters(); } void @@ -1325,13 +1325,67 @@ MetricsRegistry::registerValidationAgreementGauge() } }, this); +} - // Note: validationAgreementsCounter_ and validationMissedCounter_ are - // created above but not currently incremented. The - // xrpld_validation_agreement gauge already provides agreement and miss - // counts from ValidationTracker's rolling windows and lifetime totals. - // These counters are reserved for future use if a push-style counter - // integration with ValidationTracker is desired. +void +MetricsRegistry::registerValidationTotalsCounters() +{ + // Lifetime validation agreement/miss counters. 
+ // + // These are monotonic ObservableCounters (not the sync Counters they used + // to be): a Prometheus _total must never decrease, but ValidationTracker's + // NET totals are non-monotonic (a late repair decrements the net miss + // count). We therefore observe the tracker's GROSS lifetime tallies, which + // count each ledger once at first classification and are never adjusted on + // repair (initial-classification semantics — see ValidationTracker). The + // repaired/agreement view remains available from xrpld_validation_agreement. + // + // reconcile() is called first so pending events are resolved before the + // tallies are read; the callback fires every ~10 s from the + // PeriodicExportingMetricReader thread. + validationAgreementsObservable_ = meter_->CreateInt64ObservableCounter( + "xrpld_validation_agreements_total", + "Lifetime validations that initially agreed with network consensus"); + validationAgreementsObservable_->AddCallback( + [](opentelemetry::metrics::ObserverResult result, void* state) { + auto* self = static_cast(state); + if (self->callbacksDetached_.load(std::memory_order_acquire)) + return; + try + { + self->validationTracker_.reconcile(); + opentelemetry::nostd::get>>(result) + ->Observe(static_cast(self->validationTracker_.totalAgreementsEver())); + } + catch (...) // NOLINT(bugprone-empty-catch) + { + // Silently skip on error. + } + }, + this); + + validationMissedObservable_ = meter_->CreateInt64ObservableCounter( + "xrpld_validation_missed_total", + "Lifetime validations that initially missed network consensus"); + validationMissedObservable_->AddCallback( + [](opentelemetry::metrics::ObserverResult result, void* state) { + auto* self = static_cast(state); + if (self->callbacksDetached_.load(std::memory_order_acquire)) + return; + try + { + self->validationTracker_.reconcile(); + opentelemetry::nostd::get>>(result) + ->Observe(static_cast(self->validationTracker_.totalMissedEver())); + } + catch (...) 
// NOLINT(bugprone-empty-catch) + { + // Silently skip on error. + } + }, + this); } #endif // XRPL_ENABLE_TELEMETRY diff --git a/src/xrpld/telemetry/MetricsRegistry.h b/src/xrpld/telemetry/MetricsRegistry.h index 63a240ef75..f0986b0b33 100644 --- a/src/xrpld/telemetry/MetricsRegistry.h +++ b/src/xrpld/telemetry/MetricsRegistry.h @@ -529,13 +529,16 @@ private: /// Counter: xrpld_txq_dropped_total{reason} — incremented when a transaction is refused /// admission to the queue. opentelemetry::nostd::unique_ptr> txqDroppedCounter_; - /// Counter: xrpld_validation_agreements_total — incremented by ValidationTracker on - /// agreement. - opentelemetry::nostd::unique_ptr> - validationAgreementsCounter_; - /// Counter: xrpld_validation_missed_total — incremented by ValidationTracker on miss. - opentelemetry::nostd::unique_ptr> - validationMissedCounter_; + /// ObservableCounter: xrpld_validation_agreements_total — observed from + /// ValidationTracker::totalAgreementsEver() (monotonic gross lifetime + /// tally, initial-classification semantics). + opentelemetry::nostd::shared_ptr + validationAgreementsObservable_; + /// ObservableCounter: xrpld_validation_missed_total — observed from + /// ValidationTracker::totalMissedEver() (monotonic gross lifetime tally, + /// initial-classification semantics). + opentelemetry::nostd::shared_ptr + validationMissedObservable_; /** Register all observable gauge callbacks with the OTel SDK. 
Dispatches to one helper per metric domain so that each helper @@ -580,6 +583,8 @@ private: registerStorageDetailGauge(); // Task 7.13 void registerValidationAgreementGauge(); // Task 7.15 + void + registerValidationTotalsCounters(); // gap-fill: lifetime agree/miss _total #endif // XRPL_ENABLE_TELEMETRY }; diff --git a/src/xrpld/telemetry/ValidationTracker.h b/src/xrpld/telemetry/ValidationTracker.h index dac2f9c706..301ad31fe0 100644 --- a/src/xrpld/telemetry/ValidationTracker.h +++ b/src/xrpld/telemetry/ValidationTracker.h @@ -186,6 +186,26 @@ public: uint64_t totalMissed() const; + /** Lifetime agreements counted at first classification only. + * @note Unlike totalAgreements(), this is strictly monotonic: it is + * incremented only when a ledger is first reconciled as an agreement and + * is never adjusted by a late repair. It backs the monotonic Prometheus + * counter xrpld_validation_agreements_total. See the counting-semantics + * note in detail/ValidationTracker.cpp. + */ + uint64_t + totalAgreementsEver() const; + + /** Lifetime misses counted at first classification only. + * @note Unlike totalMissed(), this is strictly monotonic: it is + * incremented only when a ledger is first reconciled as a miss and is + * never decremented by a late repair. It backs the monotonic Prometheus + * counter xrpld_validation_missed_total. See the counting-semantics note + * in detail/ValidationTracker.cpp. + */ + uint64_t + totalMissedEver() const; + /** Total validations this node sent. */ uint64_t totalValidationsSent() const; @@ -254,12 +274,33 @@ private: /// Sliding window of reconciled events (last 7 days). std::deque window7d_; - /// Lifetime count of agreements. + /// Lifetime count of agreements (net: incremented on agree, also on + /// repair). May be read via totalAgreements(); feeds the windowed gauge. std::atomic totalAgreements_{0}; - /// Lifetime count of misses. + /// Lifetime count of misses (net: incremented on miss, decremented on + /// repair). 
NON-monotonic. May be read via totalMissed(). std::atomic totalMissed_{0}; + // Monotonic "gross" lifetime tallies for the Prometheus _total counters. + // + // Counting decision (initial-classification only): each reconciled ledger + // is counted exactly once, at its first classification, into exactly one + // of the two tallies below. A later late-repair (miss -> agreement) does + // NOT move either tally. This keeps both strictly monotonic (a Prometheus + // _total must never decrease) and additive: + // totalAgreementsGross_ + totalMissedGross_ == ledgers reconciled. + // The repaired/agreement view is still available from the windowed gauge + // (xrpld_validation_agreement) and the net totals above. + + /// Monotonic lifetime initial agreements; backs + /// xrpld_validation_agreements_total. Never adjusted on repair. + std::atomic totalAgreementsGross_{0}; + + /// Monotonic lifetime initial misses; backs xrpld_validation_missed_total. + /// Never decremented on repair. + std::atomic totalMissedGross_{0}; + /// Lifetime count of validations this node sent. std::atomic totalValidationsSent_{0}; diff --git a/src/xrpld/telemetry/detail/ValidationTracker.cpp b/src/xrpld/telemetry/detail/ValidationTracker.cpp index 38e065d8b5..a3124100d0 100644 --- a/src/xrpld/telemetry/detail/ValidationTracker.cpp +++ b/src/xrpld/telemetry/detail/ValidationTracker.cpp @@ -63,10 +63,16 @@ ValidationTracker::reconcile() if (evt.agreed) { totalAgreements_.fetch_add(1, std::memory_order_relaxed); + // Gross tally: count the initial agreement once. See the + // counting-decision note below (repair branch). + totalAgreementsGross_.fetch_add(1, std::memory_order_relaxed); } else { totalMissed_.fetch_add(1, std::memory_order_relaxed); + // Gross tally: count the initial miss once. See the + // counting-decision note below (repair branch). 
+ totalMissedGross_.fetch_add(1, std::memory_order_relaxed); } WindowEvent const we{.time = now, .ledgerHash = evt.ledgerHash, .agreed = evt.agreed}; @@ -78,11 +84,20 @@ ValidationTracker::reconcile() evt.reconciled && !evt.agreed && evt.weValidated && evt.networkValidated && (now - evt.recordTime) <= kLateRepairWindow) { - // Late repair: was a miss, now both flags set. + // Late repair: was a miss, now both flags set. Adjust the NET + // totals (used by the windowed agreement gauge) so the live view + // reflects the repair. evt.agreed = true; totalMissed_.fetch_sub(1, std::memory_order_relaxed); totalAgreements_.fetch_add(1, std::memory_order_relaxed); + // Counting decision (initial-classification only): the gross + // tallies (totalAgreementsGross_ / totalMissedGross_) that back the + // monotonic Prometheus _total counters are deliberately NOT touched + // here. Each ledger is counted once, at first classification; a + // repair must not decrement missed (a _total may never decrease) + // nor add a second agreement (which would double-count the ledger). + // Flip the corresponding window entries from miss to agreement. 
repairWindowEntry(window1h_, evt.ledgerHash); repairWindowEntry(window24h_, evt.ledgerHash); @@ -253,6 +268,18 @@ ValidationTracker::totalMissed() const return totalMissed_.load(std::memory_order_relaxed); } +uint64_t +ValidationTracker::totalAgreementsEver() const +{ + return totalAgreementsGross_.load(std::memory_order_relaxed); +} + +uint64_t +ValidationTracker::totalMissedEver() const +{ + return totalMissedGross_.load(std::memory_order_relaxed); +} + uint64_t ValidationTracker::totalValidationsSent() const { From f7df1742fb00dc6361f4d218fdf2b22b18fef5a0 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Fri, 5 Jun 2026 18:40:58 +0100 Subject: [PATCH 02/10] fix(telemetry): drop bool close_time_correct filter from close-time panels The five Close Time panels still rendered "No Data" after the metrics rewrite. Root cause: each query carried `span.close_time_correct=~"$close_time_correct"`, but close_time_correct is a boolean span attribute and TraceQL's regex match (=~) against a bool matches nothing in a metrics query, so every panel returned an empty series set (HTTP 200, {"series":[]}). Remove that filter clause. The panels do not break down by close_time_correct, so dropping it restores data without losing any dimension. The $node filter (a string attribute) is unaffected and stays. Verified via the Grafana datasource proxy that all six targets now return series. 
Co-Authored-By: Claude Opus 4.8 --- .../grafana/dashboards/consensus-health.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docker/telemetry/grafana/dashboards/consensus-health.json b/docker/telemetry/grafana/dashboards/consensus-health.json index c5baf674ca..239c7aed9f 100644 --- a/docker/telemetry/grafana/dashboards/consensus-health.json +++ b/docker/telemetry/grafana/dashboards/consensus-health.json @@ -429,7 +429,7 @@ "uid": "tempo" }, "queryType": "traceql", - "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\"} | quantile_over_time(span.close_time_self, .5) by (resource.service.instance.id)", + "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\"} | quantile_over_time(span.close_time_self, .5) by (resource.service.instance.id)", "legendFormat": "{{service.instance.id}}", "refId": "A" } @@ -474,7 +474,7 @@ "uid": "tempo" }, "queryType": "traceql", - "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\"} | quantile_over_time(span.close_time, .5) by (resource.service.instance.id)", + "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\"} | quantile_over_time(span.close_time, .5) by (resource.service.instance.id)", "legendFormat": "{{service.instance.id}}", "refId": "A" } @@ -552,7 +552,7 @@ "uid": "tempo" }, "queryType": "traceql", - "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\"} | avg_over_time(span.close_time_vote_bins)", + "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\"} | avg_over_time(span.close_time_vote_bins)", "legendFormat": "Avg Vote Bins", "refId": "A" }, @@ -562,7 +562,7 @@ "uid": "tempo" }, "queryType": "traceql", - "query": 
"{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\"} | avg_over_time(span.close_resolution_ms)", + "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\"} | avg_over_time(span.close_resolution_ms)", "legendFormat": "Avg Resolution (ms)", "refId": "B" } @@ -607,7 +607,7 @@ "uid": "tempo" }, "queryType": "traceql", - "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\" && span.resolution_direction=~\"$resolution_direction\"} | count_over_time() by (span.resolution_direction)", + "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.resolution_direction=~\"$resolution_direction\"} | count_over_time() by (span.resolution_direction)", "legendFormat": "{{span.resolution_direction}}", "refId": "A" } @@ -653,7 +653,7 @@ "uid": "tempo" }, "queryType": "traceql", - "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\"} | count_over_time() by (span.close_time_vote_bins)", + "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\"} | count_over_time() by (span.close_time_vote_bins)", "legendFormat": "{{span.close_time_vote_bins}} bins", "refId": "A" } From 283218896bd56a031a62d53605d20c07916c82c6 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Fri, 5 Jun 2026 18:48:31 +0100 Subject: [PATCH 03/10] fix(telemetry): use avg not quantile for close-time value panels The Raw Proposals and Effective/Quantized panels showed wrong values (e.g. 759M, 852M, even 0) against a true value of ~834M. Cause: quantile_over_time bucketizes into an exponential histogram tuned for duration distributions, so it cannot represent large absolute integers (Ripple-epoch seconds) accurately. 
Switch both panels to avg_over_time, which returns the correct value (verified ~833,996,7xx matching the raw span attribute). Average is also the semantically right aggregation here: close time is a single agreed value per consensus round, not a latency distribution, so a median was never meaningful. Set the unit to none rather than seconds: the value is Ripple-epoch seconds (Unix = value + 946684800) and TraceQL metrics cannot do the offset arithmetic in-query, so a duration unit would misrender it. Clarify in the description that the absolute level tracks wall-clock and the useful signal is per-node spread / raw-vs-effective gap. Co-Authored-By: Claude Opus 4.8 --- .../grafana/dashboards/consensus-health.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docker/telemetry/grafana/dashboards/consensus-health.json b/docker/telemetry/grafana/dashboards/consensus-health.json index 239c7aed9f..04c4addce7 100644 --- a/docker/telemetry/grafana/dashboards/consensus-health.json +++ b/docker/telemetry/grafana/dashboards/consensus-health.json @@ -392,7 +392,7 @@ }, { "title": "Close Time: Raw Proposals (Per Node)", - "description": "Each node's raw proposed close time (close_time_self) \u2014 the unrounded wall clock value at the moment the node closed its ledger. Compare across nodes to see clock drift. Values are Ripple-epoch seconds (since 2000-01-01).", + "description": "Each node's raw proposed close time (close_time_self) \u2014 the unrounded wall clock value at the moment the node closed its ledger. Compare across nodes to see clock drift. 
Value is Ripple-epoch seconds (Unix = value + 946684800); it tracks wall-clock so the absolute number is large and near-constant \u2014 watch per-node spread and raw-vs-effective gap, not the absolute level.", "type": "timeseries", "gridPos": { "h": 8, @@ -402,7 +402,7 @@ }, "fieldConfig": { "defaults": { - "unit": "s", + "unit": "none", "custom": { "drawStyle": "points", "pointSize": 6, @@ -429,7 +429,7 @@ "uid": "tempo" }, "queryType": "traceql", - "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\"} | quantile_over_time(span.close_time_self, .5) by (resource.service.instance.id)", + "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\"} | avg_over_time(span.close_time_self) by (resource.service.instance.id)", "legendFormat": "{{service.instance.id}}", "refId": "A" } @@ -437,7 +437,7 @@ }, { "title": "Close Time: Effective / Quantized", - "description": "The consensus-agreed close time after rounding to the current resolution bin (close_time). This is the value written to the ledger header. All nodes in agreement produce the same value. Values are Ripple-epoch seconds (since 2000-01-01).", + "description": "The consensus-agreed close time after rounding to the current resolution bin (close_time). This is the value written to the ledger header. All nodes in agreement produce the same value. 
Value is Ripple-epoch seconds (Unix = value + 946684800); it tracks wall-clock so the absolute number is large and near-constant \u2014 watch per-node spread and raw-vs-effective gap, not the absolute level.", "type": "timeseries", "gridPos": { "h": 8, @@ -447,7 +447,7 @@ }, "fieldConfig": { "defaults": { - "unit": "s", + "unit": "none", "custom": { "drawStyle": "points", "pointSize": 6, @@ -474,7 +474,7 @@ "uid": "tempo" }, "queryType": "traceql", - "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\"} | quantile_over_time(span.close_time, .5) by (resource.service.instance.id)", + "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\"} | avg_over_time(span.close_time) by (resource.service.instance.id)", "legendFormat": "{{service.instance.id}}", "refId": "A" } From cfb2d87cab808d4d49a56e5239802f6fb6d0e857 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Fri, 5 Jun 2026 18:59:57 +0100 Subject: [PATCH 04/10] fix(telemetry): correct close-time legend label key to resource.service.instance.id The Raw Proposals and Effective/Quantized panels rendered nameless series: their legendFormat used {{service.instance.id}}, but the TraceQL metrics query groups by resource.service.instance.id and Tempo returns that full key as the series label. The legend token did not match any label, so each series showed blank. Use the matching {{resource.service.instance.id}} token. Verified via the Grafana datasource proxy that all six close-time panels now return correctly-labelled series. 
Co-Authored-By: Claude Opus 4.8 --- docker/telemetry/grafana/dashboards/consensus-health.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/telemetry/grafana/dashboards/consensus-health.json b/docker/telemetry/grafana/dashboards/consensus-health.json index 04c4addce7..37ed4469e9 100644 --- a/docker/telemetry/grafana/dashboards/consensus-health.json +++ b/docker/telemetry/grafana/dashboards/consensus-health.json @@ -430,7 +430,7 @@ }, "queryType": "traceql", "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\"} | avg_over_time(span.close_time_self) by (resource.service.instance.id)", - "legendFormat": "{{service.instance.id}}", + "legendFormat": "{{resource.service.instance.id}}", "refId": "A" } ] @@ -475,7 +475,7 @@ }, "queryType": "traceql", "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\"} | avg_over_time(span.close_time) by (resource.service.instance.id)", - "legendFormat": "{{service.instance.id}}", + "legendFormat": "{{resource.service.instance.id}}", "refId": "A" } ] From 8dcd6f9b4a5ba4120c2635a23e2b70f38d2b8bb1 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Fri, 5 Jun 2026 19:19:55 +0100 Subject: [PATCH 05/10] fix(telemetry): raise TraceQL metrics max_duration to 168h TraceQL metrics queries default to a 3h max range (query_frontend.metrics.max_duration), so a dashboard set to a longer window failed with "range ... exceeds 3h0m0s". Add a query_frontend block raising it to 168h, matching the search max_duration, so the consensus close-time panels work at 6h/12h/24h ranges. 
Co-Authored-By: Claude Opus 4.8 --- docker/telemetry/tempo.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docker/telemetry/tempo.yaml b/docker/telemetry/tempo.yaml index b2997bda53..24c092ca95 100644 --- a/docker/telemetry/tempo.yaml +++ b/docker/telemetry/tempo.yaml @@ -17,6 +17,14 @@ stream_over_http_enabled: true server: http_listen_port: 3200 +# Raise the TraceQL metrics query range limit. The default +# query_frontend.metrics.max_duration is 3h, so a dashboard set to a longer +# window (e.g. 6h/12h) fails with "range exceeds 3h0m0s". 168h matches the +# search max_duration and gives dashboards generous headroom. +query_frontend: + metrics: + max_duration: 168h + distributor: receivers: otlp: From 5c275ac4762b1a2d5e529d80627747a68dcbc0a4 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Fri, 5 Jun 2026 19:21:59 +0100 Subject: [PATCH 06/10] fix(telemetry): set units, axis labels, and readable legends on close-time panels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Apply the dashboard guidelines to the five close-time panels: - Axis labels (Title Case) on every panel: "Close Time (Ripple Seconds)" for the value panels, "Count / Milliseconds" for vote bins/resolution, "Rounds in Window" for the count panels. - Human-readable legends with the dimension in brackets per the legend convention: "Raw Close Time [{{resource.service.instance.id}}]", "Effective Close Time [...]", "Resolution Direction [{{span.resolution_direction}}]", "{{span.close_time_vote_bins}} Vote Bins" — replacing the bare label tokens. - Unit "none" (plain number): the close-time values are Ripple-epoch seconds and TraceQL metrics cannot offset them to a wall-clock unit, and the others are counts/ms on a shared axis. Verified rendered values against raw spans: close times ~833,998,8xx, resolution 10000 ms, vote bins 1/2/3 — all correct. 
Co-Authored-By: Claude Opus 4.8 --- .../grafana/dashboards/consensus-health.json | 31 ++++++++++++------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/docker/telemetry/grafana/dashboards/consensus-health.json b/docker/telemetry/grafana/dashboards/consensus-health.json index 37ed4469e9..e9545e9d0b 100644 --- a/docker/telemetry/grafana/dashboards/consensus-health.json +++ b/docker/telemetry/grafana/dashboards/consensus-health.json @@ -406,7 +406,8 @@ "custom": { "drawStyle": "points", "pointSize": 6, - "showPoints": "always" + "showPoints": "always", + "axisLabel": "Close Time (Ripple Seconds)" } }, "overrides": [] @@ -430,7 +431,7 @@ }, "queryType": "traceql", "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\"} | avg_over_time(span.close_time_self) by (resource.service.instance.id)", - "legendFormat": "{{resource.service.instance.id}}", + "legendFormat": "Raw Close Time [{{resource.service.instance.id}}]", "refId": "A" } ] @@ -451,7 +452,8 @@ "custom": { "drawStyle": "points", "pointSize": 6, - "showPoints": "always" + "showPoints": "always", + "axisLabel": "Close Time (Ripple Seconds)" } }, "overrides": [] @@ -475,7 +477,7 @@ }, "queryType": "traceql", "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\"} | avg_over_time(span.close_time) by (resource.service.instance.id)", - "legendFormat": "{{resource.service.instance.id}}", + "legendFormat": "Effective Close Time [{{resource.service.instance.id}}]", "refId": "A" } ] @@ -496,8 +498,10 @@ "drawStyle": "line", "lineInterpolation": "stepAfter", "pointSize": 5, - "showPoints": "auto" - } + "showPoints": "auto", + "axisLabel": "Count / Milliseconds" + }, + "unit": "none" }, "overrides": [ { @@ -584,8 +588,10 @@ "drawStyle": "bars", "fillOpacity": 40, "pointSize": 5, - "showPoints": "auto" - } + "showPoints": "auto", + "axisLabel": "Rounds in Window" + }, + "unit": "none" }, "overrides": [] }, @@ -608,7 +614,7 @@ }, "queryType": 
"traceql", "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.resolution_direction=~\"$resolution_direction\"} | count_over_time() by (span.resolution_direction)", - "legendFormat": "{{span.resolution_direction}}", + "legendFormat": "Resolution Direction [{{span.resolution_direction}}]", "refId": "A" } ] @@ -625,9 +631,10 @@ }, "fieldConfig": { "defaults": { - "unit": "short", + "unit": "none", "custom": { - "fillOpacity": 60 + "fillOpacity": 60, + "axisLabel": "Rounds in Window" } }, "overrides": [] @@ -654,7 +661,7 @@ }, "queryType": "traceql", "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\"} | count_over_time() by (span.close_time_vote_bins)", - "legendFormat": "{{span.close_time_vote_bins}} bins", + "legendFormat": "{{span.close_time_vote_bins}} Vote Bins", "refId": "A" } ] From 4a8aa9e514c06fcfb02b869eacbfda972c6adf10 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Fri, 5 Jun 2026 19:28:45 +0100 Subject: [PATCH 07/10] docs(telemetry): reconcile 09-data-collection-reference span/attribute inventory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The §1 span and attribute inventory had regressed to an older 16-span snapshot that uses the pre-2026-05-13 dotted attribute keys, while phase-7's code emits ~36 spans with bare/underscore attribute keys. The §Data Flow Overview and §2 System Metrics sections (native OTLP transport — phase-7's migration) were already correct and are left unchanged. - §1.1: expand the span inventory to the full surface — add gRPC (grpc.), TxQ (txq.*), PathFind (pathfind.*), and the full consensus set (round/phase.open/ establish/update_positions/check/mode_change/proposal.receive/validation.receive). Fix the phantom rpc.request -> rpc.http_request, add rpc.ws_upgrade. 
No grpc.request, no pathfind.rank, no ledger.acquire (the latter is added in phase-9, not yet present here). - §1.2: convert every span-attribute key from dotted xrpl.. to the bare/underscore form. The sole span-attr dotted exception is xrpl.ledger.hash on peer.validation.receive (shared constant); consensus.validation.send uses bare ledger_hash. Resource attrs xrpl.network.id/type stay dotted. Fix tx_count/tx_failed placement (on tx.apply, not ledger.build). Add attribute tables for the new families. - §1.3: list the full set of spanmetrics dimension labels (bare keys, from the collector config) instead of the stale xrpl_rpc_command-style names. - §4/§5: convert Tempo TraceQL and PromQL examples to the bare attribute/label forms. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../09-data-collection-reference.md | 391 +++++++++++++----- 1 file changed, 291 insertions(+), 100 deletions(-) diff --git a/OpenTelemetryPlan/09-data-collection-reference.md b/OpenTelemetryPlan/09-data-collection-reference.md index 56817b215e..9e15e7b28c 100644 --- a/OpenTelemetryPlan/09-data-collection-reference.md +++ b/OpenTelemetryPlan/09-data-collection-reference.md @@ -76,22 +76,45 @@ There are two independent telemetry pipelines entering a single **OTel Collector ## 1. OpenTelemetry Spans -### 1.1 Complete Span Inventory (16 spans) +### 1.1 Complete Span Inventory (~36 spans) > **See also**: [02-design-decisions.md §2.3](./02-design-decisions.md#23-span-naming-conventions) for naming conventions and the full span catalog with rationale. [04-code-samples.md §4.6](./04-code-samples.md#46-span-flow-visualization) for span flow diagrams. +> **Span names vs. attribute keys**: span names use dotted `subsystem.operation` +> form (e.g. `rpc.http_request`). Span _attribute_ keys use the bare/underscore +> form from the 2026-05-13 naming redesign (e.g. `tx_hash`, not `xrpl.tx.hash`). +> The dotted `xrpl.*` form is reserved for OTel **resource** attributes set once +> at startup. 
See §1.2 for the full attribute inventory. + #### RPC Spans Controlled by `trace_rpc=1` in `[telemetry]` config. -| Span Name | Parent | Source File | Description | -| -------------------- | ------------- | ----------------- | ------------------------------------------------------------------------ | -| `rpc.request` | — | ServerHandler.cpp | Top-level HTTP RPC request entry point | -| `rpc.process` | `rpc.request` | ServerHandler.cpp | RPC processing pipeline | -| `rpc.ws_message` | — | ServerHandler.cpp | WebSocket message handling | -| `rpc.command.<name>` | `rpc.process` | RPCHandler.cpp | Per-command span (e.g., `rpc.command.server_info`, `rpc.command.ledger`) | +| Span Name | Parent | Source File | Description | +| -------------------- | ------------------ | ----------------- | ------------------------------------------------------------------------ | +| `rpc.http_request` | — | ServerHandler.cpp | Top-level HTTP JSON-RPC request entry point | +| `rpc.ws_message` | — | ServerHandler.cpp | WebSocket message handling (one per inbound frame) | +| `rpc.ws_upgrade` | — | ServerHandler.cpp | WebSocket upgrade handshake (records handshake failures) | +| `rpc.process` | `rpc.http_request` | ServerHandler.cpp | RPC processing pipeline (single or batch request) | +| `rpc.command.<name>` | `rpc.process` | RPCHandler.cpp | Per-command span (e.g., `rpc.command.server_info`, `rpc.command.ledger`) | -**Where to find**: Tempo → TraceQL: `{resource.service.name="xrpld" && name=~"rpc.request|rpc.command.*"}` +**Where to find**: Tempo → TraceQL: `{resource.service.name="xrpld" && name=~"rpc.http_request|rpc.command.*"}` + +**Grafana dashboard**: _RPC Performance_ (`xrpld-rpc-perf`) + +#### gRPC Spans + +Controlled by `trace_rpc=1` in `[telemetry]` config.
+ +| Span Name | Parent | Source File | Description | +| ------------------- | ------ | -------------- | ------------------------------------------------------------------------------------------------------------------------- | +| `grpc.<MethodName>` | — | GRPCServer.cpp | One flat span per gRPC method (e.g., `grpc.GetLedger`, `grpc.GetLedgerData`, `grpc.GetLedgerDiff`, `grpc.GetLedgerEntry`) | + +The method name is embedded in the span name (formed at the call site as +`grpc.<MethodName>`), so dashboards break out per-method latency and error +rates without TraceQL attribute filters. + +**Where to find**: Tempo → TraceQL: `{resource.service.name="xrpld" && name=~"grpc.*"}` **Grafana dashboard**: _RPC Performance_ (`xrpld-rpc-perf`) @@ -119,17 +142,46 @@ or, for the apply pipeline: `{resource.service.name="xrpld" && name=~"tx.preflig **Grafana dashboard**: _Transaction Overview_ (`xrpld-transactions`) +#### Transaction Queue (TxQ) Spans + +Controlled by `trace_transactions=1` in `[telemetry]` config. + +| Span Name | Parent | Source File | Description | +| ------------------ | ------------- | ----------- | --------------------------------------------------- | +| `txq.enqueue` | `tx.process` | TxQ.cpp | Enqueue decision when a tx is submitted | +| `txq.apply_direct` | `txq.enqueue` | TxQ.cpp | Direct apply attempt that bypasses the queue | +| `txq.batch_clear` | `txq.enqueue` | TxQ.cpp | Batch clear of an account's queued txs | +| `txq.accept` | — | TxQ.cpp | Ledger-close accept loop (drains the queue) | +| `txq.accept.tx` | `txq.accept` | TxQ.cpp | Per-queued-transaction apply inside the accept loop | +| `txq.cleanup` | — | TxQ.cpp | Post-close cleanup of expired queue entries | + +**Where to find**: Tempo → TraceQL: `{resource.service.name="xrpld" && name=~"txq.*"}` + +**Grafana dashboard**: _Transaction Overview_ (`xrpld-transactions`) + #### Consensus Spans Controlled by `trace_consensus=1` in `[telemetry]` config.
-| Span Name | Parent | Source File | Description | -| --------------------------- | ------ | ---------------- | --------------------------------------------- | -| `consensus.proposal.send` | — | RCLConsensus.cpp | Node broadcasts its transaction set proposal | -| `consensus.ledger_close` | — | RCLConsensus.cpp | Ledger close event triggered by consensus | -| `consensus.accept` | — | RCLConsensus.cpp | Consensus accepts a ledger (round complete) | -| `consensus.validation.send` | — | RCLConsensus.cpp | Validation message sent after ledger accepted | -| `consensus.accept.apply` | — | RCLConsensus.cpp | Ledger application with close time details | +| Span Name | Parent | Source File | Description | +| ------------------------------ | ------------------ | ---------------- | ------------------------------------------------------------------- | +| `consensus.round` | — (root) | RCLConsensus.cpp | Root span for one consensus round (deterministic trace per round) | +| `consensus.phase.open` | `consensus.round` | Consensus.h | Open phase — collecting transactions before close | +| `consensus.proposal.send` | `consensus.round` | RCLConsensus.cpp | Node broadcasts its transaction set proposal | +| `consensus.ledger_close` | `consensus.round` | RCLConsensus.cpp | Ledger close event triggered by consensus | +| `consensus.establish` | `consensus.round` | Consensus.h | Establish phase — converging on the transaction set | +| `consensus.update_positions` | `consensus.round` | Consensus.h | Position update with per-dispute vote details | +| `consensus.check` | `consensus.round` | Consensus.h | Consensus threshold check (agree/disagree tally) | +| `consensus.accept` | `consensus.round` | RCLConsensus.cpp | Consensus accepts a ledger (round complete) | +| `consensus.accept.apply` | `consensus.accept` | RCLConsensus.cpp | Ledger application with close-time details (jtACCEPT thread) | +| `consensus.validation.send` | `consensus.round` | RCLConsensus.cpp | Validation message sent after 
ledger accepted (follows-from link) | +| `consensus.mode_change` | `consensus.round` | RCLConsensus.cpp | Operating-mode transition during the round | +| `consensus.proposal.receive` | (context) | PeerImp.cpp | Proposal received from a peer (context-propagated into the round) | +| `consensus.validation.receive` | (context) | PeerImp.cpp | Validation received from a peer (context-propagated into the round) | + +The `.receive` spans are created per-message in the overlay and joined to the +round trace via context propagation rather than direct parenting. The +`consensus.validation.send` span uses a follows-from link off the round. **Where to find**: Tempo → TraceQL: `{resource.service.name="xrpld" && name=~"consensus.*"}` @@ -162,88 +214,201 @@ Controlled by `trace_peer=1` in `[telemetry]` config. **Disabled by default** (h **Grafana dashboard**: _Peer Network_ (`xrpld-peer-net`) +#### PathFind Spans + +Controlled by `trace_rpc=1` in `[telemetry]` config. + +| Span Name | Parent | Source File | Description | +| --------------------- | ------------------ | --------------- | ---------------------------------------------------------- | +| `pathfind.request` | `rpc.command.*` | PathRequest.cpp | `path_find` / `ripple_path_find` RPC entry | +| `pathfind.compute` | `pathfind.request` | PathRequest.cpp | Path computation for one request (`PathRequest::doUpdate`) | +| `pathfind.discover` | `pathfind.compute` | Pathfinder.cpp | Graph exploration (one per RPC call) | +| `pathfind.update_all` | — | PathRequest.cpp | Async recomputation of all active requests at ledger close | + +**Where to find**: Tempo → TraceQL: `{resource.service.name="xrpld" && name=~"pathfind.*"}` + --- -### 1.2 Complete Attribute Inventory (22 attributes) +### 1.2 Complete Attribute Inventory (bare/underscore keys) > **See also**: [02-design-decisions.md §2.4.2](./02-design-decisions.md#242-span-attributes-by-category) for attribute design rationale and privacy considerations. 
+Every span can carry key-value attributes that provide context for filtering and +aggregation. Per the 2026-05-13 naming redesign, span-attribute keys use the +**bare** field name (the span name already carries the domain), or the +`<domain>_<field>` underscore form where a bare name would collide (e.g. +`rpc_status`, `grpc_status`, `tx_status`, `txq_status`). + +> **Dotted exceptions** (do not confuse with span attributes): +> +> - `xrpl.ledger.hash` is the **only** dotted span attribute. It is a shared +> constant set on `peer.validation.receive`. Note that `consensus.validation.send` +> uses the **bare** `ledger_hash` instead. +> - `xrpl.network.id` and `xrpl.network.type` are **resource** attributes set +> once at startup on the OTel resource — not span attributes. They appear on +> every span's resource scope, queried as `{resource.xrpl.network.id=...}`. #### RPC Attributes -| Attribute | Type | Set On | Description | -| --------------- | ------ | --------------- | ------------------------------------------------ | -| `command` | string | `rpc.command.*` | RPC command name (e.g., `server_info`, `ledger`) | -| `version` | int64 | `rpc.command.*` | API version number | -| `rpc_role` | string | `rpc.command.*` | Caller role: `"admin"` or `"user"` | -| `rpc_status` | string | `rpc.command.*` | Result: `"success"` or `"error"` | -| `duration_ms` | int64 | `rpc.command.*` | Command execution time in milliseconds | -| `error_message` | string | `rpc.command.*` | Error details (only set on failure) | +| Attribute | Type | Set On | Description | +| ---------------------- | ------- | --------------------------------- | ------------------------------------------------ | +| `command` | string | `rpc.command.*`, `rpc.ws_message` | RPC command name (e.g., `server_info`, `ledger`) | +| `version` | int64 | `rpc.command.*` | API version number | +| `rpc_role` | string | `rpc.command.*` | Caller
role: `"admin"` or `"user"` | +| `rpc_status` | string | `rpc.command.*` | Result: `"success"` or `"error"` | +| `request_payload_size` | int64 | `rpc.http_request` | Bytes of inbound request payload | +| `is_batch` | boolean | `rpc.process` | `true` if the request is a JSON-RPC batch | +| `batch_size` | int64 | `rpc.process` | Number of sub-requests in a batch | +| `load_type` | string | `rpc.command.*` | Resource cost category after execution | **Tempo query**: `{span.command="server_info"}` to find all `server_info` calls. -**Prometheus label**: `xrpl_rpc_command` (dots converted to underscores by SpanMetrics). +**Prometheus label**: `command` (used as a SpanMetrics dimension). + +#### gRPC Attributes + +| Attribute | Type | Set On | Description | +| ------------- | ------ | ------------------- | ------------------------------------ | +| `method` | string | `grpc.<MethodName>` | gRPC method name (e.g., `GetLedger`) | +| `grpc_role` | string | `grpc.<MethodName>` | Caller role: `"admin"` or `"user"` | +| `grpc_status` | string | `grpc.<MethodName>` | Result: `"success"` or `"error"` | + +**Tempo query**: `{span.method="GetLedger"}` or `{name="grpc.GetLedger"}`. + +**Prometheus labels**: `method`, `grpc_role`, `grpc_status` (SpanMetrics dimensions).
#### Transaction Attributes -| Attribute | Type | Set On | Description | -| ------------------- | ------- | ---------------------------------------------- | --------------------------------------------------------------------- | -| `xrpl.tx.hash` | string | `tx.process`, `tx.receive` | Transaction hash (hex-encoded) | -| `local` | boolean | `tx.process` | `true` if locally submitted, `false` if peer-relayed | -| `path` | string | `tx.process` | Submission path: `"sync"` or `"async"` | -| `suppressed` | boolean | `tx.receive` | `true` if transaction was suppressed (duplicate) | -| `tx_status` | string | `tx.receive` | Transaction status (e.g., `"known_bad"`) | -| `xrpl.peer.id` | int64 | `tx.receive` | Peer identifier (also set on peer spans) | -| `xrpl.peer.version` | string | `tx.receive` | Peer protocol version string | -| `stage` | string | `tx.preflight`, `tx.preclaim`, `tx.transactor` | Apply-pipeline stage: `preflight`, `preclaim`, or `apply` | -| `tx_type` | string | `tx.preflight`, `tx.preclaim`, `tx.transactor` | Transaction type name (e.g., `Payment`) | -| `ter_result` | string | `tx.preflight`, `tx.preclaim`, `tx.transactor` | Engine result token for that stage (e.g., `tesSUCCESS`, `terPRE_SEQ`) | -| `applied` | boolean | `tx.transactor` | `true` if the transaction was applied to the ledger | +| Attribute | Type | Set On | Description | +| -------------- | ------- | ------------------------------------------------------------ | --------------------------------------------------------------------- | +| `tx_hash` | string | `tx.process`, `tx.receive` | Transaction hash (hex-encoded) | +| `local` | boolean | `tx.process` | `true` if locally submitted, `false` if peer-relayed | +| `path` | string | `tx.process` | Submission path: `"sync"` or `"async"` | +| `tx_type` | string | `tx.process`, `tx.preflight`, `tx.preclaim`, `tx.transactor` | Transaction type name (e.g., `Payment`) | +| `fee` | int64 | `tx.process` | Transaction fee in drops | +| `sequence` | 
int64 | `tx.process` | Transaction sequence number | +| `suppressed` | boolean | `tx.receive` | `true` if transaction was suppressed (duplicate) | +| `tx_status` | string | `tx.receive` | Transaction status (e.g., `"known_bad"`) | +| `peer_id` | int64 | `tx.receive` | Peer identifier (also set on peer spans) | +| `peer_version` | string | `tx.receive` | Peer protocol version string | +| `stage` | string | `tx.preflight`, `tx.preclaim`, `tx.transactor` | Apply-pipeline stage: `preflight`, `preclaim`, or `apply` | +| `ter_result` | string | `tx.preflight`, `tx.preclaim`, `tx.transactor` | Engine result token for that stage (e.g., `tesSUCCESS`, `terPRE_SEQ`) | +| `applied` | boolean | `tx.transactor` | `true` if the transaction was applied to the ledger | -**Tempo query**: `{span.xrpl.tx.hash=""}` to trace a specific transaction across nodes. +**Tempo query**: `{span.tx_hash=""}` to trace a specific transaction across nodes. -**Prometheus label**: `xrpl_tx_local` (used as SpanMetrics dimension). +**Prometheus labels**: `local`, `suppressed`, `tx_type`, `ter_result`, `stage` (SpanMetrics dimensions). + +#### Transaction Queue (TxQ) Attributes + +| Attribute | Type | Set On | Description | +| -------------------- | ------- | ------------------------------ | ----------------------------------------------------------- | +| `tx_hash` | string | `txq.enqueue`, `txq.accept.tx` | Transaction hash | +| `tx_type` | string | `txq.enqueue` | Transaction type name | +| `txq_status` | string | `txq.enqueue`, `txq.accept.tx` | Queue outcome (e.g. 
`queued`, `applied_direct`, `rejected`) | +| `fee_level_paid` | int64 | `txq.enqueue` | Fee level paid by the queued tx | +| `required_fee_level` | int64 | `txq.enqueue` | Minimum fee level for inclusion | +| `num_cleared` | int64 | `txq.batch_clear` | Entries cleared in a batch | +| `queue_size` | int64 | `txq.accept` | Current TxQ depth | +| `ledger_changed` | boolean | `txq.accept` | Whether the ledger changed since last attempt | +| `ter_code` | int64 | `txq.accept.tx` | Transaction engine result code | +| `retries_remaining` | int64 | `txq.accept.tx` | Retries left before discard | +| `ledger_seq` | int64 | `txq.cleanup` | Ledger sequence number | +| `expired_count` | int64 | `txq.cleanup` | Number of expired entries cleared | + +**Prometheus label**: `txq_status` (SpanMetrics dimension). #### Consensus Attributes -| Attribute | Type | Set On | Description | -| ------------------------------------ | ------- | --------------------------------------------------------------------------------------------------- | ------------------------------------------------------------- | -| `xrpl.consensus.round` | int64 | `consensus.proposal.send` | Consensus round number | -| `xrpl.consensus.mode` | string | `consensus.proposal.send`, `consensus.ledger_close` | Node mode: `"syncing"`, `"tracking"`, `"full"`, `"proposing"` | -| `xrpl.consensus.proposers` | int64 | `consensus.proposal.send`, `consensus.accept` | Number of proposers in the round | -| `xrpl.consensus.proposing` | boolean | `consensus.validation.send` | Whether this node was a proposer | -| `xrpl.consensus.ledger.seq` | int64 | `consensus.ledger_close`, `consensus.accept`, `consensus.validation.send`, `consensus.accept.apply` | Ledger sequence number | -| `xrpl.consensus.close_time` | int64 | `consensus.accept.apply` | Agreed-upon ledger close time (epoch seconds) | -| `xrpl.consensus.close_time_correct` | boolean | `consensus.accept.apply` | Whether validators reached agreement on close time | -| 
`xrpl.consensus.close_resolution_ms` | int64 | `consensus.accept.apply` | Close time rounding granularity in milliseconds | -| `xrpl.consensus.state` | string | `consensus.accept.apply` | Consensus outcome: `"finished"` or `"moved_on"` | -| `xrpl.consensus.round_time_ms` | int64 | `consensus.accept.apply` | Total consensus round duration in milliseconds | +| Attribute | Type | Set On | Description | +| -------------------------- | ------- | -------------------------------------------------------------------------------------------------- | -------------------------------------------------------- | +| `consensus_ledger_id` | string | `consensus.round` | Previous-ledger id anchoring the round | +| `ledger_seq` | int64 | `consensus.round`, `consensus.ledger_close`, `consensus.accept.apply`, `consensus.validation.send` | Ledger sequence number | +| `consensus_mode` | string | `consensus.round`, `consensus.ledger_close` | Node mode: `"Proposing"`, `"Observing"`, `"Wrong"`, etc. | +| `consensus_round_id` | int64 | `consensus.round` | Round identifier | +| `consensus_phase` | string | `consensus.round` | Current phase name (updated on each transition) | +| `trace_strategy` | string | `consensus.round` | Trace-id strategy (`deterministic` / `random`) | +| `previous_ledger_seq` | int64 | `consensus.round` | Sequence of the previous ledger | +| `previous_proposers` | int64 | `consensus.round` | Proposer count in the previous round | +| `previous_round_time_ms` | int64 | `consensus.round` | Duration of the previous round | +| `consensus_round` | int64 | `consensus.proposal.send` | Proposal sequence number for the broadcast proposal | +| `is_bow_out` | boolean | `consensus.proposal.send` | Whether the proposal is a bow-out (resigning the round) | +| `tx_count_open` | int64 | `consensus.ledger_close` | Transactions in the open ledger at close | +| `close_time_resolution_ms` | int64 | `consensus.ledger_close` | Close-time rounding granularity | +| `converge_percent` | int64 | 
`consensus.establish`, `consensus.update_positions` | Convergence percentage | +| `establish_count` | int64 | `consensus.establish` | Establish-phase iteration count | +| `proposers` | int64 | `consensus.establish`, `consensus.update_positions`, `consensus.accept` | Number of proposers | +| `disputes_count` | int64 | `consensus.establish`, `consensus.update_positions` | Number of disputed transactions | +| `tx_id` | string | `consensus.update_positions` | Disputed transaction id (per-dispute event) | +| `dispute_our_vote` | boolean | `consensus.update_positions` | Our vote on the disputed tx | +| `dispute_yays` | int64 | `consensus.update_positions` | Yes votes on the disputed tx | +| `dispute_nays` | int64 | `consensus.update_positions` | No votes on the disputed tx | +| `agree_count` | int64 | `consensus.check` | Agreeing proposer count | +| `disagree_count` | int64 | `consensus.check` | Disagreeing proposer count | +| `threshold_percent` | int64 | `consensus.check` | Agreement threshold percentage | +| `consensus_result` | string | `consensus.check` | Check outcome | +| `quorum` | int64 | `consensus.check`, `consensus.accept` | Quorum required | +| `round_time_ms` | int64 | `consensus.accept`, `consensus.accept.apply` | Total consensus round duration in milliseconds | +| `consensus_state` | string | `consensus.accept.apply` | Consensus outcome: `"finished"` or `"moved_on"` | +| `close_time` | int64 | `consensus.accept.apply` | Agreed-upon ledger close time (epoch seconds) | +| `close_time_correct` | boolean | `consensus.accept.apply` | Whether validators agreed on close time | +| `close_resolution_ms` | int64 | `consensus.accept.apply` | Close-time rounding granularity in milliseconds | +| `proposing` | boolean | `consensus.accept.apply`, `consensus.validation.send` | Whether this node was a proposer | +| `parent_close_time` | int64 | `consensus.accept.apply` | Parent ledger close time | +| `close_time_self` | int64 | `consensus.accept.apply` | This node's 
close-time vote | +| `close_time_vote_bins` | string | `consensus.accept.apply` | Distribution of close-time votes | +| `resolution_direction` | string | `consensus.accept.apply` | Whether close resolution increased/decreased/unchanged | +| `tx_count` | int64 | `consensus.accept.apply` | Transactions in the accepted set | +| `ledger_hash` | string | `consensus.validation.send` | Full hash of the validated ledger (**bare**, not dotted) | +| `full_validation` | boolean | `consensus.validation.send` | Whether this is a full validation | +| `validation_sign_time` | int64 | `consensus.validation.send` | Validation signing time | +| `mode_old` | string | `consensus.mode_change` | Operating mode before the transition | +| `mode_new` | string | `consensus.mode_change` | Operating mode after the transition | -**Tempo query**: `{span.xrpl.consensus.mode="proposing"}` to find rounds where node was proposing. +**Tempo query**: `{span.consensus_mode="Proposing"}` to find rounds where the node was proposing. -**Prometheus label**: `xrpl_consensus_mode` (used as SpanMetrics dimension). +**Prometheus labels**: `consensus_mode`, `consensus_state`, `consensus_phase`, `consensus_result`, `consensus_stalled`, `mode_new`, `close_time_correct` (SpanMetrics dimensions). 
#### Ledger Attributes -| Attribute | Type | Set On | Description | -| ------------------------- | ----- | ------------------------------------------------------------- | ---------------------------------------------- | -| `xrpl.ledger.seq` | int64 | `ledger.build`, `ledger.validate`, `ledger.store`, `tx.apply` | Ledger sequence number | -| `xrpl.ledger.validations` | int64 | `ledger.validate` | Number of validations received for this ledger | -| `xrpl.ledger.tx_count` | int64 | `ledger.build`, `tx.apply` | Transactions in the ledger | -| `xrpl.ledger.tx_failed` | int64 | `ledger.build`, `tx.apply` | Failed transactions in the ledger | +| Attribute | Type | Set On | Description | +| --------------------- | ------- | ------------------------------------------------- | ------------------------------------------------ | +| `ledger_seq` | int64 | `ledger.build`, `ledger.validate`, `ledger.store` | Ledger sequence number | +| `close_time` | int64 | `ledger.build` | Ledger close time (epoch seconds) | +| `close_time_correct` | boolean | `ledger.build` | Whether close time was agreed upon by validators | +| `close_resolution_ms` | int64 | `ledger.build` | Close time rounding granularity in milliseconds | +| `tx_count` | int64 | `tx.apply` | Transactions applied to the ledger | +| `tx_failed` | int64 | `tx.apply` | Failed transactions in the apply set | +| `validations` | int64 | `ledger.validate` | Number of validations received for this ledger | -**Tempo query**: `{span.xrpl.ledger.seq=12345}` to find all spans for a specific ledger. +The apply-step span `tx.apply` (child of `ledger.build`) carries `tx_count`/`tx_failed`; +the parent `ledger.build` carries `ledger_seq` and the close-time attributes. + +**Tempo query**: `{span.ledger_seq=12345}` to find all spans for a specific ledger. 
#### Peer Attributes -| Attribute | Type | Set On | Description | -| ------------------------------ | ------- | ---------------------------------------------------------------- | ---------------------------------------------------- | -| `xrpl.peer.id` | int64 | `tx.receive`, `peer.proposal.receive`, `peer.validation.receive` | Peer identifier | -| `xrpl.peer.proposal.trusted` | boolean | `peer.proposal.receive` | Whether the proposal came from a trusted validator | -| `xrpl.peer.validation.trusted` | boolean | `peer.validation.receive` | Whether the validation came from a trusted validator | +| Attribute | Type | Set On | Description | +| -------------------- | ------- | ---------------------------------------------------------------- | ---------------------------------------------------- | +| `peer_id` | int64 | `tx.receive`, `peer.proposal.receive`, `peer.validation.receive` | Peer identifier | +| `proposal_trusted` | boolean | `peer.proposal.receive` | Whether the proposal came from a trusted validator | +| `validation_trusted` | boolean | `peer.validation.receive` | Whether the validation came from a trusted validator | +| `validation_full` | boolean | `peer.validation.receive` | Whether the validation is a full validation | +| `xrpl.ledger.hash` | string | `peer.validation.receive` | Validated ledger hash (**dotted** — shared constant) | -**Prometheus labels**: `xrpl_peer_proposal_trusted`, `xrpl_peer_validation_trusted` (SpanMetrics dimensions). +**Prometheus labels**: `proposal_trusted`, `validation_trusted` (SpanMetrics dimensions). 
+ +#### PathFind Attributes + +| Attribute | Type | Set On | Description | +| ------------------------- | ------- | --------------------- | ---------------------------------------- | +| `pathfind_source_account` | string | `pathfind.request` | Originating account for the path search | +| `pathfind_dest_account` | string | `pathfind.request` | Destination account | +| `pathfind_fast` | boolean | `pathfind.compute` | Whether fast pathfinding mode is enabled | +| `pathfind_search_level` | int64 | `pathfind.discover` | Depth of graph exploration | +| `pathfind_num_paths` | int64 | `pathfind.discover` | Total paths produced | +| `pathfind_ledger_index` | int64 | `pathfind.update_all` | Target ledger index | +| `pathfind_num_requests` | int64 | `pathfind.update_all` | Active requests recomputed | --- @@ -262,17 +427,34 @@ The OTel Collector's SpanMetrics connector automatically generates RED (Rate, Er **Standard labels on every metric**: `span_name`, `status_code`, `service_name`, `span_kind` -**Additional dimension labels** (configured in `otel-collector-config.yaml`): +**Additional dimension labels** (configured in `otel-collector-config.yaml`). 
+The Prometheus label is the **bare span-attribute key verbatim** — the +SpanMetrics connector does not rewrite or prefix it: -| Span Attribute | Prometheus Label | Applies To | -| --------------------- | ------------------------------ | ---------------------------------------------- | -| `command` | `xrpl_rpc_command` | `rpc.command.*` | -| `rpc_status` | `xrpl_rpc_status` | `rpc.command.*` | -| `xrpl.consensus.mode` | `xrpl_consensus_mode` | `consensus.ledger_close` | -| `local` | `xrpl_tx_local` | `tx.process` | -| `proposal_trusted` | `xrpl_peer_proposal_trusted` | `peer.proposal.receive` | -| `validation_trusted` | `xrpl_peer_validation_trusted` | `peer.validation.receive` | -| `stage` | `stage` | `tx.preflight`, `tx.preclaim`, `tx.transactor` | +| Prometheus Label / Span Attribute | Type | Applies To | +| --------------------------------- | ------- | ---------------------------------------------- | +| `command` | string | `rpc.command.*` | +| `rpc_status` | string | `rpc.command.*` | +| `consensus_mode` | string | `consensus.round`, `consensus.ledger_close` | +| `close_time_correct` | boolean | `consensus.accept.apply` | +| `local` | boolean | `tx.process` | +| `suppressed` | boolean | `tx.receive` | +| `proposal_trusted` | boolean | `peer.proposal.receive` | +| `validation_trusted` | boolean | `peer.validation.receive` | +| `tx_type` | string | `tx.*`, `txq.enqueue` | +| `ter_result` | string | `tx.preflight`, `tx.preclaim`, `tx.transactor` | +| `stage` | string | `tx.preflight`, `tx.preclaim`, `tx.transactor` | +| `txq_status` | string | `txq.enqueue`, `txq.accept.tx` | +| `consensus_state` | string | `consensus.accept.apply` | +| `load_type` | string | `rpc.command.*` | +| `is_batch` | boolean | `rpc.process` | +| `mode_new` | string | `consensus.mode_change` | +| `consensus_stalled` | boolean | `consensus.check` | +| `consensus_phase` | string | `consensus.round` | +| `consensus_result` | string | `consensus.check` | +| `method` | string | `grpc.` | +| 
`grpc_role` | string | `grpc.` | +| `grpc_status` | string | `grpc.` | The `stage` dimension (3 values: `preflight`, `preclaim`, `apply`) turns the apply-pipeline spans into per-stage RED metrics with no native instruments — the @@ -437,38 +619,47 @@ For each of the 45+ overlay traffic categories (defined in `TrafficCount.h`), fo | What to Find | Tempo TraceQL Query | | ------------------------ | ------------------------------------------------------------------------------ | -| All RPC calls | `{resource.service.name="xrpld" && name="rpc.request"}` | +| All RPC calls | `{resource.service.name="xrpld" && name="rpc.http_request"}` | | Specific RPC command | `{resource.service.name="xrpld" && name="rpc.command.server_info"}` | | Slow RPC calls | `{resource.service.name="xrpld" && name=~"rpc.command.*"} \| duration > 100ms` | | Failed RPC calls | `{span.rpc_status="error"}` | -| Specific transaction | `{span.xrpl.tx.hash=""}` | -| Local transactions only | `{span.xrpl.tx.local=true}` | -| Consensus rounds | `{resource.service.name="xrpld" && name="consensus.accept"}` | -| Rounds by mode | `{span.xrpl.consensus.mode="proposing"}` | -| Specific ledger | `{span.xrpl.ledger.seq=12345}` | -| Peer proposals (trusted) | `{span.xrpl.peer.proposal.trusted=true}` | +| gRPC method calls | `{resource.service.name="xrpld" && name="grpc.GetLedger"}` | +| Specific transaction | `{span.tx_hash=""}` | +| Local transactions only | `{span.local=true}` | +| Consensus rounds | `{resource.service.name="xrpld" && name="consensus.round"}` | +| Rounds by mode | `{span.consensus_mode="Proposing"}` | +| Specific ledger | `{span.ledger_seq=12345}` | +| Peer proposals (trusted) | `{span.proposal_trusted=true}` | ### Trace Structure A typical RPC trace shows the span hierarchy: ``` -rpc.request (ServerHandler) +rpc.http_request (ServerHandler) └── rpc.process (ServerHandler) └── rpc.command.server_info (RPCHandler) ``` -A consensus round produces independent spans (not parent-child): +A consensus 
round groups its lifecycle spans under a single root +(`consensus.round`); the build/ledger spans run as their own trees: ``` -consensus.ledger_close (close event) -consensus.proposal.send (broadcast proposal) -ledger.build (build new ledger) - └── tx.apply (apply transaction set) -consensus.accept (accept result) -consensus.validation.send (send validation) -ledger.validate (promote to validated) -ledger.store (persist to DB) +consensus.round (root — one per round) + ├── consensus.phase.open (open phase) + ├── consensus.proposal.send (broadcast proposal) + ├── consensus.ledger_close (close event) + ├── consensus.establish (establish phase) + ├── consensus.update_positions (position updates) + ├── consensus.check (threshold check) + ├── consensus.accept (accept result) + │ └── consensus.accept.apply (apply, jtACCEPT thread) + └── consensus.validation.send (send validation, follows-from link) + +ledger.build (build new ledger) + └── tx.apply (apply transaction set) +ledger.validate (promote to validated) +ledger.store (persist to DB) ``` --- @@ -481,19 +672,19 @@ ledger.store (persist to DB) ```promql # RPC request rate by command (last 5 minutes) -sum by (xrpl_rpc_command) (rate(traces_span_metrics_calls_total{span_name=~"rpc.command.*"}[5m])) +sum by (command) (rate(traces_span_metrics_calls_total{span_name=~"rpc.command.*"}[5m])) # RPC p95 latency by command -histogram_quantile(0.95, sum by (le, xrpl_rpc_command) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=~"rpc.command.*"}[5m]))) +histogram_quantile(0.95, sum by (le, command) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=~"rpc.command.*"}[5m]))) # Consensus round duration p95 -histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name="consensus.accept"}[5m]))) +histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name="consensus.round"}[5m]))) # Transaction processing rate (local vs 
relay) -sum by (xrpl_tx_local) (rate(traces_span_metrics_calls_total{span_name="tx.process"}[5m])) +sum by (local) (rate(traces_span_metrics_calls_total{span_name="tx.process"}[5m])) # Trusted vs untrusted proposal rate -sum by (xrpl_peer_proposal_trusted) (rate(traces_span_metrics_calls_total{span_name="peer.proposal.receive"}[5m])) +sum by (proposal_trusted) (rate(traces_span_metrics_calls_total{span_name="peer.proposal.receive"}[5m])) ``` ### StatsD Metrics From 24f60ab1d4c5b4faafa09d771d7831ad9697850c Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Fri, 5 Jun 2026 19:35:34 +0100 Subject: [PATCH 08/10] feat(telemetry): make consensus panels show real consensus timing and rounds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The consensus duration panels plotted span wall-clock (traces_span_metrics_duration_milliseconds), which is ~3-8 ms of instrumentation overhead, not the real consensus time (~3000 ms). And the close-time value panels plotted an ever-rising absolute epoch line. Rework them to answer the actual operational questions, all from attributes that already exist on the consensus spans: - Time to Reach Consensus (p50/p95) and Average Time to Reach Consensus: round_time_ms on consensus.accept — the wall-clock to agree a ledger. - Consensus Rounds per Ledger (Establish Count): avg and max of establish_count on consensus.establish — how many proposal rounds it took to converge (1 = first proposal). - Previous Round Time per Ledger: previous_round_time_ms on consensus.round. Reorder the dashboard into an investigation flow: health/throughput -> time-to-consensus and rounds -> ledger close/apply timing -> close-time detail -> failures/mode/mismatch. Assign stable sequential panel ids. Verified each query returns data via the Grafana datasource proxy (p95 ~4096 ms, avg ~2825 ms, rounds ~2). 
Co-Authored-By: Claude Opus 4.8 --- .../grafana/dashboards/consensus-health.json | 782 ++++++++++-------- 1 file changed, 438 insertions(+), 344 deletions(-) diff --git a/docker/telemetry/grafana/dashboards/consensus-health.json b/docker/telemetry/grafana/dashboards/consensus-health.json index e9545e9d0b..a13b4c3e63 100644 --- a/docker/telemetry/grafana/dashboards/consensus-health.json +++ b/docker/telemetry/grafana/dashboards/consensus-health.json @@ -8,123 +8,6 @@ "id": null, "links": [], "panels": [ - { - "title": "Consensus Round Duration", - "description": "p95 duration of consensus accept rounds. The consensus.accept span (RCLConsensus.cpp) measures the time to process an accepted ledger including transaction application and state finalization. The span carries proposers and round_time_ms attributes. Normal range is 3-6 seconds on mainnet.", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 0 - }, - "options": { - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus" - }, - "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.accept\"}[5m])))", - "legendFormat": "P95 Round Duration [{{exported_instance}}]" - } - ], - "fieldConfig": { - "defaults": { - "unit": "ms", - "custom": { - "axisLabel": "Duration (ms)", - "spanNulls": true, - "insertNulls": false, - "showPoints": "auto", - "pointSize": 3 - } - }, - "overrides": [] - } - }, - { - "title": "Consensus Proposals Sent Rate", - "description": "Rate at which this node sends consensus proposals to the network. Sourced from the consensus.proposal.send span (RCLConsensus.cpp) which fires each time the node proposes a transaction set. The span carries xrpl.consensus.round identifying the consensus round number. 
A healthy proposing node should show steady proposal output.", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 0 - }, - "options": { - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus" - }, - "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.proposal.send\"}[5m]))", - "legendFormat": "Proposals / Sec [{{exported_instance}}]" - } - ], - "fieldConfig": { - "defaults": { - "unit": "ops", - "custom": { - "axisLabel": "Proposals / Sec", - "spanNulls": true, - "insertNulls": false, - "showPoints": "auto", - "pointSize": 3 - } - }, - "overrides": [] - } - }, - { - "title": "Ledger Close Duration", - "description": "p95 duration of the ledger close event. The consensus.ledger_close span (RCLConsensus.cpp) measures the time from when consensus triggers a ledger close to completion. Carries ledger_seq and consensus_mode attributes. 
Compare with Consensus Round Duration to understand how close timing relates to overall round time.", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 8 - }, - "options": { - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus" - }, - "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.ledger_close\"}[5m])))", - "legendFormat": "P95 Close Duration [{{exported_instance}}]" - } - ], - "fieldConfig": { - "defaults": { - "unit": "ms", - "custom": { - "axisLabel": "Duration (ms)", - "spanNulls": true, - "insertNulls": false, - "showPoints": "auto", - "pointSize": 3 - } - }, - "overrides": [] - } - }, { "title": "Validation Send Rate", "description": "Rate at which this node sends ledger validations to the network. Sourced from the consensus.validation.send span (RCLConsensus.cpp). Each validation confirms the node has fully validated a ledger. The span carries ledger_seq and proposing. Should closely track the ledger close rate when the node is healthy.", @@ -132,8 +15,8 @@ "gridPos": { "h": 8, "w": 12, - "x": 12, - "y": 8 + "x": 0, + "y": 0 }, "options": { "tooltip": { @@ -155,73 +38,8 @@ "unit": "ops" }, "overrides": [] - } - }, - { - "title": "Ledger Apply Duration (doAccept)", - "description": "Time spent applying the consensus result to build a new ledger. 
Measured by the consensus.accept.apply span in doAccept().", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 16 }, - "targets": [ - { - "datasource": { - "type": "prometheus" - }, - "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.accept.apply\"}[5m])))", - "legendFormat": "P95 Apply Duration [{{exported_instance}}]" - } - ], - "fieldConfig": { - "defaults": { - "unit": "ms", - "custom": { - "axisLabel": "Duration (ms)", - "spanNulls": true, - "insertNulls": false, - "showPoints": "auto", - "pointSize": 3 - } - }, - "overrides": [] - } - }, - { - "title": "Close Time Agreement", - "description": "Rate of close time agreement vs disagreement across consensus rounds. Based on close_time_correct attribute (true = validators agreed, false = agreed to disagree per avCT_CONSENSUS_PCT).", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 16 - }, - "targets": [ - { - "datasource": { - "type": "prometheus" - }, - "expr": "sum by (close_time_correct, exported_instance) (rate(traces_span_metrics_calls_total{span_name=\"consensus.accept.apply\", consensus_mode=~\"$consensus_mode\", exported_instance=~\"$node\"}[$__rate_interval]))", - "legendFormat": "Close Time Correct={{close_time_correct}} [{{exported_instance}}]" - } - ], - "fieldConfig": { - "defaults": { - "unit": "ops", - "custom": { - "axisLabel": "Rounds / Sec", - "spanNulls": true, - "insertNulls": false, - "showPoints": "auto", - "pointSize": 3 - } - }, - "overrides": [] - } + "id": 1 }, { "title": "Consensus Mode Over Time", @@ -230,8 +48,8 @@ "gridPos": { "h": 8, "w": 12, - "x": 0, - "y": 24 + "x": 12, + "y": 0 }, "options": { "tooltip": { @@ -260,7 +78,48 @@ } }, "overrides": [] - } + }, + "id": 2 + }, + { + "title": "Consensus Proposals Sent Rate", + "description": "Rate at which this node sends 
consensus proposals to the network. Sourced from the consensus.proposal.send span (RCLConsensus.cpp) which fires each time the node proposes a transaction set. The span carries consensus_round, the proposal sequence number of the broadcast proposal. A healthy proposing node should show steady proposal output.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.proposal.send\"}[5m]))", + "legendFormat": "Proposals / Sec [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Proposals / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + }, + "id": 3 }, { "title": "Accept vs Close Rate", @@ -270,7 +129,7 @@ "h": 8, "w": 12, "x": 12, - "y": 24 + "y": 8 }, "options": { "tooltip": { @@ -306,7 +165,8 @@ } }, "overrides": [] - } + }, + "id": 4 }, { "title": "Validation vs Close Rate", @@ -316,7 +176,7 @@ "h": 8, "w": 12, "x": 0, - "y": 32 + "y": 16 }, "options": { "tooltip": { @@ -352,7 +212,320 @@ } }, "overrides": [] - } + }, + "id": 5 + }, + { + "title": "Time to Reach Consensus (p50/p95)", + "description": "Wall-clock time for the network to agree a ledger, from the round_time_ms attribute on consensus.accept. 
This is the real consensus latency, not span overhead.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "tempo", + "uid": "tempo" + }, + "queryType": "traceql", + "query": "{name=\"consensus.accept\" && resource.service.instance.id=~\"$node\"} | quantile_over_time(span.round_time_ms, .5)", + "legendFormat": "P50 Time to Consensus [{{resource.service.instance.id}}]", + "refId": "A" + }, + { + "datasource": { + "type": "tempo", + "uid": "tempo" + }, + "queryType": "traceql", + "refId": "B", + "query": "{name=\"consensus.accept\" && resource.service.instance.id=~\"$node\"} | quantile_over_time(span.round_time_ms, .95)", + "legendFormat": "P95 Time to Consensus [{{resource.service.instance.id}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Time to Consensus (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + }, + "id": 6 + }, + { + "title": "Average Time to Reach Consensus", + "description": "Mean time to agree a ledger (round_time_ms on consensus.accept), averaged over the window. 
Tracks the typical consensus latency trend.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "tempo", + "uid": "tempo" + }, + "queryType": "traceql", + "query": "{name=\"consensus.accept\" && resource.service.instance.id=~\"$node\"} | avg_over_time(span.round_time_ms)", + "legendFormat": "Avg Time to Consensus [{{resource.service.instance.id}}]", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Time to Consensus (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + }, + "id": 7 + }, + { + "title": "Consensus Rounds per Ledger (Establish Count)", + "description": "Number of establish-phase iterations (proposal rounds) before agreement, from establish_count on consensus.establish. 1 = agreed on first proposal; higher = more rounds of re-proposing to converge.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc", + "maxHeight": 500 + }, + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": ["mean", "max", "lastNotNull"] + } + }, + "targets": [ + { + "datasource": { + "type": "tempo", + "uid": "tempo" + }, + "queryType": "traceql", + "query": "{name=\"consensus.establish\" && resource.service.instance.id=~\"$node\"} | avg_over_time(span.establish_count)", + "legendFormat": "Avg Rounds [{{resource.service.instance.id}}]", + "refId": "A" + }, + { + "datasource": { + "type": "tempo", + "uid": "tempo" + }, + "queryType": "traceql", + "query": "{name=\"consensus.establish\" && resource.service.instance.id=~\"$node\"} | max_over_time(span.establish_count)", + "legendFormat": "Max Rounds [{{resource.service.instance.id}}]", + "refId": "B" + } + ], + "fieldConfig": { + 
"defaults": { + "unit": "none", + "custom": { + "axisLabel": "Rounds per Ledger", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + }, + "id": 8 + }, + { + "title": "Previous Round Time per Ledger", + "description": "Duration of the previous consensus round (previous_round_time_ms on consensus.round). Complements time-to-consensus with the prior round's cost.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 32 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "tempo", + "uid": "tempo" + }, + "queryType": "traceql", + "query": "{name=\"consensus.round\" && resource.service.instance.id=~\"$node\"} | quantile_over_time(span.previous_round_time_ms, .95)", + "legendFormat": "P95 Previous Round [{{resource.service.instance.id}}]", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Round Time (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + }, + "id": 9 + }, + { + "title": "Position Update Duration", + "description": "p95 duration of the consensus.update_positions span, which tallies disputes and updates this node's position each round. 
Long durations indicate heavy dispute resolution or slow convergence on close time.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.update_positions\"}[5m])))", + "legendFormat": "P95 Update [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + }, + "id": 10 + }, + { + "title": "Ledger Close Duration", + "description": "p95 duration of the ledger close event. The consensus.ledger_close span (RCLConsensus.cpp) measures the time from when consensus triggers a ledger close to completion. Carries ledger_seq and consensus_mode attributes. 
Compare with Time to Reach Consensus (p50/p95) to understand how close timing relates to overall round time.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 40 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.ledger_close\"}[5m])))", + "legendFormat": "P95 Close Duration [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + }, + "id": 11 + }, + { + "title": "Ledger Apply Duration (doAccept)", + "description": "Time spent applying the consensus result to build a new ledger. Measured by the consensus.accept.apply span in doAccept().", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 40 + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.accept.apply\"}[5m])))", + "legendFormat": "P95 Apply Duration [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + }, + "id": 12 }, { "title": "Consensus Accept Duration Heatmap", @@ -361,8 +534,8 @@ "gridPos": { "h": 8, "w": 12, - "x": 12, - "y": 32 + "x": 0, + "y": 48 }, "options": { "tooltip": { @@ -388,7 +561,42 @@ "defaults": { "unit": "ms" } - } + }, + "id": 13 + }, + { + 
"title": "Close Time Agreement", + "description": "Rate of close time agreement vs disagreement across consensus rounds. Based on close_time_correct attribute (true = validators agreed, false = agreed to disagree per avCT_CONSENSUS_PCT).", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 48 + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (close_time_correct, exported_instance) (rate(traces_span_metrics_calls_total{span_name=\"consensus.accept.apply\", consensus_mode=~\"$consensus_mode\", exported_instance=~\"$node\"}[$__rate_interval]))", + "legendFormat": "Close Time Correct={{close_time_correct}} [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Rounds / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + }, + "id": 14 }, { "title": "Close Time: Raw Proposals (Per Node)", @@ -398,7 +606,7 @@ "h": 8, "w": 12, "x": 0, - "y": 40 + "y": 56 }, "fieldConfig": { "defaults": { @@ -434,7 +642,8 @@ "legendFormat": "Raw Close Time [{{resource.service.instance.id}}]", "refId": "A" } - ] + ], + "id": 15 }, { "title": "Close Time: Effective / Quantized", @@ -444,7 +653,7 @@ "h": 8, "w": 12, "x": 12, - "y": 40 + "y": 56 }, "fieldConfig": { "defaults": { @@ -480,7 +689,8 @@ "legendFormat": "Effective Close Time [{{resource.service.instance.id}}]", "refId": "A" } - ] + ], + "id": 16 }, { "title": "Close Time Vote Bins & Resolution", @@ -490,7 +700,7 @@ "h": 8, "w": 12, "x": 0, - "y": 48 + "y": 64 }, "fieldConfig": { "defaults": { @@ -570,7 +780,8 @@ "legendFormat": "Avg Resolution (ms)", "refId": "B" } - ] + ], + "id": 17 }, { "title": "Close Time Resolution Direction", @@ -580,7 +791,7 @@ "h": 8, "w": 12, "x": 12, - "y": 48 + "y": 64 }, "fieldConfig": { "defaults": { @@ -617,7 +828,8 @@ "legendFormat": "Resolution Direction [{{span.resolution_direction}}]", "refId": "A" } - ] + 
], + "id": 18 }, { "title": "Close Time Bin Distribution", @@ -625,9 +837,9 @@ "type": "barchart", "gridPos": { "h": 8, - "w": 24, + "w": 12, "x": 0, - "y": 56 + "y": 72 }, "fieldConfig": { "defaults": { @@ -664,7 +876,8 @@ "legendFormat": "{{span.close_time_vote_bins}} Vote Bins", "refId": "A" } - ] + ], + "id": 19 }, { "title": "Consensus Outcome Distribution", @@ -672,9 +885,9 @@ "type": "piechart", "gridPos": { "h": 8, - "w": 8, - "x": 0, - "y": 64 + "w": 12, + "x": 12, + "y": 72 }, "options": { "legend": { @@ -700,7 +913,8 @@ "unit": "short" }, "overrides": [] - } + }, + "id": 20 }, { "title": "Consensus Failures Over Time", @@ -708,9 +922,9 @@ "type": "timeseries", "gridPos": { "h": 8, - "w": 16, - "x": 8, - "y": 64 + "w": 12, + "x": 0, + "y": 80 }, "options": { "tooltip": { @@ -746,131 +960,8 @@ } }, "overrides": [] - } - }, - { - "title": "Consensus Round Duration (Full Round)", - "description": "p95 duration of the full consensus round. The consensus.round span (RCLConsensus.cpp startRound) wraps an entire round end-to-end. Filterable by consensus mode. 
This is the single most important consensus-health signal; rising round time precedes ledger-age alarms.", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 72 }, - "options": { - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus" - }, - "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.round\"}[5m])))", - "legendFormat": "P95 Round [{{exported_instance}}]" - } - ], - "fieldConfig": { - "defaults": { - "unit": "ms", - "custom": { - "axisLabel": "Duration (ms)", - "spanNulls": true, - "insertNulls": false, - "showPoints": "auto", - "pointSize": 3 - } - }, - "overrides": [] - } - }, - { - "title": "Consensus Phase Duration (Open vs Establish)", - "description": "p95 duration of the open phase (transaction collection) vs the establish phase (proposal convergence). 
The consensus.phase.open and consensus.establish spans decompose round latency, so an operator can tell whether slowness is in collecting transactions or reaching agreement.", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 72 - }, - "options": { - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus" - }, - "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.phase.open\"}[5m])))", - "legendFormat": "P95 Open Phase [{{exported_instance}}]" - }, - { - "datasource": { - "type": "prometheus" - }, - "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.establish\"}[5m])))", - "legendFormat": "P95 Establish Phase [{{exported_instance}}]" - } - ], - "fieldConfig": { - "defaults": { - "unit": "ms", - "custom": { - "axisLabel": "Duration (ms)", - "spanNulls": true, - "insertNulls": false, - "showPoints": "auto", - "pointSize": 3 - } - }, - "overrides": [] - } - }, - { - "title": "Position Update Duration", - "description": "p95 duration of the consensus.update_positions span, which tallies disputes and updates this node's position each round. 
Long durations indicate heavy dispute resolution or slow convergence on close time.", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 80 - }, - "options": { - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus" - }, - "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.update_positions\"}[5m])))", - "legendFormat": "P95 Update [{{exported_instance}}]" - } - ], - "fieldConfig": { - "defaults": { - "unit": "ms", - "custom": { - "axisLabel": "Duration (ms)", - "spanNulls": true, - "insertNulls": false, - "showPoints": "auto", - "pointSize": 3 - } - }, - "overrides": [] - } + "id": 21 }, { "title": "Consensus Stall Rate", @@ -916,7 +1007,8 @@ } }, "overrides": [] - } + }, + "id": 22 }, { "title": "Consensus Mode-Change Rate by Target Mode", @@ -924,7 +1016,7 @@ "type": "timeseries", "gridPos": { "h": 8, - "w": 24, + "w": 12, "x": 0, "y": 88 }, @@ -955,7 +1047,8 @@ } }, "overrides": [] - } + }, + "id": 23 }, { "title": "Ledger History Mismatch Rate by Reason", @@ -963,9 +1056,9 @@ "type": "timeseries", "gridPos": { "h": 8, - "w": 24, - "x": 0, - "y": 96 + "w": 12, + "x": 12, + "y": 88 }, "options": { "tooltip": { @@ -994,7 +1087,8 @@ } }, "overrides": [] - } + }, + "id": 24 } ], "schemaVersion": 39, From a23d83f39350ff16df77fe10e5c1105c6cc3d2d9 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Fri, 5 Jun 2026 19:36:34 +0100 Subject: [PATCH 09/10] docs(telemetry): add ledger.acquire to 09-doc + fix peer-quality dashboard metric prefix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 9 introduces the ledger.acquire span (InboundLedger fetch) that phases 7-8 do not have, so the forward-merged 09-data-collection-reference inventory is extended here: - §1.1: add 
ledger.acquire to the Ledger span table. - §1.2: add its attributes (acquire_reason, timeouts, peer_count, outcome) and note it also sets ledger_seq; bump the span count. Also fix two stale StatsD metric references in the Peer Quality dashboard (xrpld-peer-quality.json): rippled_Peer_Finder_Active_{Inbound,Outbound}_Peers -> xrpld_Peer_Finder_* to match the xrpld_ metric prefix the rest of the stack uses. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../09-data-collection-reference.md | 18 ++++++++++++------ .../grafana/dashboards/xrpld-peer-quality.json | 4 ++-- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/OpenTelemetryPlan/09-data-collection-reference.md b/OpenTelemetryPlan/09-data-collection-reference.md index c64d4ddea7..8d2768f033 100644 --- a/OpenTelemetryPlan/09-data-collection-reference.md +++ b/OpenTelemetryPlan/09-data-collection-reference.md @@ -78,7 +78,7 @@ There are two independent telemetry pipelines entering a single **OTel Collector ## 1. OpenTelemetry Spans -### 1.1 Complete Span Inventory (~36 spans) +### 1.1 Complete Span Inventory (~37 spans) > **See also**: [02-design-decisions.md §2.3](./02-design-decisions.md#23-span-naming-conventions) for naming conventions and the full span catalog with rationale. [04-code-samples.md §4.6](./04-code-samples.md#46-span-flow-visualization) for span flow diagrams. @@ -193,11 +193,12 @@ round trace via context propagation rather than direct parenting. The Controlled by `trace_ledger=1` in `[telemetry]` config. 
-| Span Name | Parent | Source File | Description | -| ----------------- | ------ | ---------------- | ---------------------------------------------- | -| `ledger.build` | — | BuildLedger.cpp | Build new ledger from accepted transaction set | -| `ledger.validate` | — | LedgerMaster.cpp | Ledger promoted to validated status | -| `ledger.store` | — | LedgerMaster.cpp | Ledger stored to database/history | +| Span Name | Parent | Source File | Description | +| ----------------- | ------ | ----------------- | ---------------------------------------------- | +| `ledger.build` | — | BuildLedger.cpp | Build new ledger from accepted transaction set | +| `ledger.validate` | — | LedgerMaster.cpp | Ledger promoted to validated status | +| `ledger.store` | — | LedgerMaster.cpp | Ledger stored to database/history | +| `ledger.acquire` | — | InboundLedger.cpp | Fetch a missing ledger from peers | **Where to find**: Tempo → TraceQL: `{resource.service.name="xrpld" && name=~"ledger.*"}` @@ -382,9 +383,14 @@ aggregation. Per the 2026-05-13 naming redesign, span-attribute keys use the | `tx_count` | int64 | `tx.apply` | Transactions applied to the ledger | | `tx_failed` | int64 | `tx.apply` | Failed transactions in the apply set | | `validations` | int64 | `ledger.validate` | Number of validations received for this ledger | +| `acquire_reason` | string | `ledger.acquire` | Why the ledger fetch was triggered | +| `timeouts` | int64 | `ledger.acquire` | Number of fetch timeouts | +| `peer_count` | int64 | `ledger.acquire` | Peers queried during the fetch | +| `outcome` | string | `ledger.acquire` | Fetch outcome | The apply-step span `tx.apply` (child of `ledger.build`) carries `tx_count`/`tx_failed`; the parent `ledger.build` carries `ledger_seq` and the close-time attributes. +`ledger.acquire` (InboundLedger) also sets `ledger_seq`. **Tempo query**: `{span.ledger_seq=12345}` to find all spans for a specific ledger. 
diff --git a/docker/telemetry/grafana/dashboards/xrpld-peer-quality.json b/docker/telemetry/grafana/dashboards/xrpld-peer-quality.json index 6ffc83bcd9..72995c7996 100644 --- a/docker/telemetry/grafana/dashboards/xrpld-peer-quality.json +++ b/docker/telemetry/grafana/dashboards/xrpld-peer-quality.json @@ -303,14 +303,14 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_Peer_Finder_Active_Inbound_Peers{exported_instance=~\"$node\"}", + "expr": "xrpld_Peer_Finder_Active_Inbound_Peers{exported_instance=~\"$node\"}", "legendFormat": "Inbound [{{exported_instance}}]" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_Peer_Finder_Active_Outbound_Peers{exported_instance=~\"$node\"}", + "expr": "xrpld_Peer_Finder_Active_Outbound_Peers{exported_instance=~\"$node\"}", "legendFormat": "Outbound [{{exported_instance}}]" } ], From 2ee4d2ff2d7fb29bb3266e11d15b0f64b0f16d7e Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Fri, 5 Jun 2026 19:40:41 +0100 Subject: [PATCH 10/10] fix(telemetry): show consensus rounds as integer distribution, widen table panels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Number of rounds is an integer, but avg_over_time(establish_count) produced fractional values (2.1). Switch the Rounds panel to count_over_time() by (span.establish_count): one integer series per round count (1/2/3...), showing how many ledgers needed that many establish rounds — the meaningful distribution, inherently integer (decimals=0). Apply dashboard rule 9: panels with a right-side table legend take full width. Widen "Consensus Rounds per Ledger" and "Consensus Outcome Distribution" to w=24 and re-tile the dashboard. Verified via the Grafana proxy: rounds=2 dominates (~11-12 ledgers), rounds=3 occasional. 
Co-Authored-By: Claude Opus 4.8 --- .../grafana/dashboards/consensus-health.json | 65 ++++++++----------- 1 file changed, 28 insertions(+), 37 deletions(-) diff --git a/docker/telemetry/grafana/dashboards/consensus-health.json b/docker/telemetry/grafana/dashboards/consensus-health.json index a13b4c3e63..419db64402 100644 --- a/docker/telemetry/grafana/dashboards/consensus-health.json +++ b/docker/telemetry/grafana/dashboards/consensus-health.json @@ -313,13 +313,13 @@ }, { "title": "Consensus Rounds per Ledger (Establish Count)", - "description": "Number of establish-phase iterations (proposal rounds) before agreement, from establish_count on consensus.establish. 1 = agreed on first proposal; higher = more rounds of re-proposing to converge.", + "description": "Distribution of ledgers by the number of establish-phase rounds (establish_count) needed to converge. Each series is an integer round count: 1 = agreed on the first proposal, 2/3 = needed more rounds. Watch for a shift toward higher counts under disagreement.", "type": "timeseries", "gridPos": { "h": 8, - "w": 12, - "x": 12, - "y": 24 + "w": 24, + "x": 0, + "y": 32 }, "options": { "tooltip": { @@ -340,31 +340,22 @@ "uid": "tempo" }, "queryType": "traceql", - "query": "{name=\"consensus.establish\" && resource.service.instance.id=~\"$node\"} | avg_over_time(span.establish_count)", - "legendFormat": "Avg Rounds [{{resource.service.instance.id}}]", - "refId": "A" - }, - { - "datasource": { - "type": "tempo", - "uid": "tempo" - }, - "queryType": "traceql", - "query": "{name=\"consensus.establish\" && resource.service.instance.id=~\"$node\"} | max_over_time(span.establish_count)", - "legendFormat": "Max Rounds [{{resource.service.instance.id}}]", - "refId": "B" + "refId": "A", + "query": "{name=\"consensus.establish\" && resource.service.instance.id=~\"$node\"} | count_over_time() by (span.establish_count)", + "legendFormat": "{{span.establish_count}} Rounds" } ], "fieldConfig": { "defaults": { "unit": "none", 
"custom": { - "axisLabel": "Rounds per Ledger", + "axisLabel": "Ledgers in Window", "spanNulls": true, "insertNulls": false, "showPoints": "auto", "pointSize": 3 - } + }, + "decimals": 0 }, "overrides": [] }, @@ -378,7 +369,7 @@ "h": 8, "w": 12, "x": 0, - "y": 32 + "y": 40 }, "options": { "tooltip": { @@ -421,7 +412,7 @@ "h": 8, "w": 12, "x": 12, - "y": 32 + "y": 40 }, "options": { "tooltip": { @@ -461,7 +452,7 @@ "h": 8, "w": 12, "x": 0, - "y": 40 + "y": 48 }, "options": { "tooltip": { @@ -501,7 +492,7 @@ "h": 8, "w": 12, "x": 12, - "y": 40 + "y": 48 }, "targets": [ { @@ -535,7 +526,7 @@ "h": 8, "w": 12, "x": 0, - "y": 48 + "y": 56 }, "options": { "tooltip": { @@ -572,7 +563,7 @@ "h": 8, "w": 12, "x": 12, - "y": 48 + "y": 56 }, "targets": [ { @@ -606,7 +597,7 @@ "h": 8, "w": 12, "x": 0, - "y": 56 + "y": 64 }, "fieldConfig": { "defaults": { @@ -653,7 +644,7 @@ "h": 8, "w": 12, "x": 12, - "y": 56 + "y": 64 }, "fieldConfig": { "defaults": { @@ -700,7 +691,7 @@ "h": 8, "w": 12, "x": 0, - "y": 64 + "y": 72 }, "fieldConfig": { "defaults": { @@ -791,7 +782,7 @@ "h": 8, "w": 12, "x": 12, - "y": 64 + "y": 72 }, "fieldConfig": { "defaults": { @@ -839,7 +830,7 @@ "h": 8, "w": 12, "x": 0, - "y": 72 + "y": 80 }, "fieldConfig": { "defaults": { @@ -885,9 +876,9 @@ "type": "piechart", "gridPos": { "h": 8, - "w": 12, - "x": 12, - "y": 72 + "w": 24, + "x": 0, + "y": 88 }, "options": { "legend": { @@ -924,7 +915,7 @@ "h": 8, "w": 12, "x": 0, - "y": 80 + "y": 96 }, "options": { "tooltip": { @@ -971,7 +962,7 @@ "h": 8, "w": 12, "x": 12, - "y": 80 + "y": 96 }, "options": { "tooltip": { @@ -1018,7 +1009,7 @@ "h": 8, "w": 12, "x": 0, - "y": 88 + "y": 104 }, "options": { "tooltip": { @@ -1058,7 +1049,7 @@ "h": 8, "w": 12, "x": 12, - "y": 88 + "y": 104 }, "options": { "tooltip": {