From 3b22d73bef4299e8567c4313ad9ae96d76d65db6 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Fri, 6 Mar 2026 17:21:22 +0000 Subject: [PATCH] Add consensus close time panels and docs for accept.apply span - Add Ledger Apply Duration and Close Time Agreement panels to consensus-health dashboard - Add consensus.accept.apply to telemetry runbook with TraceQL queries for close time disagreements and consensus failures - Add span to TESTING.md expected span catalog and verification loop Co-Authored-By: Claude Opus 4.6 --- docker/telemetry/TESTING.md | 28 +++++----- .../grafana/dashboards/consensus-health.json | 43 +++++++++++++++ docs/telemetry-runbook.md | 54 +++++++++++++------ 3 files changed, 95 insertions(+), 30 deletions(-) diff --git a/docker/telemetry/TESTING.md b/docker/telemetry/TESTING.md index d93f2256da..4d9853e8f7 100644 --- a/docker/telemetry/TESTING.md +++ b/docker/telemetry/TESTING.md @@ -372,18 +372,19 @@ See the "Verification Queries" section below. All 12 production span names instrumented across Phases 2-4: -| Span Name | Source File | Phase | Key Attributes | How to Trigger | -| --------------------------- | --------------------- | ----- | ---------------------------------------------------------- | ------------------------- | -| `rpc.request` | ServerHandler.cpp:271 | 2 | -- | Any HTTP RPC call | -| `rpc.process` | ServerHandler.cpp:573 | 2 | -- | Any HTTP RPC call | -| `rpc.ws_message` | ServerHandler.cpp:384 | 2 | -- | WebSocket RPC message | -| `rpc.command.` | RPCHandler.cpp:161 | 2 | `xrpl.rpc.command`, `xrpl.rpc.version`, `xrpl.rpc.role` | Any RPC command | -| `tx.process` | NetworkOPs.cpp:1227 | 3 | `xrpl.tx.hash`, `xrpl.tx.local`, `xrpl.tx.path` | Submit transaction | -| `tx.receive` | PeerImp.cpp:1273 | 3 | `xrpl.peer.id` | Peer relays transaction | -| `consensus.proposal.send` | RCLConsensus.cpp:177 | 4 | `xrpl.consensus.round` | Consensus proposing phase | -| `consensus.ledger_close` | RCLConsensus.cpp:282 | 4 | `xrpl.consensus.ledger.seq`, `xrpl.consensus.mode` | Ledger close event | -| `consensus.accept` | RCLConsensus.cpp:395 | 4 | `xrpl.consensus.proposers`, `xrpl.consensus.round_time_ms` | Ledger accepted | -| `consensus.validation.send` | RCLConsensus.cpp:753 | 4 | `xrpl.consensus.ledger.seq`, `xrpl.consensus.proposing` | Validation sent | +| Span Name | Source File | Phase | Key Attributes | How to Trigger | +| --------------------------- | --------------------- | ----- | --------------------------------------------------------------------------------- | ------------------------- | +| `rpc.request` | ServerHandler.cpp:271 | 2 | -- | Any HTTP RPC call | +| `rpc.process` | ServerHandler.cpp:573 | 2 | -- | Any HTTP RPC call | +| `rpc.ws_message` | ServerHandler.cpp:384 | 2 | -- | WebSocket RPC message | +| `rpc.command.` | RPCHandler.cpp:161 | 2 | `xrpl.rpc.command`, `xrpl.rpc.version`, `xrpl.rpc.role` | Any RPC command | +| `tx.process` | NetworkOPs.cpp:1227 | 3 | `xrpl.tx.hash`, `xrpl.tx.local`, `xrpl.tx.path` | Submit transaction | +| `tx.receive` | PeerImp.cpp:1273 | 3 | `xrpl.peer.id` | Peer relays transaction | +| `consensus.proposal.send` | RCLConsensus.cpp:177 | 4 | `xrpl.consensus.round` | Consensus proposing phase | +| `consensus.ledger_close` | RCLConsensus.cpp:282 | 4 | `xrpl.consensus.ledger.seq`, `xrpl.consensus.mode` | Ledger close event | +| `consensus.accept` | RCLConsensus.cpp:395 | 4 | `xrpl.consensus.proposers`, `xrpl.consensus.round_time_ms` | Ledger accepted | +| `consensus.validation.send` | RCLConsensus.cpp:753 | 4 | `xrpl.consensus.ledger.seq`, `xrpl.consensus.proposing` | Validation sent | +| `consensus.accept.apply` | RCLConsensus.cpp:453 | 4 | `xrpl.consensus.close_time`, `close_time_correct`, `close_resolution_ms`, `state` | Ledger apply + close time | --- @@ -407,7 +408,8 @@ for op in "rpc.request" "rpc.process" \ "rpc.command.server_info" "rpc.command.server_state" "rpc.command.ledger" \ "tx.process" "tx.receive" \ "consensus.proposal.send" "consensus.ledger_close" \ - "consensus.accept" "consensus.validation.send"; do + "consensus.accept" "consensus.accept.apply" \ + "consensus.validation.send"; do count=$(curl -s "$JAEGER/api/traces?service=rippled&operation=$op&limit=5&lookback=1h" \ | jq '.data | length') printf "%-35s %s traces\n" "$op" "$count" diff --git a/docker/telemetry/grafana/dashboards/consensus-health.json b/docker/telemetry/grafana/dashboards/consensus-health.json index 5a03238e15..d9fe94248f 100644 --- a/docker/telemetry/grafana/dashboards/consensus-health.json +++ b/docker/telemetry/grafana/dashboards/consensus-health.json @@ -82,6 +82,49 @@ }, "overrides": [] } + }, + { + "title": "Ledger Apply Duration (doAccept)", + "description": "Time spent applying the consensus result to build a new ledger. Measured by the consensus.accept.apply span in doAccept().", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }, + "targets": [ + { + "datasource": { "type": "prometheus" }, + "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"consensus.accept.apply\"}[5m])))", + "legendFormat": "p95 apply duration" + }, + { + "datasource": { "type": "prometheus" }, + "expr": "histogram_quantile(0.50, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"consensus.accept.apply\"}[5m])))", + "legendFormat": "p50 apply duration" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms" + }, + "overrides": [] + } + }, + { + "title": "Close Time Agreement", + "description": "Rate of close time agreement vs disagreement across consensus rounds. Based on xrpl.consensus.close_time_correct attribute (true = validators agreed, false = agreed to disagree per avCT_CONSENSUS_PCT).", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }, + "targets": [ + { + "datasource": { "type": "prometheus" }, + "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"consensus.accept.apply\"}[5m]))", + "legendFormat": "total rounds/sec" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + } } ], "schemaVersion": 39, diff --git a/docs/telemetry-runbook.md b/docs/telemetry-runbook.md index ebb403e184..8d798e3353 100644 --- a/docs/telemetry-runbook.md +++ b/docs/telemetry-runbook.md @@ -78,12 +78,29 @@ All spans instrumented in rippled, grouped by subsystem: ### Consensus Spans (Phase 4) -| Span Name | Source File | Attributes | Description | -| --------------------------- | -------------------- | ---------------------------------------------------------- | ---------------------------- | -| `consensus.proposal.send` | RCLConsensus.cpp:177 | `xrpl.consensus.round` | Consensus proposal broadcast | -| `consensus.ledger_close` | RCLConsensus.cpp:282 | `xrpl.consensus.ledger.seq`, `xrpl.consensus.mode` | Ledger close event | -| `consensus.accept` | RCLConsensus.cpp:395 | `xrpl.consensus.proposers`, `xrpl.consensus.round_time_ms` | Ledger accepted by consensus | -| `consensus.validation.send` | RCLConsensus.cpp:753 | `xrpl.consensus.ledger.seq`, `xrpl.consensus.proposing` | Validation sent after accept | +| Span Name | Source File | Attributes | Description | +| --------------------------- | -------------------- | ----------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------ | +| `consensus.proposal.send` | RCLConsensus.cpp:177 | `xrpl.consensus.round` | Consensus proposal broadcast | +| `consensus.ledger_close` | RCLConsensus.cpp:282 | `xrpl.consensus.ledger.seq`, `xrpl.consensus.mode` | Ledger close event | +| `consensus.accept` | RCLConsensus.cpp:395 | `xrpl.consensus.proposers`, `xrpl.consensus.round_time_ms` | Ledger accepted by consensus | +| `consensus.validation.send` | RCLConsensus.cpp:753 | `xrpl.consensus.ledger.seq`, `xrpl.consensus.proposing` | Validation sent after accept | +| `consensus.accept.apply` | RCLConsensus.cpp:453 | `xrpl.consensus.close_time`, `close_time_correct`, `close_resolution_ms`, `state`, `proposing`, `round_time_ms`, `ledger.seq` | Ledger application with close time details | + +#### Close Time Queries (Tempo TraceQL) + +``` +# Find rounds where validators disagreed on close time +{name="consensus.accept.apply"} | xrpl.consensus.close_time_correct = false + +# Find consensus failures (moved_on) +{name="consensus.accept.apply"} | xrpl.consensus.state = "moved_on" + +# Find slow ledger applications (>5s) +{name="consensus.accept.apply"} | duration > 5s + +# Find specific ledger's consensus details +{name="consensus.accept.apply"} | xrpl.consensus.ledger.seq = 92345678 +``` ## Prometheus Metrics (Spanmetrics) @@ -156,20 +173,23 @@ Three dashboards are pre-provisioned in `docker/telemetry/grafana/dashboards/`: | Consensus Proposals Sent Rate | timeseries | `rate(traces_span_metrics_calls_total{span_name="consensus.proposal.send"}[5m])` | — | | Ledger Close Duration | timeseries | `histogram_quantile(0.95, ... {span_name="consensus.ledger_close"})` | — | | Validation Send Rate | stat | `rate(traces_span_metrics_calls_total{span_name="consensus.validation.send"}[5m])` | — | +| Ledger Apply Duration | timeseries | `histogram_quantile(0.95 / 0.50, ... {span_name="consensus.accept.apply"})` | — | +| Close Time Agreement | timeseries | `rate(traces_span_metrics_calls_total{span_name="consensus.accept.apply"}[5m])` | — | ### Span → Metric → Dashboard Summary -| Span Name | Prometheus Metric Filter | Grafana Dashboard | -| --------------------------- | ----------------------------------------- | ---------------------------------- | -| `rpc.request` | `{span_name="rpc.request"}` | — (available but not paneled) | -| `rpc.process` | `{span_name="rpc.process"}` | — (available but not paneled) | -| `rpc.command.*` | `{span_name=~"rpc.command.*"}` | RPC Performance (all 4 panels) | -| `tx.process` | `{span_name="tx.process"}` | Transaction Overview (3 panels) | -| `tx.receive` | `{span_name="tx.receive"}` | Transaction Overview (2 panels) | -| `consensus.accept` | `{span_name="consensus.accept"}` | Consensus Health (Round Duration) | -| `consensus.proposal.send` | `{span_name="consensus.proposal.send"}` | Consensus Health (Proposals Rate) | -| `consensus.ledger_close` | `{span_name="consensus.ledger_close"}` | Consensus Health (Close Duration) | -| `consensus.validation.send` | `{span_name="consensus.validation.send"}` | Consensus Health (Validation Rate) | +| Span Name | Prometheus Metric Filter | Grafana Dashboard | +| --------------------------- | ----------------------------------------- | --------------------------------------------- | +| `rpc.request` | `{span_name="rpc.request"}` | — (available but not paneled) | +| `rpc.process` | `{span_name="rpc.process"}` | — (available but not paneled) | +| `rpc.command.*` | `{span_name=~"rpc.command.*"}` | RPC Performance (all 4 panels) | +| `tx.process` | `{span_name="tx.process"}` | Transaction Overview (3 panels) | +| `tx.receive` | `{span_name="tx.receive"}` | Transaction Overview (2 panels) | +| `consensus.accept` | `{span_name="consensus.accept"}` | Consensus Health (Round Duration) | +| `consensus.proposal.send` | `{span_name="consensus.proposal.send"}` | Consensus Health (Proposals Rate) | +| `consensus.ledger_close` | `{span_name="consensus.ledger_close"}` | Consensus Health (Close Duration) | +| `consensus.validation.send` | `{span_name="consensus.validation.send"}` | Consensus Health (Validation Rate) | +| `consensus.accept.apply` | `{span_name="consensus.accept.apply"}` | Consensus Health (Apply Duration, Close Time) | ## Troubleshooting