From 61c2760296f7387b6debdbf0e3a663d5c6be2ee8 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Thu, 4 Jun 2026 14:32:13 +0100 Subject: [PATCH 01/16] cosmetic updates Signed-off-by: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> --- .../grafana/dashboards/system-node-health.json | 16 ++++++++-------- .../grafana/dashboards/xrpld-fee-market.json | 2 +- .../grafana/dashboards/xrpld-job-queue.json | 2 +- .../grafana/dashboards/xrpld-peer-quality.json | 2 +- .../grafana/dashboards/xrpld-rpc-perf-otel.json | 2 +- .../dashboards/xrpld-validator-health.json | 8 ++++---- 6 files changed, 16 insertions(+), 16 deletions(-) diff --git a/docker/telemetry/grafana/dashboards/system-node-health.json b/docker/telemetry/grafana/dashboards/system-node-health.json index 9247c33745..29a1785419 100644 --- a/docker/telemetry/grafana/dashboards/system-node-health.json +++ b/docker/telemetry/grafana/dashboards/system-node-health.json @@ -405,7 +405,7 @@ } }, { - "title": "--- OTel: NodeStore I/O ---", + "title": "OTel: NodeStore I/O", "type": "row", "gridPos": { "h": 1, @@ -646,7 +646,7 @@ } }, { - "title": "--- OTel: Cache Hit Rates ---", + "title": "OTel: Cache Hit Rates", "type": "row", "gridPos": { "h": 1, @@ -781,7 +781,7 @@ } }, { - "title": "--- OTel: Object Instance Counts ---", + "title": "OTel: Object Instance Counts", "type": "row", "gridPos": { "h": 1, @@ -839,7 +839,7 @@ } }, { - "title": "--- OTel: Server Info ---", + "title": "OTel: Server Info", "type": "row", "gridPos": { "h": 1, @@ -1185,7 +1185,7 @@ } }, { - "title": "--- OTel: Complete Ledgers & DB ---", + "title": "OTel: Complete Ledgers & DB", "type": "row", "gridPos": { "h": 1, @@ -1357,7 +1357,7 @@ } }, { - "title": "--- OTel: Ledger Economy ---", + "title": "OTel: Ledger Economy", "type": "row", "gridPos": { "h": 1, @@ -1562,7 +1562,7 @@ } }, { - "title": "--- Extended Metrics (Recovered from Phase 6) ---", + "title": "Extended Metrics
(Recovered from Phase 6)", "type": "row", "gridPos": { "h": 1, @@ -2054,7 +2054,7 @@ } ], "schemaVersion": 39, - "tags": ["rippled", "statsd", "otel", "node-health", "telemetry"], + "tags": ["rippled", "statsd", "node-health", "telemetry"], "templating": { "list": [ { diff --git a/docker/telemetry/grafana/dashboards/xrpld-fee-market.json b/docker/telemetry/grafana/dashboards/xrpld-fee-market.json index 474853d396..1e782bed1b 100644 --- a/docker/telemetry/grafana/dashboards/xrpld-fee-market.json +++ b/docker/telemetry/grafana/dashboards/xrpld-fee-market.json @@ -306,7 +306,7 @@ } ], "schemaVersion": 39, - "tags": ["rippled", "otel", "fee-market"], + "tags": ["rippled", "fee-market"], "templating": { "list": [ { diff --git a/docker/telemetry/grafana/dashboards/xrpld-job-queue.json b/docker/telemetry/grafana/dashboards/xrpld-job-queue.json index e99ae06f22..6d90b0769d 100644 --- a/docker/telemetry/grafana/dashboards/xrpld-job-queue.json +++ b/docker/telemetry/grafana/dashboards/xrpld-job-queue.json @@ -446,7 +446,7 @@ } ], "schemaVersion": 39, - "tags": ["rippled", "otel", "job-queue"], + "tags": ["rippled", "job-queue"], "templating": { "list": [ { diff --git a/docker/telemetry/grafana/dashboards/xrpld-peer-quality.json b/docker/telemetry/grafana/dashboards/xrpld-peer-quality.json index 394066a92a..af0318e10b 100644 --- a/docker/telemetry/grafana/dashboards/xrpld-peer-quality.json +++ b/docker/telemetry/grafana/dashboards/xrpld-peer-quality.json @@ -363,7 +363,7 @@ } ], "schemaVersion": 39, - "tags": ["rippled", "otel", "peer", "network", "telemetry"], + "tags": ["rippled", "peer", "network", "telemetry"], "templating": { "list": [ { diff --git a/docker/telemetry/grafana/dashboards/xrpld-rpc-perf-otel.json b/docker/telemetry/grafana/dashboards/xrpld-rpc-perf-otel.json index a6686de72c..2481b61215 100644 --- a/docker/telemetry/grafana/dashboards/xrpld-rpc-perf-otel.json +++ b/docker/telemetry/grafana/dashboards/xrpld-rpc-perf-otel.json @@ -394,7 +394,7 @@ } ], 
"schemaVersion": 39, - "tags": ["rippled", "otel", "rpc"], + "tags": ["rippled", "rpc"], "templating": { "list": [ { diff --git a/docker/telemetry/grafana/dashboards/xrpld-validator-health.json b/docker/telemetry/grafana/dashboards/xrpld-validator-health.json index 55bd9cedac..bfb84e9e14 100644 --- a/docker/telemetry/grafana/dashboards/xrpld-validator-health.json +++ b/docker/telemetry/grafana/dashboards/xrpld-validator-health.json @@ -10,7 +10,7 @@ "links": [], "panels": [ { - "title": "--- Validation Agreement ---", + "title": "Validation Agreement", "type": "row", "gridPos": { "h": 1, @@ -256,7 +256,7 @@ } }, { - "title": "--- Validation Rates ---", + "title": "Validation Rates", "type": "row", "gridPos": { "h": 1, @@ -597,7 +597,7 @@ } }, { - "title": "--- Server State & Consensus ---", + "title": "Server State & Consensus", "type": "row", "gridPos": { "h": 1, @@ -819,7 +819,7 @@ } ], "schemaVersion": 39, - "tags": ["rippled", "otel", "validator", "health", "telemetry"], + "tags": ["rippled", "validator", "health", "telemetry"], "templating": { "list": [ { From ebc5c5ed9d9e4b87592e2c95ab86d1f605a5a958 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Thu, 4 Jun 2026 14:36:04 +0100 Subject: [PATCH 02/16] fix(telemetry): set service_instance_id in [insight] so dashboards filter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit beast::insight metrics exported via OTLP carried no exported_instance label because [insight] omitted service_instance_id (only [telemetry] set it). Every system-* dashboard filters insight metrics with exported_instance=~"$node", and the $node template variable is sourced from label_values(..., exported_instance) — so with the label absent, $node was empty and all insight-backed panels showed no data. Add service_instance_id to [insight] in both telemetry configs, matching the [telemetry] value (xrpld-mainnet / xrpld-devnet). 
CollectorManager already reads this key and passes it to OTelCollector, which sets the service.instance.id resource attribute. Co-Authored-By: Claude Opus 4.6 --- docker/telemetry/xrpld-telemetry-mainnet.cfg | 5 +++++ docker/telemetry/xrpld-telemetry.cfg | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/docker/telemetry/xrpld-telemetry-mainnet.cfg b/docker/telemetry/xrpld-telemetry-mainnet.cfg index 5a6a4ee639..4393ad398a 100644 --- a/docker/telemetry/xrpld-telemetry-mainnet.cfg +++ b/docker/telemetry/xrpld-telemetry-mainnet.cfg @@ -111,6 +111,11 @@ docker/telemetry/data server=otel endpoint=http://localhost:4318/v1/metrics prefix=xrpld +# Sets the OTel service.instance.id resource attribute, which Prometheus +# exposes as the `exported_instance` label. Dashboards filter on it via the +# $node template variable, so without this every insight-backed panel is +# empty. Matches [telemetry] service_instance_id for a single node identity. +service_instance_id=xrpld-mainnet # --- OpenTelemetry tracing -------------------------------------------------- diff --git a/docker/telemetry/xrpld-telemetry.cfg b/docker/telemetry/xrpld-telemetry.cfg index 453fe50439..a302faa2f5 100644 --- a/docker/telemetry/xrpld-telemetry.cfg +++ b/docker/telemetry/xrpld-telemetry.cfg @@ -109,6 +109,11 @@ docker/telemetry/data server=otel endpoint=http://localhost:4318/v1/metrics prefix=xrpld +# Sets the OTel service.instance.id resource attribute, which Prometheus +# exposes as the `exported_instance` label. Dashboards filter on it via the +# $node template variable, so without this every insight-backed panel is +# empty. Matches [telemetry] service_instance_id for a single node identity. 
+service_instance_id=xrpld-devnet # --- OpenTelemetry tracing -------------------------------------------------- From 80800ee1303da66a97ce5125a6f3ec81a2c5cc6a Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Thu, 4 Jun 2026 14:37:09 +0100 Subject: [PATCH 03/16] use image-renderer in Grafana Signed-off-by: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> --- docker/telemetry/docker-compose.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docker/telemetry/docker-compose.yml b/docker/telemetry/docker-compose.yml index b296a0e1d9..b0d665f58c 100644 --- a/docker/telemetry/docker-compose.yml +++ b/docker/telemetry/docker-compose.yml @@ -108,7 +108,11 @@ services: - loki networks: - xrpld-telemetry - + renderer: + image: grafana/grafana-image-renderer:latest + environment: + GF_RENDERING_SERVER_URL: http://renderer:8081/render + GF_RENDERING_CALLBACK_URL: http://grafana:3000/ # Named volume for Tempo trace storage (WAL and compacted blocks). # Data persists across container restarts.
Remove with: # docker compose -f docker/telemetry/docker-compose.yml down -v From e6643a43891d072945de2c6893163dd668473727 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Thu, 4 Jun 2026 14:46:57 +0100 Subject: [PATCH 04/16] updated tags Signed-off-by: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> --- docker/telemetry/docker-compose.yml | 16 +++++++++++++--- .../grafana/dashboards/consensus-health.json | 2 +- .../grafana/dashboards/ledger-operations.json | 2 +- .../grafana/dashboards/peer-network.json | 2 +- .../grafana/dashboards/rpc-performance.json | 2 +- .../dashboards/system-ledger-data-sync.json | 2 +- .../dashboards/system-network-traffic.json | 2 +- .../grafana/dashboards/system-node-health.json | 2 +- .../system-overlay-traffic-detail.json | 2 +- .../dashboards/system-rpc-pathfinding.json | 2 +- .../grafana/dashboards/transaction-overview.json | 2 +- .../grafana/dashboards/xrpld-fee-market.json | 2 +- .../grafana/dashboards/xrpld-job-queue.json | 2 +- .../grafana/dashboards/xrpld-peer-quality.json | 2 +- .../grafana/dashboards/xrpld-rpc-perf-otel.json | 2 +- .../dashboards/xrpld-validator-health.json | 2 +- 16 files changed, 28 insertions(+), 18 deletions(-) diff --git a/docker/telemetry/docker-compose.yml b/docker/telemetry/docker-compose.yml index b0d665f58c..becbbb811b 100644 --- a/docker/telemetry/docker-compose.yml +++ b/docker/telemetry/docker-compose.yml @@ -96,6 +96,11 @@ services: environment: - GF_AUTH_ANONYMOUS_ENABLED=true # No login required for local dev - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin # Full access without auth + # Remote image rendering: point Grafana at the renderer container. + # These belong on the grafana service (the server delegates renders); + # the callback URL is how the renderer fetches the panel from grafana. 
+ - GF_RENDERING_SERVER_URL=http://renderer:8081/render + - GF_RENDERING_CALLBACK_URL=http://grafana:3000/ ports: - "3000:3000" # Grafana web UI volumes: @@ -106,13 +111,18 @@ services: - tempo - prometheus - loki + - renderer networks: - xrpld-telemetry + + # Grafana image renderer: a sidecar that renders panels/dashboards to PNG + # for image export and alerting. Grafana calls it at http://renderer:8081. renderer: image: grafana/grafana-image-renderer:latest - environment: - GF_RENDERING_SERVER_URL: http://renderer:8081/render - GF_RENDERING_CALLBACK_URL: http://grafana:3000/ + ports: + - "8081:8081" # Renderer HTTP endpoint (called by grafana) + networks: + - xrpld-telemetry # Named volume for Tempo trace storage (WAL and compacted blocks). # Data persists across container restarts. Remove with: # docker compose -f docker/telemetry/docker-compose.yml down -v diff --git a/docker/telemetry/grafana/dashboards/consensus-health.json b/docker/telemetry/grafana/dashboards/consensus-health.json index 0c6914099f..0387d7f8ea 100644 --- a/docker/telemetry/grafana/dashboards/consensus-health.json +++ b/docker/telemetry/grafana/dashboards/consensus-health.json @@ -827,7 +827,7 @@ } ], "schemaVersion": 39, - "tags": ["rippled", "consensus", "telemetry"], + "tags": ["xrpld", "consensus"], "templating": { "list": [ { diff --git a/docker/telemetry/grafana/dashboards/ledger-operations.json b/docker/telemetry/grafana/dashboards/ledger-operations.json index 2ae55fe875..6bbd6d17d0 100644 --- a/docker/telemetry/grafana/dashboards/ledger-operations.json +++ b/docker/telemetry/grafana/dashboards/ledger-operations.json @@ -319,7 +319,7 @@ } ], "schemaVersion": 39, - "tags": ["rippled", "ledger", "telemetry"], + "tags": ["xrpld", "ledger"], "templating": { "list": [ { diff --git a/docker/telemetry/grafana/dashboards/peer-network.json b/docker/telemetry/grafana/dashboards/peer-network.json index 1ebea03c72..ff3bd53c93 100644 --- a/docker/telemetry/grafana/dashboards/peer-network.json +++ 
b/docker/telemetry/grafana/dashboards/peer-network.json @@ -153,7 +153,7 @@ } ], "schemaVersion": 39, - "tags": ["rippled", "peer", "telemetry"], + "tags": ["xrpld", "peer"], "templating": { "list": [ { diff --git a/docker/telemetry/grafana/dashboards/rpc-performance.json b/docker/telemetry/grafana/dashboards/rpc-performance.json index d21e68b7be..45074c0eac 100644 --- a/docker/telemetry/grafana/dashboards/rpc-performance.json +++ b/docker/telemetry/grafana/dashboards/rpc-performance.json @@ -418,7 +418,7 @@ } ], "schemaVersion": 39, - "tags": ["rippled", "rpc", "telemetry"], + "tags": ["xrpld", "rpc"], "templating": { "list": [ { diff --git a/docker/telemetry/grafana/dashboards/system-ledger-data-sync.json b/docker/telemetry/grafana/dashboards/system-ledger-data-sync.json index bdb62487b9..006155f672 100644 --- a/docker/telemetry/grafana/dashboards/system-ledger-data-sync.json +++ b/docker/telemetry/grafana/dashboards/system-ledger-data-sync.json @@ -493,7 +493,7 @@ } ], "schemaVersion": 39, - "tags": ["rippled", "statsd", "ledger", "sync", "telemetry"], + "tags": ["xrpld", "statsd", "ledger", "sync"], "templating": { "list": [ { diff --git a/docker/telemetry/grafana/dashboards/system-network-traffic.json b/docker/telemetry/grafana/dashboards/system-network-traffic.json index ea4d708c74..9fb061b5a5 100644 --- a/docker/telemetry/grafana/dashboards/system-network-traffic.json +++ b/docker/telemetry/grafana/dashboards/system-network-traffic.json @@ -771,7 +771,7 @@ } ], "schemaVersion": 39, - "tags": ["rippled", "statsd", "network", "telemetry"], + "tags": ["xrpld", "statsd", "network"], "templating": { "list": [ { diff --git a/docker/telemetry/grafana/dashboards/system-node-health.json b/docker/telemetry/grafana/dashboards/system-node-health.json index 29a1785419..8ee79ee498 100644 --- a/docker/telemetry/grafana/dashboards/system-node-health.json +++ b/docker/telemetry/grafana/dashboards/system-node-health.json @@ -2054,7 +2054,7 @@ } ], "schemaVersion": 39, - 
"tags": ["rippled", "statsd", "node-health", "telemetry"], + "tags": ["xrpld", "statsd", "node-health"], "templating": { "list": [ { diff --git a/docker/telemetry/grafana/dashboards/system-overlay-traffic-detail.json b/docker/telemetry/grafana/dashboards/system-overlay-traffic-detail.json index 5009364ddc..496ec56c22 100644 --- a/docker/telemetry/grafana/dashboards/system-overlay-traffic-detail.json +++ b/docker/telemetry/grafana/dashboards/system-overlay-traffic-detail.json @@ -553,7 +553,7 @@ } ], "schemaVersion": 39, - "tags": ["rippled", "statsd", "overlay", "network", "telemetry"], + "tags": ["xrpld", "statsd", "overlay", "network"], "templating": { "list": [ { diff --git a/docker/telemetry/grafana/dashboards/system-rpc-pathfinding.json b/docker/telemetry/grafana/dashboards/system-rpc-pathfinding.json index 73cdeeae9e..de9c26d247 100644 --- a/docker/telemetry/grafana/dashboards/system-rpc-pathfinding.json +++ b/docker/telemetry/grafana/dashboards/system-rpc-pathfinding.json @@ -383,7 +383,7 @@ } ], "schemaVersion": 39, - "tags": ["rippled", "statsd", "rpc", "pathfinding", "telemetry"], + "tags": ["xrpld", "statsd", "rpc", "pathfinding"], "templating": { "list": [ { diff --git a/docker/telemetry/grafana/dashboards/transaction-overview.json b/docker/telemetry/grafana/dashboards/transaction-overview.json index 292efeaede..d55f30393c 100644 --- a/docker/telemetry/grafana/dashboards/transaction-overview.json +++ b/docker/telemetry/grafana/dashboards/transaction-overview.json @@ -509,7 +509,7 @@ } ], "schemaVersion": 39, - "tags": ["rippled", "transactions", "telemetry"], + "tags": ["xrpld", "transactions"], "templating": { "list": [ { diff --git a/docker/telemetry/grafana/dashboards/xrpld-fee-market.json b/docker/telemetry/grafana/dashboards/xrpld-fee-market.json index 1e782bed1b..f4977bba91 100644 --- a/docker/telemetry/grafana/dashboards/xrpld-fee-market.json +++ b/docker/telemetry/grafana/dashboards/xrpld-fee-market.json @@ -306,7 +306,7 @@ } ], "schemaVersion": 
39, - "tags": ["rippled", "fee-market"], + "tags": ["xrpld", "fee-market"], "templating": { "list": [ { diff --git a/docker/telemetry/grafana/dashboards/xrpld-job-queue.json b/docker/telemetry/grafana/dashboards/xrpld-job-queue.json index 6d90b0769d..019f3c208b 100644 --- a/docker/telemetry/grafana/dashboards/xrpld-job-queue.json +++ b/docker/telemetry/grafana/dashboards/xrpld-job-queue.json @@ -446,7 +446,7 @@ } ], "schemaVersion": 39, - "tags": ["rippled", "job-queue"], + "tags": ["xrpld", "job-queue"], "templating": { "list": [ { diff --git a/docker/telemetry/grafana/dashboards/xrpld-peer-quality.json b/docker/telemetry/grafana/dashboards/xrpld-peer-quality.json index af0318e10b..6ffc83bcd9 100644 --- a/docker/telemetry/grafana/dashboards/xrpld-peer-quality.json +++ b/docker/telemetry/grafana/dashboards/xrpld-peer-quality.json @@ -363,7 +363,7 @@ } ], "schemaVersion": 39, - "tags": ["rippled", "peer", "network", "telemetry"], + "tags": ["xrpld", "peer", "network"], "templating": { "list": [ { diff --git a/docker/telemetry/grafana/dashboards/xrpld-rpc-perf-otel.json b/docker/telemetry/grafana/dashboards/xrpld-rpc-perf-otel.json index 2481b61215..b7fc34fde1 100644 --- a/docker/telemetry/grafana/dashboards/xrpld-rpc-perf-otel.json +++ b/docker/telemetry/grafana/dashboards/xrpld-rpc-perf-otel.json @@ -394,7 +394,7 @@ } ], "schemaVersion": 39, - "tags": ["rippled", "rpc"], + "tags": ["xrpld", "rpc"], "templating": { "list": [ { diff --git a/docker/telemetry/grafana/dashboards/xrpld-validator-health.json b/docker/telemetry/grafana/dashboards/xrpld-validator-health.json index bfb84e9e14..ee9d589c9c 100644 --- a/docker/telemetry/grafana/dashboards/xrpld-validator-health.json +++ b/docker/telemetry/grafana/dashboards/xrpld-validator-health.json @@ -819,7 +819,7 @@ } ], "schemaVersion": 39, - "tags": ["rippled", "validator", "health", "telemetry"], + "tags": ["xrpld", "validator", "health"], "templating": { "list": [ { From 4174aef07b0530cdd61313d133cc3d09e242cb79 Mon Sep 
17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Thu, 4 Jun 2026 15:29:45 +0100 Subject: [PATCH 05/16] fix(telemetry): align consensus_mode spanmetrics label with emitted attribute The spanmetrics connector dimension was `xrpl.consensus.mode`, but the code emits the span attribute under the bare key `consensus_mode` (matching every other dimension after the Phase 6 rename). The mismatch left the `xrpl_consensus_mode` Prometheus label empty, so the Consensus Health "Consensus Mode Over Time" panel and the `$consensus_mode` template variable (which filters every panel) matched no live series. - otel-collector-config.yaml: dimension `xrpl.consensus.mode` -> `consensus_mode` - consensus-health.json: 11 label refs `xrpl_consensus_mode` -> `consensus_mode` (the `$consensus_mode` Grafana variable name is unchanged) - telemetry-runbook.md: refresh the stale spanmetrics label table to the bare names actually emitted (command/rpc_status/consensus_mode/local/ proposal_trusted/validation_trusted), fix dotted->bare attribute names in span tables and TraceQL examples (tx_hash, ledger_seq, consensus_round_id, consensus_ledger_id, consensus_round, tx_id event attr), correct the consensus_round_id query to int (not quoted string), and fix the load_type value query ("exception_rpc" -> "exceptioned RPC"). Verified against the live stack: Tempo span tags confirm bare attribute keys (consensus_mode, ledger_seq, tx_hash, ...); the populated xrpl_consensus_mode series in Prometheus is stale retained data from an older build. 
Co-Authored-By: Claude Opus 4.8 --- .../grafana/dashboards/consensus-health.json | 22 +-- docker/telemetry/otel-collector-config.yaml | 2 +- docs/telemetry-runbook.md | 186 +++++++++--------- 3 files changed, 105 insertions(+), 105 deletions(-) diff --git a/docker/telemetry/grafana/dashboards/consensus-health.json b/docker/telemetry/grafana/dashboards/consensus-health.json index 88a7143469..318998718f 100644 --- a/docker/telemetry/grafana/dashboards/consensus-health.json +++ b/docker/telemetry/grafana/dashboards/consensus-health.json @@ -29,14 +29,14 @@ "datasource": { "type": "prometheus" }, - "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", xrpl_consensus_mode=~\"$consensus_mode\", span_name=\"consensus.accept\"}[5m])))", + "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.accept\"}[5m])))", "legendFormat": "P95 Round Duration [{{exported_instance}}]" }, { "datasource": { "type": "prometheus" }, - "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", xrpl_consensus_mode=~\"$consensus_mode\", span_name=\"consensus.accept\"}[5m])))", + "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.accept\"}[5m])))", "legendFormat": "P50 Round Duration [{{exported_instance}}]" } ], @@ -75,7 +75,7 @@ "datasource": { "type": "prometheus" }, - "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_consensus_mode=~\"$consensus_mode\", span_name=\"consensus.proposal.send\"}[5m]))", + "expr": "sum by (exported_instance) 
(rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.proposal.send\"}[5m]))", "legendFormat": "Proposals / Sec [{{exported_instance}}]" } ], @@ -114,7 +114,7 @@ "datasource": { "type": "prometheus" }, - "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", xrpl_consensus_mode=~\"$consensus_mode\", span_name=\"consensus.ledger_close\"}[5m])))", + "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.ledger_close\"}[5m])))", "legendFormat": "P95 Close Duration [{{exported_instance}}]" } ], @@ -153,7 +153,7 @@ "datasource": { "type": "prometheus" }, - "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_consensus_mode=~\"$consensus_mode\", span_name=\"consensus.validation.send\"}[5m]))", + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.validation.send\"}[5m]))", "legendFormat": "Validations / Sec [{{exported_instance}}]" } ], @@ -179,14 +179,14 @@ "datasource": { "type": "prometheus" }, - "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", xrpl_consensus_mode=~\"$consensus_mode\", span_name=\"consensus.accept.apply\"}[5m])))", + "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.accept.apply\"}[5m])))", "legendFormat": "P95 Apply Duration [{{exported_instance}}]" }, { "datasource": { "type": "prometheus" }, - "expr": 
"histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", xrpl_consensus_mode=~\"$consensus_mode\", span_name=\"consensus.accept.apply\"}[5m])))", + "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.accept.apply\"}[5m])))", "legendFormat": "P50 Apply Duration [{{exported_instance}}]" } ], @@ -219,7 +219,7 @@ "datasource": { "type": "prometheus" }, - "expr": "sum by (close_time_correct, exported_instance) (rate(traces_span_metrics_calls_total{span_name=\"consensus.accept.apply\", xrpl_consensus_mode=~\"$consensus_mode\", exported_instance=~\"$node\"}[$__rate_interval]))", + "expr": "sum by (close_time_correct, exported_instance) (rate(traces_span_metrics_calls_total{span_name=\"consensus.accept.apply\", consensus_mode=~\"$consensus_mode\", exported_instance=~\"$node\"}[$__rate_interval]))", "legendFormat": "Close Time Correct={{close_time_correct}} [{{exported_instance}}]" } ], @@ -258,8 +258,8 @@ "datasource": { "type": "prometheus" }, - "expr": "sum by (xrpl_consensus_mode, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_consensus_mode=~\"$consensus_mode\", span_name=\"consensus.ledger_close\"}[5m]))", - "legendFormat": "{{xrpl_consensus_mode}} [{{exported_instance}}]" + "expr": "sum by (consensus_mode, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.ledger_close\"}[5m]))", + "legendFormat": "{{consensus_mode}} [{{exported_instance}}]" } ], "fieldConfig": { @@ -773,7 +773,7 @@ "label": "Consensus Mode", "description": "Filter by consensus mode (Proposing, Observing, Wrong Ledger, Switched Ledger)", "type": "query", - "query": 
"label_values(traces_span_metrics_calls_total{span_name=\"consensus.ledger_close\"}, xrpl_consensus_mode)", + "query": "label_values(traces_span_metrics_calls_total{span_name=\"consensus.ledger_close\"}, consensus_mode)", "datasource": { "type": "prometheus", "uid": "prometheus" diff --git a/docker/telemetry/otel-collector-config.yaml b/docker/telemetry/otel-collector-config.yaml index fd539f5462..60b01388db 100644 --- a/docker/telemetry/otel-collector-config.yaml +++ b/docker/telemetry/otel-collector-config.yaml @@ -51,7 +51,7 @@ connectors: dimensions: - name: command - name: rpc_status - - name: xrpl.consensus.mode + - name: consensus_mode - name: close_time_correct - name: local - name: suppressed diff --git a/docs/telemetry-runbook.md b/docs/telemetry-runbook.md index db3eff7bad..b7fc0605b3 100644 --- a/docs/telemetry-runbook.md +++ b/docs/telemetry-runbook.md @@ -74,47 +74,47 @@ All spans instrumented in xrpld, grouped by subsystem: ### Transaction Spans (Phase 3) -| Span Name | Source File | Attributes | Description | -| ------------ | --------------- | -------------------------------------------------------------------------------------- | ------------------------------------- | -| `tx.process` | NetworkOPs.cpp | `xrpl.tx.hash`, `local`, `path`, `tx_type`, `fee`, `sequence`, `ter_result`, `applied` | Transaction submission and processing | -| `tx.receive` | PeerImp.cpp | `xrpl.peer.id`, `xrpl.tx.hash`, `tx_type`, `peer_version`, `suppressed`, `tx_status` | Transaction received from peer relay | -| `tx.apply` | BuildLedger.cpp | `xrpl.ledger.seq`, `tx_count`, `tx_failed` | Transaction set applied per ledger | +| Span Name | Source File | Attributes | Description | +| ------------ | --------------- | --------------------------------------------------------------------------------- | ------------------------------------- | +| `tx.process` | NetworkOPs.cpp | `tx_hash`, `local`, `path`, `tx_type`, `fee`, `sequence`, `ter_result`, `applied` | Transaction 
submission and processing | +| `tx.receive` | PeerImp.cpp | `peer_id`, `tx_hash`, `tx_type`, `peer_version`, `suppressed`, `tx_status` | Transaction received from peer relay | +| `tx.apply` | BuildLedger.cpp | `ledger_seq`, `tx_count`, `tx_failed` | Transaction set applied per ledger | ### Transaction Queue Spans (Phase 3) -| Span Name | Source File | Attributes | Description | -| ------------------ | ----------- | ------------------------------------------------------------- | -------------------------------------------------- | -| `txq.enqueue` | TxQ.cpp | `xrpl.tx.hash`, `tx_type` | Transaction enqueue decision (child of tx.process) | -| `txq.apply_direct` | TxQ.cpp | -- | Direct apply attempt (bypassing queue) | -| `txq.batch_clear` | TxQ.cpp | -- | Batch clear of queued transactions for an account | -| `txq.accept` | TxQ.cpp | `queue_size`, `ledger_changed` | Ledger-close accept loop over queued transactions | -| `txq.accept_tx` | TxQ.cpp | `xrpl.tx.hash`, `retries_remaining`, `ter_code`, `txq_status` | Per-transaction apply during accept | -| `txq.cleanup` | TxQ.cpp | `xrpl.ledger.seq` | Post-close cleanup of expired queue entries | +| Span Name | Source File | Attributes | Description | +| ------------------ | ----------- | -------------------------------------------------------- | -------------------------------------------------- | +| `txq.enqueue` | TxQ.cpp | `tx_hash`, `tx_type` | Transaction enqueue decision (child of tx.process) | +| `txq.apply_direct` | TxQ.cpp | -- | Direct apply attempt (bypassing queue) | +| `txq.batch_clear` | TxQ.cpp | -- | Batch clear of queued transactions for an account | +| `txq.accept` | TxQ.cpp | `queue_size`, `ledger_changed` | Ledger-close accept loop over queued transactions | +| `txq.accept_tx` | TxQ.cpp | `tx_hash`, `retries_remaining`, `ter_code`, `txq_status` | Per-transaction apply during accept | +| `txq.cleanup` | TxQ.cpp | `ledger_seq` | Post-close cleanup of expired queue entries | ### Consensus Spans (Phase 4) 
-| Span Name | Source File | Attributes | Description | -| ------------------------------ | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------- | -| `consensus.round` | RCLConsensus.cpp | `xrpl.consensus.ledger_id`, `xrpl.ledger.seq`, `xrpl.consensus.mode`, `trace_strategy`, `xrpl.consensus.round_id` | Root span for a consensus round (deterministic or random trace ID) | -| `consensus.phase.open` | Consensus.h | -- | Open phase duration (child of round) | -| `consensus.proposal.send` | RCLConsensus.cpp | `xrpl.consensus.round`, `is_bow_out` | Consensus proposal broadcast | -| `consensus.ledger_close` | RCLConsensus.cpp | `xrpl.ledger.seq`, `xrpl.consensus.mode` | Ledger close event | -| `consensus.establish` | Consensus.h | `converge_percent`, `establish_count`, `proposers` | Establish phase duration (child of round) | -| `consensus.update_positions` | Consensus.h | `converge_percent`, `proposers`, `disputes_count` | Position update and dispute resolution (see Events below) | -| `consensus.check` | Consensus.h | `agree_count`, `disagree_count`, `converge_percent`, `have_close_time_consensus`, `threshold_percent`, `consensus_result` | Consensus threshold check | -| `consensus.accept` | RCLConsensus.cpp | `proposers`, `round_time_ms`, `quorum`, `disputes_count`, `consensus_state` | Ledger accepted by consensus | -| `consensus.accept.apply` | RCLConsensus.cpp | `xrpl.ledger.seq`, `close_time`, `close_time_correct`, `close_resolution_ms`, `consensus_state`, `proposing`, `round_time_ms`, `parent_close_time`, `close_time_self`, `close_time_vote_bins`, `resolution_direction`, `tx_count` | Ledger application with close time details (see Events below) 
| -| `consensus.validation.send` | RCLConsensus.cpp | `xrpl.ledger.seq`, `proposing`, `ledger_hash`, `full_validation`, `validation_sign_time` | Validation sent after accept (follows-from link) | -| `consensus.mode_change` | RCLConsensus.cpp | `mode_old`, `mode_new` | Consensus mode transition | -| `consensus.proposal.receive` | PeerImp.cpp | `trusted`, `xrpl.consensus.round` | Proposal received from peer (extracts parent context from TraceContext when present; falls back to standalone span for older peers) | -| `consensus.validation.receive` | PeerImp.cpp | `trusted`, `xrpl.ledger.seq` | Validation received from peer (extracts parent context from TraceContext when present; falls back to standalone span for older peers) | +| Span Name | Source File | Attributes | Description | +| ------------------------------ | ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------- | +| `consensus.round` | RCLConsensus.cpp | `consensus_ledger_id`, `ledger_seq`, `consensus_mode`, `trace_strategy`, `consensus_round_id` | Root span for a consensus round (deterministic or random trace ID) | +| `consensus.phase.open` | Consensus.h | -- | Open phase duration (child of round) | +| `consensus.proposal.send` | RCLConsensus.cpp | `consensus_round`, `is_bow_out` | Consensus proposal broadcast | +| `consensus.ledger_close` | RCLConsensus.cpp | `ledger_seq`, `consensus_mode` | Ledger close event | +| `consensus.establish` | Consensus.h | `converge_percent`, `establish_count`, `proposers` | Establish phase duration (child of round) | +| `consensus.update_positions` | Consensus.h | `converge_percent`, `proposers`, `disputes_count` | Position update and dispute resolution (see 
Events below) | +| `consensus.check` | Consensus.h | `agree_count`, `disagree_count`, `converge_percent`, `have_close_time_consensus`, `threshold_percent`, `consensus_result` | Consensus threshold check | +| `consensus.accept` | RCLConsensus.cpp | `proposers`, `round_time_ms`, `quorum`, `disputes_count`, `consensus_state` | Ledger accepted by consensus | +| `consensus.accept.apply` | RCLConsensus.cpp | `ledger_seq`, `close_time`, `close_time_correct`, `close_resolution_ms`, `consensus_state`, `proposing`, `round_time_ms`, `parent_close_time`, `close_time_self`, `close_time_vote_bins`, `resolution_direction`, `tx_count` | Ledger application with close time details (see Events below) | +| `consensus.validation.send` | RCLConsensus.cpp | `ledger_seq`, `proposing`, `ledger_hash`, `full_validation`, `validation_sign_time` | Validation sent after accept (follows-from link) | +| `consensus.mode_change` | RCLConsensus.cpp | `mode_old`, `mode_new` | Consensus mode transition | +| `consensus.proposal.receive` | PeerImp.cpp | `trusted`, `consensus_round` | Proposal received from peer (extracts parent context from TraceContext when present; falls back to standalone span for older peers) | +| `consensus.validation.receive` | PeerImp.cpp | `trusted`, `ledger_seq` | Validation received from peer (extracts parent context from TraceContext when present; falls back to standalone span for older peers) | #### Consensus Span Events -| Parent Span | Event Name | Event Attributes | Description | -| ---------------------------- | ----------------- | ---------------------------------------------------------------- | ------------------------------------------------------- | -| `consensus.update_positions` | `dispute.resolve` | `xrpl.tx.id`, `dispute_our_vote`, `dispute_yays`, `dispute_nays` | Emitted per dispute when votes are tallied | -| `consensus.accept.apply` | `tx.included` | `xrpl.tx.id` | Emitted per transaction included in the accepted ledger | +| Parent Span | Event Name | Event 
Attributes | Description | +| ---------------------------- | ----------------- | ----------------------------------------------------------- | ------------------------------------------------------- | +| `consensus.update_positions` | `dispute.resolve` | `tx_id`, `dispute_our_vote`, `dispute_yays`, `dispute_nays` | Emitted per dispute when votes are tallied | +| `consensus.accept.apply` | `tx.included` | `tx_id` | Emitted per transaction included in the accepted ledger | #### Close Time Queries (Tempo TraceQL) @@ -129,10 +129,10 @@ All spans instrumented in xrpld, grouped by subsystem: {name="consensus.accept.apply"} | duration > 5s # Find specific ledger's consensus details -{name="consensus.accept.apply"} | xrpl.ledger.seq = 92345678 +{name="consensus.accept.apply"} | ledger_seq = 92345678 # Find all spans in a consensus round (deterministic trace strategy) -{name="consensus.round"} | xrpl.consensus.round_id = "" +{name="consensus.round"} | consensus_round_id = <round_id> # Find dispute resolutions {name="consensus.update_positions"} >> {event:name="dispute.resolve"} @@ -140,18 +140,18 @@ All spans instrumented in xrpld, grouped by subsystem: ### Ledger Spans (Phase 6) -| Span Name | Source File | Attributes | Description | -| ----------------- | -------------------- | ------------------------------------------ | ----------------------------- | -| `ledger.build` | BuildLedger.cpp:31 | `xrpl.ledger.seq`, `tx_count`, `tx_failed` | Ledger build during consensus | -| `ledger.validate` | LedgerMaster.cpp:915 | `xrpl.ledger.seq`, `validations` | Ledger promoted to validated | -| `ledger.store` | LedgerMaster.cpp:409 | `xrpl.ledger.seq` | Ledger stored in history | +| Span Name | Source File | Attributes | Description | +| ----------------- | -------------------- | ------------------------------------- | ----------------------------- | +| `ledger.build` | BuildLedger.cpp:31 | `ledger_seq`, `tx_count`, `tx_failed` | Ledger build during consensus | +| `ledger.validate` | 
LedgerMaster.cpp:915 | `ledger_seq`, `validations` | Ledger promoted to validated | +| `ledger.store` | LedgerMaster.cpp:409 | `ledger_seq` | Ledger stored in history | ### Peer Spans (Phase 6) -| Span Name | Source File | Attributes | Description | -| ------------------------- | ---------------- | ------------------------------------ | ----------------------------- | -| `peer.proposal.receive` | PeerImp.cpp:1667 | `xrpl.peer.id`, `proposal_trusted` | Proposal received from peer | -| `peer.validation.receive` | PeerImp.cpp:2264 | `xrpl.peer.id`, `validation_trusted` | Validation received from peer | +| Span Name | Source File | Attributes | Description | +| ------------------------- | ---------------- | ------------------------------- | ----------------------------- | +| `peer.proposal.receive` | PeerImp.cpp:1667 | `peer_id`, `proposal_trusted` | Proposal received from peer | +| `peer.validation.receive` | PeerImp.cpp:2264 | `peer_id`, `validation_trusted` | Validation received from peer | --- @@ -209,7 +209,7 @@ This section shows what questions you can answer using the span attributes, with {name="rpc.http_request"} | request_payload_size > 100000 # Find resource-heavy RPC commands (by load_type) -{name=~"rpc.command.*"} | load_type = "exception_rpc" +{name=~"rpc.command.*"} | load_type = "exceptioned RPC" # Find a specific WebSocket command {name="rpc.ws_message"} | command = "subscribe" @@ -355,10 +355,10 @@ all its normal attributes, it just lacks a cross-node parent link. 
{name="consensus.proposal.receive"} && nestedSetParent > 0 # Trace a transaction across the network by its hash -{name=~"tx\\..*"} | xrpl.tx.hash = "" +{name=~"tx\\..*"} | tx_hash = "<tx_hash>" # Find all spans in a cross-node consensus trace -{rootServiceName="xrpld"} | xrpl.consensus.round_id = "" +{rootServiceName="xrpld"} | consensus_round_id = <round_id> # Compare latency between sender and receiver for validations {name="consensus.validation.send" || name="consensus.validation.receive"} @@ -388,16 +388,16 @@ Every metric carries these standard labels: | `service_name` | Resource attribute | `xrpld` | | `span_kind` | Span kind | `SPAN_KIND_INTERNAL` | -Additionally, span attributes configured as dimensions in the collector become metric labels (dots → underscores): +Additionally, span attributes configured as dimensions in the collector become metric labels. The collector dimensions use the bare attribute keys emitted by the code, so the label name equals the attribute name: -| Span Attribute | Metric Label | Applies To | -| --------------------- | ------------------------------ | ------------------------------- | -| `command` | `xrpl_rpc_command` | `rpc.command.*` spans | -| `rpc_status` | `xrpl_rpc_status` | `rpc.command.*` spans | -| `xrpl.consensus.mode` | `xrpl_consensus_mode` | `consensus.ledger_close` spans | -| `local` | `xrpl_tx_local` | `tx.process` spans | -| `proposal_trusted` | `xrpl_peer_proposal_trusted` | `peer.proposal.receive` spans | -| `validation_trusted` | `xrpl_peer_validation_trusted` | `peer.validation.receive` spans | +| Span Attribute | Metric Label | Applies To | +| -------------------- | -------------------- | ------------------------------- | +| `command` | `command` | `rpc.command.*` spans | +| `rpc_status` | `rpc_status` | `rpc.command.*` spans | +| `consensus_mode` | `consensus_mode` | `consensus.ledger_close` spans | +| `local` | `local` | `tx.process` spans | +| `proposal_trusted` | `proposal_trusted` | `peer.proposal.receive` spans | +| 
`validation_trusted` | `validation_trusted` | `peer.validation.receive` spans | ### Histogram Buckets @@ -467,44 +467,44 @@ Ten dashboards are pre-provisioned in `docker/telemetry/grafana/dashboards/`: ### RPC Performance (`xrpld-rpc-perf`) -| Panel | Type | PromQL | Labels Used | -| --------------------------- | ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------- | -| RPC Request Rate by Command | timeseries | `sum by (xrpl_rpc_command) (rate(traces_span_metrics_calls_total{span_name=~"rpc.command.*"}[5m]))` | `xrpl_rpc_command` | -| RPC Latency p95 by Command | timeseries | `histogram_quantile(0.95, sum by (le, xrpl_rpc_command) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=~"rpc.command.*"}[5m])))` | `xrpl_rpc_command` | -| RPC Error Rate | bargauge | Error spans / total spans × 100, grouped by `xrpl_rpc_command` | `xrpl_rpc_command`, `status_code` | -| RPC Latency Heatmap | heatmap | `sum(increase(traces_span_metrics_duration_milliseconds_bucket{span_name=~"rpc.command.*"}[5m])) by (le)` | `le` (bucket boundaries) | -| Overall RPC Throughput | timeseries | `rpc.request` + `rpc.process` rate | — | -| RPC Success vs Error | timeseries | by `status_code` (UNSET vs ERROR) | `status_code` | -| Top Commands by Volume | bargauge | `topk(10, ...)` by `xrpl_rpc_command` | `xrpl_rpc_command` | -| WebSocket Message Rate | stat | `rpc.ws_message` rate | — | +| Panel | Type | PromQL | Labels Used | +| --------------------------- | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------- | ------------------------ | +| RPC Request Rate by Command | timeseries | `sum by (command) (rate(traces_span_metrics_calls_total{span_name=~"rpc.command.*"}[5m]))` | `command` | +| RPC Latency p95 by Command | timeseries | 
`histogram_quantile(0.95, sum by (le, command) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=~"rpc.command.*"}[5m])))` | `command` | +| RPC Error Rate | bargauge | Error spans / total spans × 100, grouped by `command` | `command`, `status_code` | +| RPC Latency Heatmap | heatmap | `sum(increase(traces_span_metrics_duration_milliseconds_bucket{span_name=~"rpc.command.*"}[5m])) by (le)` | `le` (bucket boundaries) | +| Overall RPC Throughput | timeseries | `rpc.request` + `rpc.process` rate | — | +| RPC Success vs Error | timeseries | by `status_code` (UNSET vs ERROR) | `status_code` | +| Top Commands by Volume | bargauge | `topk(10, ...)` by `command` | `command` | +| WebSocket Message Rate | stat | `rpc.ws_message` rate | — | ### Transaction Overview (`xrpld-transactions`) -| Panel | Type | PromQL | Labels Used | -| --------------------------------- | ---------- | -------------------------------------------------------------------------------------------- | --------------- | -| Transaction Processing Rate | timeseries | `rate(traces_span_metrics_calls_total{span_name="tx.process"}[5m])` and `tx.receive` | `span_name` | -| Transaction Processing Latency | timeseries | `histogram_quantile(0.95 / 0.50, ... 
{span_name="tx.process"})` | — | -| Transaction Path Distribution | piechart | `sum by (xrpl_tx_local) (rate(traces_span_metrics_calls_total{span_name="tx.process"}[5m]))` | `xrpl_tx_local` | -| Transaction Receive vs Suppressed | timeseries | `rate(traces_span_metrics_calls_total{span_name="tx.receive"}[5m])` | — | -| TX Processing Duration Heatmap | heatmap | `tx.process` histogram buckets | `le` | -| TX Apply Duration per Ledger | timeseries | p95/p50 of `tx.apply` | — | -| Peer TX Receive Rate | timeseries | `tx.receive` rate | — | -| TX Apply Failed Rate | stat | `tx.apply` with `STATUS_CODE_ERROR` | `status_code` | +| Panel | Type | PromQL | Labels Used | +| --------------------------------- | ---------- | ------------------------------------------------------------------------------------ | ------------- | +| Transaction Processing Rate | timeseries | `rate(traces_span_metrics_calls_total{span_name="tx.process"}[5m])` and `tx.receive` | `span_name` | +| Transaction Processing Latency | timeseries | `histogram_quantile(0.95 / 0.50, ... {span_name="tx.process"})` | — | +| Transaction Path Distribution | piechart | `sum by (local) (rate(traces_span_metrics_calls_total{span_name="tx.process"}[5m]))` | `local` | +| Transaction Receive vs Suppressed | timeseries | `rate(traces_span_metrics_calls_total{span_name="tx.receive"}[5m])` | — | +| TX Processing Duration Heatmap | heatmap | `tx.process` histogram buckets | `le` | +| TX Apply Duration per Ledger | timeseries | p95/p50 of `tx.apply` | — | +| Peer TX Receive Rate | timeseries | `tx.receive` rate | — | +| TX Apply Failed Rate | stat | `tx.apply` with `STATUS_CODE_ERROR` | `status_code` | ### Consensus Health (`xrpld-consensus`) -| Panel | Type | PromQL | Labels Used | -| ----------------------------- | ---------- | ---------------------------------------------------------------------------------- | --------------------- | -| Consensus Round Duration | timeseries | `histogram_quantile(0.95 / 0.50, ... 
{span_name="consensus.accept"})` | — | -| Consensus Proposals Sent Rate | timeseries | `rate(traces_span_metrics_calls_total{span_name="consensus.proposal.send"}[5m])` | — | -| Ledger Close Duration | timeseries | `histogram_quantile(0.95, ... {span_name="consensus.ledger_close"})` | — | -| Validation Send Rate | stat | `rate(traces_span_metrics_calls_total{span_name="consensus.validation.send"}[5m])` | — | -| Ledger Apply Duration | timeseries | `histogram_quantile(0.95 / 0.50, ... {span_name="consensus.accept.apply"})` | — | -| Close Time Agreement | timeseries | `rate(traces_span_metrics_calls_total{span_name="consensus.accept.apply"}[5m])` | — | -| Consensus Mode Over Time | timeseries | `consensus.ledger_close` by `xrpl_consensus_mode` | `xrpl_consensus_mode` | -| Accept vs Close Rate | timeseries | `consensus.accept` vs `consensus.ledger_close` rate | — | -| Validation vs Close Rate | timeseries | `consensus.validation.send` vs `consensus.ledger_close` | — | -| Accept Duration Heatmap | heatmap | `consensus.accept` histogram buckets | `le` | +| Panel | Type | PromQL | Labels Used | +| ----------------------------- | ---------- | ---------------------------------------------------------------------------------- | ---------------- | +| Consensus Round Duration | timeseries | `histogram_quantile(0.95 / 0.50, ... {span_name="consensus.accept"})` | — | +| Consensus Proposals Sent Rate | timeseries | `rate(traces_span_metrics_calls_total{span_name="consensus.proposal.send"}[5m])` | — | +| Ledger Close Duration | timeseries | `histogram_quantile(0.95, ... {span_name="consensus.ledger_close"})` | — | +| Validation Send Rate | stat | `rate(traces_span_metrics_calls_total{span_name="consensus.validation.send"}[5m])` | — | +| Ledger Apply Duration | timeseries | `histogram_quantile(0.95 / 0.50, ... 
{span_name="consensus.accept.apply"})` | — | +| Close Time Agreement | timeseries | `rate(traces_span_metrics_calls_total{span_name="consensus.accept.apply"}[5m])` | — | +| Consensus Mode Over Time | timeseries | `consensus.ledger_close` by `consensus_mode` | `consensus_mode` | +| Accept vs Close Rate | timeseries | `consensus.accept` vs `consensus.ledger_close` rate | — | +| Validation vs Close Rate | timeseries | `consensus.validation.send` vs `consensus.ledger_close` | — | +| Accept Duration Heatmap | heatmap | `consensus.accept` histogram buckets | `le` | ### Ledger Operations (`xrpld-ledger-ops`) @@ -523,12 +523,12 @@ Ten dashboards are pre-provisioned in `docker/telemetry/grafana/dashboards/`: Requires `trace_peer=1` in the `[telemetry]` config section. -| Panel | Type | PromQL | Labels Used | -| -------------------------------- | ---------- | --------------------------------- | ------------------------------ | -| Proposal Receive Rate | timeseries | `peer.proposal.receive` rate | — | -| Validation Receive Rate | timeseries | `peer.validation.receive` rate | — | -| Proposals Trusted vs Untrusted | piechart | by `xrpl_peer_proposal_trusted` | `xrpl_peer_proposal_trusted` | -| Validations Trusted vs Untrusted | piechart | by `xrpl_peer_validation_trusted` | `xrpl_peer_validation_trusted` | +| Panel | Type | PromQL | Labels Used | +| -------------------------------- | ---------- | ------------------------------ | -------------------- | +| Proposal Receive Rate | timeseries | `peer.proposal.receive` rate | — | +| Validation Receive Rate | timeseries | `peer.validation.receive` rate | — | +| Proposals Trusted vs Untrusted | piechart | by `proposal_trusted` | `proposal_trusted` | +| Validations Trusted vs Untrusted | piechart | by `validation_trusted` | `validation_trusted` | ### Node Health -- StatsD (`xrpld-statsd-node-health`) From 63c6f3b8df98ffa13e0125642ea3a9ca078841b5 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde 
<3397372+pratikmankawde@users.noreply.github.com> Date: Thu, 4 Jun 2026 15:37:29 +0100 Subject: [PATCH 06/16] feat(telemetry): surface consensus + TxQ lifecycle spans in dashboards MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The consensus state-machine and TxQ lifecycle spans are emitted by the code and present in Prometheus, but no panel visualised them. Add panels keyed on those span_names (verified live) plus the low-cardinality dimensions needed to break them down. Consensus Health (consensus-health.json) — new rows: - Consensus Round Duration (full round, p95/p50, mode-filterable) - Consensus Phase Duration (open vs establish breakdown) - Position Update Duration (update_positions p95/p50) - Consensus Stall Rate (consensus.check by consensus_stalled) - Consensus Mode-Change Rate by Target Mode (mode_change by mode_new) Transaction Overview (transaction-overview.json) — new rows: - TxQ Enqueue Rate by Transaction Type (txq.enqueue by tx_type) - Queue Bypass Ratio (txq.apply_direct vs txq.enqueue) - Queue Accept (Drain) Duration per Ledger (txq.accept p95/p50) - Queue Cleanup Rate (txq.cleanup expired entries) otel-collector-config.yaml — add spanmetrics dimensions for the lifecycle breakdowns: mode_new, consensus_stalled, consensus_phase, consensus_result (all bounded value sets, safe as Prometheus labels). All new panels follow the existing dashboard template: $node filter, exported_instance in every legend, Title Case, axis labels, row layout. 
Co-Authored-By: Claude Opus 4.8 --- .../grafana/dashboards/consensus-health.json | 223 ++++++++++++++++++ .../dashboards/transaction-overview.json | 163 +++++++++++++ docker/telemetry/otel-collector-config.yaml | 5 + 3 files changed, 391 insertions(+) diff --git a/docker/telemetry/grafana/dashboards/consensus-health.json b/docker/telemetry/grafana/dashboards/consensus-health.json index 318998718f..1b75ce86d0 100644 --- a/docker/telemetry/grafana/dashboards/consensus-health.json +++ b/docker/telemetry/grafana/dashboards/consensus-health.json @@ -742,6 +742,229 @@ }, "overrides": [] } + }, + { + "title": "Consensus Round Duration (Full Round)", + "description": "p95/p50 duration of the full consensus round. The consensus.round span (RCLConsensus.cpp startRound) wraps an entire round end-to-end. Filterable by consensus mode. This is the single most important consensus-health signal; rising round time precedes ledger-age alarms.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 72 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.round\"}[5m])))", + "legendFormat": "P95 Round [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.round\"}[5m])))", + "legendFormat": "P50 Round [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + 
"overrides": [] + } + }, + { + "title": "Consensus Phase Duration (Open vs Establish)", + "description": "p95 duration of the open phase (transaction collection) vs the establish phase (proposal convergence). The consensus.phase.open and consensus.establish spans decompose round latency, so an operator can tell whether slowness is in collecting transactions or reaching agreement.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 72 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.phase.open\"}[5m])))", + "legendFormat": "P95 Open Phase [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.establish\"}[5m])))", + "legendFormat": "P95 Establish Phase [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Position Update Duration", + "description": "p95/p50 duration of the consensus.update_positions span, which tallies disputes and updates this node's position each round. 
Long durations indicate heavy dispute resolution or slow convergence on close time.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 80 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.update_positions\"}[5m])))", + "legendFormat": "P95 Update [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.5, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.update_positions\"}[5m])))", + "legendFormat": "P50 Update [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Consensus Stall Rate", + "description": "Rate of consensus.check spans reporting consensus_stalled=true, broken down by stall flag. A non-zero stalled rate surfaces stall conditions before they manifest as validated-ledger-age alarms. 
Requires the consensus_stalled spanmetrics dimension.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 80 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.check\", consensus_stalled=\"true\"}[5m]))", + "legendFormat": "Stalled [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.check\", consensus_stalled=\"false\"}[5m]))", + "legendFormat": "Not Stalled [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Checks / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Consensus Mode-Change Rate by Target Mode", + "description": "Rate of consensus.mode_change spans broken down by the mode the node switched INTO (mode_new). Frequent switches into Wrong Ledger or Switched Ledger indicate an unstable node at fork risk. 
Requires the mode_new spanmetrics dimension.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 88 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (mode_new, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.mode_change\"}[5m]))", + "legendFormat": "{{mode_new}} [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Mode Changes / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } } ], "schemaVersion": 39, diff --git a/docker/telemetry/grafana/dashboards/transaction-overview.json b/docker/telemetry/grafana/dashboards/transaction-overview.json index 8b11816959..8f08c6587c 100644 --- a/docker/telemetry/grafana/dashboards/transaction-overview.json +++ b/docker/telemetry/grafana/dashboards/transaction-overview.json @@ -506,6 +506,169 @@ }, "overrides": [] } + }, + { + "title": "TxQ Enqueue Rate by Transaction Type", + "description": "Rate of txq.enqueue spans broken down by transaction type (tx_type). Shows what share of inbound demand is Payment vs OfferCreate vs other transactors, and how the mix shifts as the queue fills. 
A spam burst of one type is a leading indicator of fee escalation.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 48 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (tx_type, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"txq.enqueue\"}[5m]))", + "legendFormat": "{{tx_type}} [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Enqueues / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Queue Bypass Ratio (Direct Apply vs Enqueue)", + "description": "Ratio of transactions that applied directly to the open ledger (txq.apply_direct) versus those that had to be queued (txq.enqueue). A falling bypass ratio is the cleanest single signal the network has entered sustained fee escalation.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 48 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"txq.apply_direct\"}[5m])) / (sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"txq.apply_direct\"}[5m])) + sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"txq.enqueue\"}[5m])) > 0)", + "legendFormat": "Direct-Apply Fraction [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "custom": { + "axisLabel": "Bypass Fraction", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } 
+ }, + { + "title": "Queue Accept (Drain) Duration per Ledger", + "description": "p95/p50 duration of the txq.accept span, which drains queued transactions into a newly closed ledger. Rising drain time signals queue pressure at ledger close.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 56 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"txq.accept\"}[5m])))", + "legendFormat": "P95 Drain [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"txq.accept\"}[5m])))", + "legendFormat": "P50 Drain [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Queue Cleanup Rate (Expired Entries)", + "description": "Rate of txq.cleanup spans, which remove expired transactions from the queue each ledger. 
A rising rate means submitters under-bid the escalating fee and abandoned their transactions \u2014 a demand-frustration signal distinct from acceptance throughput.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 56 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"txq.cleanup\"}[5m]))", + "legendFormat": "Cleanups / Sec [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Cleanups / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } } ], "schemaVersion": 39, diff --git a/docker/telemetry/otel-collector-config.yaml b/docker/telemetry/otel-collector-config.yaml index 60b01388db..36112253b8 100644 --- a/docker/telemetry/otel-collector-config.yaml +++ b/docker/telemetry/otel-collector-config.yaml @@ -63,6 +63,11 @@ connectors: - name: consensus_state - name: load_type - name: is_batch + # Consensus lifecycle dimensions (low cardinality, bounded value sets). + - name: mode_new + - name: consensus_stalled + - name: consensus_phase + - name: consensus_result exporters: debug: From 000ad1d1f50d3643eea96f8a9a0c9e06ec1856ec Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Thu, 4 Jun 2026 15:40:07 +0100 Subject: [PATCH 07/16] feat(telemetry): add gRPC and pathfinding span panels (RPC dashboard) The grpc.{Method} spans (GRPCServer.cpp) and pathfind.* spans (PathRequest.cpp) are emitted but had no dashboard coverage. The existing RPC & Pathfinding dashboard only plotted StatsD timers. 
Add span-derived rows: - gRPC Request Rate by Method (grpc.* by method) - gRPC Latency P95 by Method - gRPC Error Rate by Status (by grpc_status) - Pathfinding Compute Duration (pathfind.compute p95/p50) - Pathfinding Request & Discovery Rate (pathfind.request / pathfind.discover) otel-collector-config.yaml: add method, grpc_role, grpc_status spanmetrics dimensions (bounded value sets). Add a $grpc_method template variable so the gRPC panels can be filtered by method, consistent with the dashboard filter conventions. Note: these spans populate only when the node serves gRPC / pathfinding traffic; they are correct but not exercised by the current health-check workload (they will be covered by the Phase 10 workload generator). Co-Authored-By: Claude Opus 4.8 --- .../dashboards/system-rpc-pathfinding.json | 229 ++++++++++++++++++ docker/telemetry/otel-collector-config.yaml | 4 + 2 files changed, 233 insertions(+) diff --git a/docker/telemetry/grafana/dashboards/system-rpc-pathfinding.json b/docker/telemetry/grafana/dashboards/system-rpc-pathfinding.json index 36ec7b3dd0..d2e001e1cc 100644 --- a/docker/telemetry/grafana/dashboards/system-rpc-pathfinding.json +++ b/docker/telemetry/grafana/dashboards/system-rpc-pathfinding.json @@ -380,6 +380,215 @@ }, "overrides": [] } + }, + { + "title": "gRPC Request Rate by Method (Spans)", + "description": "Per-method gRPC call rate derived from the grpc.{Method} spans (GRPCServer.cpp). Covers the gRPC API used by reporting/Clio. 
Populated only when the node serves gRPC traffic.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 32 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (method, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", method=~\"$grpc_method\", span_name=~\"grpc\\\\..*\"}[5m]))", + "legendFormat": "{{method}} [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Calls / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "gRPC Latency P95 by Method (Spans)", + "description": "p95 latency per gRPC method from grpc.{Method} span durations. Identifies slow gRPC read paths.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, method, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", method=~\"$grpc_method\", span_name=~\"grpc\\\\..*\"}[5m])))", + "legendFormat": "{{method}} [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "gRPC Error Rate by Status (Spans)", + "description": "Rate of gRPC spans broken down by grpc_status (success/error/resource_exhausted/failed_precondition). 
A rising error or resource_exhausted rate indicates gRPC clients hitting limits.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 40 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (grpc_status, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=~\"grpc\\\\..*\", grpc_status!=\"\"}[5m]))", + "legendFormat": "{{grpc_status}} [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Calls / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Pathfinding Compute Duration (Spans)", + "description": "p95/p50 of the pathfind.compute span, the per-request path computation. Complements the StatsD pathfind_fast/full timers with span-level visibility. Populated under pathfinding (book/path) RPC load.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 40 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"pathfind.compute\"}[5m])))", + "legendFormat": "P95 Compute [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"pathfind.compute\"}[5m])))", + "legendFormat": "P50 Compute [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + 
"pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Pathfinding Request & Discovery Rate (Spans)", + "description": "Rate of pathfind.request (client path requests) and pathfind.discover (path-discovery passes) spans. Shows pathfinding demand and the discovery cost driver for subscription-heavy nodes.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 48 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"pathfind.request\"}[5m]))", + "legendFormat": "Requests / Sec [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"pathfind.discover\"}[5m]))", + "legendFormat": "Discoveries / Sec [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Operations / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } } ], "schemaVersion": 39, @@ -405,6 +614,26 @@ "multi": true, "refresh": 2, "sort": 1 + }, + { + "name": "grpc_method", + "label": "gRPC Method", + "description": "Filter by gRPC method (GetLedger, GetLedgerData, GetLedgerDiff, GetLedgerEntry)", + "type": "query", + "query": "label_values(traces_span_metrics_calls_total{span_name=~\"grpc\\\\..*\"}, method)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "includeAll": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "multi": true, + "refresh": 2, + "sort": 1 } ] }, diff --git a/docker/telemetry/otel-collector-config.yaml b/docker/telemetry/otel-collector-config.yaml index bf9b82b402..e83ed6510e 100644 --- 
a/docker/telemetry/otel-collector-config.yaml +++ b/docker/telemetry/otel-collector-config.yaml @@ -50,6 +50,10 @@ connectors: - name: consensus_stalled - name: consensus_phase - name: consensus_result + # gRPC surface dimensions (bounded: method names, role, status). + - name: method + - name: grpc_role + - name: grpc_status exporters: debug: From b286335ccf488e49161e1d6b65245eb1bec43856 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Thu, 4 Jun 2026 15:44:07 +0100 Subject: [PATCH 08/16] feat(telemetry): add load-factor attribution and 7-day agreement panels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both metrics are already emitted and live in Prometheus but were not fully visualised. - Fee Market (xrpld-fee-market.json): "Load Factor Attribution (Stacked Components)" — stacks load_factor_fee_escalation / fee_queue / local / net / cluster so an operator can see which component drives the effective fee. The existing panels showed the aggregate only. - Validator Health (xrpld-validator-health.json): "Agreement % (7d)" and "Agreements vs Missed (7d)" — the xrpld_validation_agreement gauge already observes agreement_pct_7d / agreements_7d / missed_7d, but the dashboard only plotted 1h and 24h windows. Panels follow the existing template: $node filter, exported_instance in legends, Title Case, axis labels. 
Co-Authored-By: Claude Opus 4.8 --- .../grafana/dashboards/xrpld-fee-market.json | 72 ++++++++++++++ .../dashboards/xrpld-validator-health.json | 97 +++++++++++++++++++ 2 files changed, 169 insertions(+) diff --git a/docker/telemetry/grafana/dashboards/xrpld-fee-market.json b/docker/telemetry/grafana/dashboards/xrpld-fee-market.json index f4977bba91..af3225a98a 100644 --- a/docker/telemetry/grafana/dashboards/xrpld-fee-market.json +++ b/docker/telemetry/grafana/dashboards/xrpld-fee-market.json @@ -303,6 +303,78 @@ }, "overrides": [] } + }, + { + "title": "Load Factor Attribution (Stacked Components)", + "description": "Stacked contribution of each load-factor component (fee escalation, queue, local, net, cluster) to the effective transaction cost. Shows WHICH component is driving the fee at any moment, which the aggregate load_factor hides.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 24 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "xrpld_load_factor_metrics{metric=\"load_factor_fee_escalation\",exported_instance=~\"$node\"}", + "legendFormat": "Fee Escalation [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "xrpld_load_factor_metrics{metric=\"load_factor_fee_queue\",exported_instance=~\"$node\"}", + "legendFormat": "Fee Queue [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "xrpld_load_factor_metrics{metric=\"load_factor_local\",exported_instance=~\"$node\"}", + "legendFormat": "Local [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "xrpld_load_factor_metrics{metric=\"load_factor_net\",exported_instance=~\"$node\"}", + "legendFormat": "Net [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": 
"xrpld_load_factor_metrics{metric=\"load_factor_cluster\",exported_instance=~\"$node\"}", + "legendFormat": "Cluster [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "axisLabel": "Load Factor Multiplier", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3, + "stacking": { + "mode": "normal", + "group": "A" + }, + "fillOpacity": 30 + } + }, + "overrides": [] + } } ], "schemaVersion": 39, diff --git a/docker/telemetry/grafana/dashboards/xrpld-validator-health.json b/docker/telemetry/grafana/dashboards/xrpld-validator-health.json index ee9d589c9c..439eb1bf43 100644 --- a/docker/telemetry/grafana/dashboards/xrpld-validator-health.json +++ b/docker/telemetry/grafana/dashboards/xrpld-validator-health.json @@ -816,6 +816,103 @@ }, "overrides": [] } + }, + { + "title": "Agreement % (7d)", + "description": "Validation agreement percentage over the trailing 7 days \u2014 the long-term reliability window used by external validator dashboards. Complements the 1h/24h stats.", + "type": "stat", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 43 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "xrpld_validation_agreement{metric=\"agreement_pct_7d\",exported_instance=~\"$node\"}", + "legendFormat": "Agreement 7d [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "thresholds": { + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 80 + }, + { + "color": "green", + "value": 95 + } + ] + }, + "custom": {} + }, + "overrides": [] + } + }, + { + "title": "Agreements vs Missed (7d)", + "description": "Agreed vs missed validation counts over the trailing 7 days. 
A rising missed trend signals sustained validator unreliability.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 43 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "xrpld_validation_agreement{metric=\"agreements_7d\",exported_instance=~\"$node\"}", + "legendFormat": "Agreements 7d [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "xrpld_validation_agreement{metric=\"missed_7d\",exported_instance=~\"$node\"}", + "legendFormat": "Missed 7d [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "axisLabel": "Count", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } } ], "schemaVersion": 39, From d7baf262f87e5856677190f85f8631a7656938f3 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Thu, 4 Jun 2026 15:51:52 +0100 Subject: [PATCH 09/16] fix(telemetry): remove duplicate consensus outcome/failures panels A phase-8->phase-9 merge (a675897aaf) duplicated the "Consensus Outcome Distribution" and "Consensus Failures Over Time" panels: both appeared twice with byte-identical queries (verified ignoring gridPos). The pair existed once on phase-6/7/8 and became two on phase-9 only, so the duplication originated in phase-9's own merge history. Remove the second (lower) copy of each and re-stack panel y-positions with no gaps. The single retained copy keeps the original y=64 row. 
Co-Authored-By: Claude Opus 4.8 --- .../grafana/dashboards/consensus-health.json | 92 +------------------ 1 file changed, 5 insertions(+), 87 deletions(-) diff --git a/docker/telemetry/grafana/dashboards/consensus-health.json b/docker/telemetry/grafana/dashboards/consensus-health.json index 1f562e5802..d9bbcaeb67 100644 --- a/docker/telemetry/grafana/dashboards/consensus-health.json +++ b/docker/telemetry/grafana/dashboards/consensus-health.json @@ -743,88 +743,6 @@ "overrides": [] } }, - { - "title": "Consensus Outcome Distribution", - "description": "Distribution of consensus.accept outcomes: yes (normal), moved_on (without full agreement), expired (timeout). Non-yes outcomes indicate network stress.", - "type": "piechart", - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 72 - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "right", - "values": ["value", "percent"] - }, - "tooltip": { - "mode": "multi" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus" - }, - "expr": "sum by (consensus_state) (increase(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.accept\", consensus_state!=\"\"}[5m]))", - "legendFormat": "{{consensus_state}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "short" - }, - "overrides": [] - } - }, - { - "title": "Consensus Failures Over Time", - "description": "Rate of non-normal consensus outcomes (moved_on + expired). 
Spikes indicate consensus instability.", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 16, - "x": 8, - "y": 72 - }, - "options": { - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus" - }, - "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.accept\", consensus_state=\"moved_on\"}[5m]))", - "legendFormat": "moved_on [{{exported_instance}}]" - }, - { - "datasource": { - "type": "prometheus" - }, - "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.accept\", consensus_state=\"expired\"}[5m]))", - "legendFormat": "expired [{{exported_instance}}]" - } - ], - "fieldConfig": { - "defaults": { - "unit": "ops", - "custom": { - "axisLabel": "Failures / Sec", - "spanNulls": true, - "insertNulls": false, - "showPoints": "auto", - "pointSize": 3 - } - }, - "overrides": [] - } - }, { "title": "Consensus Round Duration (Full Round)", "description": "p95/p50 duration of the full consensus round. The consensus.round span (RCLConsensus.cpp startRound) wraps an entire round end-to-end. Filterable by consensus mode. 
This is the single most important consensus-health signal; rising round time precedes ledger-age alarms.", @@ -833,7 +751,7 @@ "h": 8, "w": 12, "x": 0, - "y": 80 + "y": 72 }, "options": { "tooltip": { @@ -879,7 +797,7 @@ "h": 8, "w": 12, "x": 12, - "y": 80 + "y": 72 }, "options": { "tooltip": { @@ -925,7 +843,7 @@ "h": 8, "w": 12, "x": 0, - "y": 88 + "y": 80 }, "options": { "tooltip": { @@ -971,7 +889,7 @@ "h": 8, "w": 12, "x": 12, - "y": 88 + "y": 80 }, "options": { "tooltip": { @@ -1017,7 +935,7 @@ "h": 8, "w": 24, "x": 0, - "y": 96 + "y": 88 }, "options": { "tooltip": { From d3955d363943183a048c54d9cae0d533e4bfc00e Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Thu, 4 Jun 2026 15:53:53 +0100 Subject: [PATCH 10/16] fix(telemetry): emit real diverged-peer count for peers_insane_count MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The xrpld_peer_quality{metric="peers_insane_count"} gauge was hardcoded to 0.0 with a TODO, leaving the "Insane/Diverged Peers" panel permanently empty. PeerImp::json() already exposes the peer's tracking state via the "track" field (set to "diverged" when tracking_ == Tracking::Diverged). The peer-quality callback already iterates peer->json() for latency and version, so count peers whose "track" field equals "diverged" in the same loop — no change to the abstract Peer interface required. Co-Authored-By: Claude Opus 4.8 --- src/xrpld/telemetry/MetricsRegistry.cpp | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/xrpld/telemetry/MetricsRegistry.cpp b/src/xrpld/telemetry/MetricsRegistry.cpp index 79e76ea737..b7f30b6004 100644 --- a/src/xrpld/telemetry/MetricsRegistry.cpp +++ b/src/xrpld/telemetry/MetricsRegistry.cpp @@ -1000,10 +1000,12 @@ MetricsRegistry::registerPeerQualityGauge() ->Observe(value, {{"metric", name}}); }; - // Collect latencies and version info from each peer's JSON. 
+ // Collect latencies, version info, and tracking state from + // each peer's JSON. std::vector latencies; int higherVersionCount = 0; int totalPeers = 0; + int divergedCount = 0; auto const ownVersion = std::string(BuildInfo::getVersionString()); app.getOverlay().foreach([&](std::shared_ptr const& peer) { @@ -1019,6 +1021,11 @@ MetricsRegistry::registerPeerQualityGauge() if (!pv.empty() && pv > ownVersion) ++higherVersionCount; } + // PeerImp::json() sets "track" to "diverged" when the peer's + // tracking state is Tracking::Diverged (i.e. it is following + // a different ledger chain than us). + if (pj.isMember(jss::track) && pj[jss::track].asString() == "diverged") + ++divergedCount; }); // P90 latency across connected peers. @@ -1041,13 +1048,11 @@ MetricsRegistry::registerPeerQualityGauge() : 0.0; observe("peers_higher_version_pct", higherPct); - // Count peers that are insane/diverged (tracking == - // Tracking::diverged). Not directly available from the Peer - // interface, so we count peers with negative or zero latency - // as a proxy for unreachable/diverged state. - // TODO: expose PeerImp::tracking_ via the Peer interface for - // a precise count. - observe("peers_insane_count", 0.0); + // Count peers diverged from our ledger chain, read from the + // peer's "track" JSON field (set by PeerImp::json()). Diverged + // peers are following a different chain and are a leading + // indicator of local sync trouble. + observe("peers_insane_count", static_cast(divergedCount)); // Binary flag: recommend upgrade if >60% run a newer version. observe("upgrade_recommended", higherPct > 60.0 ? 
1.0 : 0.0); From 7a509a01ebbf056315892e4b238c9d0729bfd11d Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Thu, 4 Jun 2026 16:02:26 +0100 Subject: [PATCH 11/16] feat(telemetry): add xrpld_ledger_history_mismatch_total{reason} counter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LedgerHistory::handleMismatch() already classifies a built-vs-validated ledger mismatch (prior ledger, close time, consensus tx set, same/different tx set), but only bumped a single untyped beast::insight counter — the reason was dropped. Fork diagnosis was therefore a log-grep exercise. Add a labeled OTel counter so the mismatch reason is a queryable time series: - MetricsRegistry: new ledgerHistoryMismatchCounter_ + incrementLedgerHistoryMismatch(reason) - LedgerHistory: record one reason per classification branch (unknown, prior_ledger, close_time, consensus_txset, same_txset_diff_result, different_txset). Reaches MetricsRegistry via the existing app_ reference. The existing beast::insight mismatchCounter_ is left intact. Co-Authored-By: Claude Opus 4.8 --- src/xrpld/app/ledger/LedgerHistory.cpp | 16 ++++++++++++++++ src/xrpld/telemetry/MetricsRegistry.cpp | 12 ++++++++++++ src/xrpld/telemetry/MetricsRegistry.h | 15 +++++++++++++++ 3 files changed, 43 insertions(+) diff --git a/src/xrpld/app/ledger/LedgerHistory.cpp b/src/xrpld/app/ledger/LedgerHistory.cpp index 8520fc941f..77c542fb16 100644 --- a/src/xrpld/app/ledger/LedgerHistory.cpp +++ b/src/xrpld/app/ledger/LedgerHistory.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -323,11 +324,19 @@ LedgerHistory::handleMismatch( auto builtLedger = getLedgerByHash(built); auto validLedger = getLedgerByHash(valid); + // Records the classified mismatch reason as a labeled OTel counter so + // fork diagnosis is a queryable time series, not just a log grep. 
+ auto recordReason = [this](std::string_view reason) { + if (auto* mr = app_.getMetricsRegistry()) + mr->incrementLedgerHistoryMismatch(reason); + }; + if (!builtLedger || !validLedger) { JLOG(j_.error()) << "MISMATCH cannot be analyzed:" << " builtLedger: " << to_string(built) << " -> " << builtLedger << " validLedger: " << to_string(valid) << " -> " << validLedger; + recordReason("unknown"); return; } @@ -349,6 +358,7 @@ LedgerHistory::handleMismatch( if (builtLedger->header().parentHash != validLedger->header().parentHash) { JLOG(j_.error()) << "MISMATCH on prior ledger"; + recordReason("prior_ledger"); return; } @@ -356,6 +366,7 @@ LedgerHistory::handleMismatch( if (builtLedger->header().closeTime != validLedger->header().closeTime) { JLOG(j_.error()) << "MISMATCH on close time"; + recordReason("close_time"); return; } @@ -366,6 +377,7 @@ LedgerHistory::handleMismatch( JLOG(j_.error()) << "MISMATCH on consensus transaction set " << " built: " << to_string(*builtConsensusHash) << " validated: " << to_string(*validatedConsensusHash); + recordReason("consensus_txset"); } else JLOG(j_.error()) << "MISMATCH with same consensus transaction set: " @@ -379,10 +391,14 @@ LedgerHistory::handleMismatch( if (builtTx == validTx) { JLOG(j_.error()) << "MISMATCH with same " << builtTx.size() << " transactions"; + recordReason("same_txset_diff_result"); } else + { JLOG(j_.error()) << "MISMATCH with " << builtTx.size() << " built and " << validTx.size() << " valid transactions."; + recordReason("different_txset"); + } JLOG(j_.error()) << "built\n" << getJson({*builtLedger, {}}); JLOG(j_.error()) << "valid\n" << getJson({*validLedger, {}}); diff --git a/src/xrpld/telemetry/MetricsRegistry.cpp b/src/xrpld/telemetry/MetricsRegistry.cpp index b7f30b6004..ef1c4ead47 100644 --- a/src/xrpld/telemetry/MetricsRegistry.cpp +++ b/src/xrpld/telemetry/MetricsRegistry.cpp @@ -237,6 +237,9 @@ MetricsRegistry::start(std::string const& endpoint, std::string const& instanceI 
meter_->CreateUInt64Counter("xrpld_state_changes_total", "Total operating mode changes"); jqTransOverflowCounter_ = meter_->CreateUInt64Counter( "xrpld_jq_trans_overflow_total", "Total job queue transaction overflows"); + ledgerHistoryMismatchCounter_ = meter_->CreateUInt64Counter( + "xrpld_ledger_history_mismatch_total", + "Total built-vs-validated ledger mismatches by reason"); validationAgreementsCounter_ = meter_->CreateUInt64Counter( "xrpld_validation_agreements_total", "Total validation agreements"); validationMissedCounter_ = @@ -1326,4 +1329,13 @@ MetricsRegistry::incrementJqTransOverflow() #endif } +void +MetricsRegistry::incrementLedgerHistoryMismatch(std::string_view reason) +{ +#ifdef XRPL_ENABLE_TELEMETRY + if (enabled_ && ledgerHistoryMismatchCounter_) + ledgerHistoryMismatchCounter_->Add(1, {{"reason", std::string(reason)}}); +#endif +} + } // namespace xrpl::telemetry diff --git a/src/xrpld/telemetry/MetricsRegistry.h b/src/xrpld/telemetry/MetricsRegistry.h index 1d84932022..8ae9129758 100644 --- a/src/xrpld/telemetry/MetricsRegistry.h +++ b/src/xrpld/telemetry/MetricsRegistry.h @@ -349,6 +349,17 @@ public: void incrementJqTransOverflow(); + /** Increment the ledger_history_mismatch_total counter for a reason. + Called from LedgerHistory::handleMismatch() once the mismatch has + been classified. The reason label turns fork diagnosis from a + log-grep into a queryable time series. + @param reason Classified mismatch cause (e.g. "prior_ledger", + "close_time", "consensus_txset", "same_txset_diff_result", + "different_txset", "unknown"). + */ + void + incrementLedgerHistoryMismatch(std::string_view reason); + /** Access the validation agreement tracker. Used by consensus and ledger hooks to record our validations and network validations so the tracker can compute agreement percentages. @@ -483,6 +494,10 @@ private: /// Counter: xrpld_jq_trans_overflow_total — incremented on job queue transaction overflows.
opentelemetry::nostd::unique_ptr> jqTransOverflowCounter_; + /// Counter: xrpld_ledger_history_mismatch_total{reason} — incremented per classified + /// built-vs-validated ledger mismatch. + opentelemetry::nostd::unique_ptr> + ledgerHistoryMismatchCounter_; /// Counter: xrpld_validation_agreements_total — incremented by ValidationTracker on /// agreement. opentelemetry::nostd::unique_ptr> From 793d2ecfcec75b6e55c7982f6e8bbeea367222d6 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Thu, 4 Jun 2026 16:06:33 +0100 Subject: [PATCH 12/16] feat(telemetry): add txq expired/dropped counters for queue backpressure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The transaction queue had no metric for demand that leaves or never enters the queue, so fee-underpayment abandonment and admission-control rejection were invisible (distinct from jq_trans_overflow, which is the job queue). Add two synchronous counters via MetricsRegistry: - xrpld_txq_expired_total — incremented in TxQ::processClosedLedger() for each queued transaction removed because its LastLedgerSequence passed (submitters who under-bid the escalating fee and were never included). - xrpld_txq_dropped_total{reason} — incremented in TxQ::apply() at the queue-full admission-control returns (reason="queue_full"). Both reach MetricsRegistry via the Application& parameter already passed to these methods; calls are null-guarded so they no-op when telemetry is disabled. 
Co-Authored-By: Claude Opus 4.8 --- src/xrpld/app/misc/detail/TxQ.cpp | 10 ++++++++++ src/xrpld/telemetry/MetricsRegistry.cpp | 22 ++++++++++++++++++++++ src/xrpld/telemetry/MetricsRegistry.h | 23 +++++++++++++++++++++++ 3 files changed, 55 insertions(+) diff --git a/src/xrpld/app/misc/detail/TxQ.cpp b/src/xrpld/app/misc/detail/TxQ.cpp index 352bef6bd9..ad14d6ae5f 100644 --- a/src/xrpld/app/misc/detail/TxQ.cpp +++ b/src/xrpld/app/misc/detail/TxQ.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -1254,6 +1255,8 @@ TxQ::apply( JLOG(j_.info()) << "Queue is full, and transaction " << transactionID << " would kick a transaction from the same account (" << account << ") out of the queue."; + if (auto* const metrics = app.getMetricsRegistry(); metrics != nullptr) + metrics->incrementTxqDropped("queue_full"); return {telCAN_NOT_QUEUE_FULL, false}; } auto const& endAccount = byAccount_.at(lastRIter->account); @@ -1297,6 +1300,8 @@ TxQ::apply( { JLOG(j_.info()) << "Queue is full, and transaction " << transactionID << " fee is lower than end item's account average fee"; + if (auto* const metrics = app.getMetricsRegistry(); metrics != nullptr) + metrics->incrementTxqDropped("queue_full"); return {telCAN_NOT_QUEUE_FULL, false}; } } @@ -1366,12 +1371,17 @@ TxQ::processClosedLedger(Application& app, ReadView const& view, bool timeLeap) maxSize_ = std::max(snapshot.txnsExpected * setup_.ledgersInQueue, setup_.queueSizeMin); // Remove any queued candidates whose LastLedgerSequence has gone by. + auto* const metrics = app.getMetricsRegistry(); for (auto candidateIter = byFee_.begin(); candidateIter != byFee_.end();) { if (candidateIter->lastValid && *candidateIter->lastValid <= ledgerSeq) { byAccount_.at(candidateIter->account).dropPenalty = true; candidateIter = erase(candidateIter); + // Count each expired transaction: submitters who under-bid the + // escalating fee and were never included before expiry. 
+ if (metrics != nullptr) + metrics->incrementTxqExpired(); } else { diff --git a/src/xrpld/telemetry/MetricsRegistry.cpp b/src/xrpld/telemetry/MetricsRegistry.cpp index ef1c4ead47..ea3553f12d 100644 --- a/src/xrpld/telemetry/MetricsRegistry.cpp +++ b/src/xrpld/telemetry/MetricsRegistry.cpp @@ -240,6 +240,10 @@ MetricsRegistry::start(std::string const& endpoint, std::string const& instanceI ledgerHistoryMismatchCounter_ = meter_->CreateUInt64Counter( "xrpld_ledger_history_mismatch_total", "Total built-vs-validated ledger mismatches by reason"); + txqExpiredCounter_ = meter_->CreateUInt64Counter( + "xrpld_txq_expired_total", "Total transactions expired out of the transaction queue"); + txqDroppedCounter_ = meter_->CreateUInt64Counter( + "xrpld_txq_dropped_total", "Total transactions refused admission to the queue by reason"); validationAgreementsCounter_ = meter_->CreateUInt64Counter( "xrpld_validation_agreements_total", "Total validation agreements"); validationMissedCounter_ = @@ -1338,4 +1342,22 @@ MetricsRegistry::incrementLedgerHistoryMismatch(std::string_view reason) #endif } +void +MetricsRegistry::incrementTxqExpired() +{ +#ifdef XRPL_ENABLE_TELEMETRY + if (enabled_ && txqExpiredCounter_) + txqExpiredCounter_->Add(1); +#endif +} + +void +MetricsRegistry::incrementTxqDropped(std::string_view reason) +{ +#ifdef XRPL_ENABLE_TELEMETRY + if (enabled_ && txqDroppedCounter_) + txqDroppedCounter_->Add(1, {{"reason", std::string(reason)}}); +#endif +} + } // namespace xrpl::telemetry diff --git a/src/xrpld/telemetry/MetricsRegistry.h b/src/xrpld/telemetry/MetricsRegistry.h index 8ae9129758..be623e4c53 100644 --- a/src/xrpld/telemetry/MetricsRegistry.h +++ b/src/xrpld/telemetry/MetricsRegistry.h @@ -360,6 +360,23 @@ public: void incrementLedgerHistoryMismatch(std::string_view reason); + /** Increment the txq_expired_total counter. 
+ Called from TxQ::processClosedLedger() for each queued transaction + removed because its LastLedgerSequence has passed — submitters who + under-bid the escalating fee and were never included. + */ + void + incrementTxqExpired(); + + /** Increment the txq_dropped_total{reason} counter. + Called from TxQ::apply() when a transaction is refused admission to + the queue (e.g. the queue is full). Distinct from expiry (already + queued) and from jq_trans_overflow (job queue, not TxQ). + @param reason Admission-control rejection cause (e.g. "queue_full"). + */ + void + incrementTxqDropped(std::string_view reason); + /** Access the validation agreement tracker. Used by consensus and ledger hooks to record our validations and network validations so the tracker can compute agreement percentages. @@ -498,6 +515,12 @@ private: /// built-vs-validated ledger mismatch. opentelemetry::nostd::unique_ptr> ledgerHistoryMismatchCounter_; + /// Counter: xrpld_txq_expired_total — incremented per transaction expired out of the + /// transaction queue. + opentelemetry::nostd::unique_ptr> txqExpiredCounter_; + /// Counter: xrpld_txq_dropped_total{reason} — incremented when a transaction is refused + /// admission to the queue. + opentelemetry::nostd::unique_ptr> txqDroppedCounter_; /// Counter: xrpld_validation_agreements_total — incremented by ValidationTracker on /// agreement. opentelemetry::nostd::unique_ptr> From 864ac729de3723e43efdec3a47d8c4ce06014563 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Thu, 4 Jun 2026 16:11:57 +0100 Subject: [PATCH 13/16] feat(telemetry): add ledger.acquire span for inbound ledger fetch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit InboundLedger drives ledger back-fill and fork recovery with timeout/retry logic (kLedgerTimeoutRetriesMax = 6), but emitted only a global ledger_fetches counter — sync/recovery cost was a telemetry blind spot. 
Add a ledger.acquire span that wraps the acquisition lifecycle: - Started in InboundLedger::init() with ledger_seq and acquire_reason (history / consensus / generic, mirroring InboundLedger::Reason). - Finalized in InboundLedger::done() with outcome (complete / failed), timeouts, and peer_count, then reset so the span duration is exported. Held as a std::optional member (same pattern as RCLConsensus roundSpan_). New op/attr/val constants added to LedgerSpanNames.h. Compiles to a no-op when telemetry is disabled via the SpanGuard fallback. Co-Authored-By: Claude Opus 4.8 --- src/xrpld/app/ledger/InboundLedger.h | 8 +++++ src/xrpld/app/ledger/detail/InboundLedger.cpp | 36 +++++++++++++++++++ src/xrpld/app/ledger/detail/LedgerSpanNames.h | 20 +++++++++++ 3 files changed, 64 insertions(+) diff --git a/src/xrpld/app/ledger/InboundLedger.h b/src/xrpld/app/ledger/InboundLedger.h index d155c5902c..d59091d6e0 100644 --- a/src/xrpld/app/ledger/InboundLedger.h +++ b/src/xrpld/app/ledger/InboundLedger.h @@ -6,8 +6,10 @@ #include #include +#include #include +#include #include #include @@ -170,6 +172,12 @@ private: receivedData_; bool receiveDispatched_{false}; std::unique_ptr peerSet_; + + /// Spans the acquire lifecycle: started in init(), finalized in done() + /// with the outcome (complete/failed), timeout count, and peer count. + /// Gives operators visibility into back-fill / fork-recovery cost, which + /// previously surfaced only through the global ledger_fetches counter.
+ std::optional acquireSpan_; }; } // namespace xrpl diff --git a/src/xrpld/app/ledger/detail/InboundLedger.cpp b/src/xrpld/app/ledger/detail/InboundLedger.cpp index 9ba7bdf22e..423e586069 100644 --- a/src/xrpld/app/ledger/detail/InboundLedger.cpp +++ b/src/xrpld/app/ledger/detail/InboundLedger.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -30,6 +31,8 @@ #include #include #include +#include +#include #include @@ -46,6 +49,7 @@ #include #include #include +#include #include #include #include @@ -95,6 +99,23 @@ InboundLedger::init(ScopedLockType& collectionLock) ScopedLockType sl(mtx_); collectionLock.unlock(); + // Span the acquire lifecycle so back-fill / fork-recovery cost is + // observable. Finalized in done() with the outcome and timeout count. + { + using namespace telemetry; + acquireSpan_.emplace( + SpanGuard::span(TraceCategory::Ledger, seg::ledger, ledger_span::op::acquire)); + if (*acquireSpan_) + { + acquireSpan_->setAttribute(ledger_span::attr::ledgerSeq, static_cast(seq_)); + std::string_view const reasonVal = reason_ == Reason::HISTORY + ? std::string_view(ledger_span::val::history) + : reason_ == Reason::CONSENSUS ? std::string_view(ledger_span::val::consensus) + : std::string_view(ledger_span::val::generic); + acquireSpan_->setAttribute(ledger_span::attr::acquireReason, reasonVal); + } + } + tryDB(app_.getNodeFamily().db()); if (failed_) return; @@ -416,6 +437,21 @@ InboundLedger::done() signaled_ = true; touch(); + // Finalize the acquire span with the outcome, timeout count, and peer + // count, then end it (reset) so its duration is exported. + if (acquireSpan_ && *acquireSpan_) + { + using namespace telemetry; + acquireSpan_->setAttribute( + ledger_span::attr::outcome, + failed_ ? 
std::string_view(ledger_span::val::failed) + : std::string_view(ledger_span::val::complete)); + acquireSpan_->setAttribute(ledger_span::attr::timeouts, static_cast(timeouts_)); + acquireSpan_->setAttribute( + ledger_span::attr::peerCount, static_cast(getPeerCount())); + } + acquireSpan_.reset(); + JLOG(journal_.debug()) << "Acquire " << hash_ << (failed_ ? " fail " : " ") << ((timeouts_ == 0) ? std::string() diff --git a/src/xrpld/app/ledger/detail/LedgerSpanNames.h b/src/xrpld/app/ledger/detail/LedgerSpanNames.h index a359e5d2c7..6dc057915f 100644 --- a/src/xrpld/app/ledger/detail/LedgerSpanNames.h +++ b/src/xrpld/app/ledger/detail/LedgerSpanNames.h @@ -10,6 +10,7 @@ * ledger.build (BuildLedger — ledger construction) * ledger.store (LedgerMaster — ledger storage) * ledger.validate (LedgerMaster — ledger validation acceptance) + * ledger.acquire (InboundLedger — fetch a missing ledger from peers) * tx.apply (BuildLedger — transaction application) */ @@ -24,6 +25,7 @@ inline constexpr auto build = makeStr("build"); inline constexpr auto store = makeStr("store"); inline constexpr auto validate = makeStr("validate"); inline constexpr auto apply = makeStr("apply"); +inline constexpr auto acquire = makeStr("acquire"); } // namespace op // ===== Attribute keys ======================================================== @@ -40,6 +42,24 @@ using ::xrpl::telemetry::attr::ledgerSeq; inline constexpr auto txCount = makeStr("tx_count"); inline constexpr auto txFailed = makeStr("tx_failed"); inline constexpr auto validations = makeStr("validations"); + +/// ledger.acquire attrs (InboundLedger fetch lifecycle). 
+inline constexpr auto acquireReason = makeStr("acquire_reason"); +inline constexpr auto timeouts = makeStr("timeouts"); +inline constexpr auto peerCount = makeStr("peer_count"); +inline constexpr auto outcome = makeStr("outcome"); } // namespace attr +// ===== Attribute values ====================================================== + +namespace val { +/// ledger.acquire outcome values. +inline constexpr auto complete = makeStr("complete"); +inline constexpr auto failed = makeStr("failed"); +/// ledger.acquire reason values (mirror InboundLedger::Reason). +inline constexpr auto history = makeStr("history"); +inline constexpr auto consensus = makeStr("consensus"); +inline constexpr auto generic = makeStr("generic"); +} // namespace val + } // namespace xrpl::telemetry::ledger_span From 9376aa7c8808fe165dde270936bed4a652ef1b02 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Thu, 4 Jun 2026 16:14:33 +0100 Subject: [PATCH 14/16] feat(telemetry): add reduce-relay efficiency gauge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The transaction reduce-relay subsystem (selected vs suppressed peers, feature-disabled peers, missing-tx frequency) was computed in OverlayImpl's TxMetrics but only surfaced via the get_counts JSON RPC — invisible to Prometheus/Grafana, despite being the central efficiency KPI for the feature. Add an observable gauge xrpld_reduce_relay_metrics{metric} that reads Overlay::txMetrics() and parses its rolling-average fields: - selected_peers (txr_selected_cnt) - suppressed_peers (txr_suppressed_cnt) - not_enabled_peers (txr_not_enabled_cnt) - missing_tx_freq (txr_missing_tx_freq) The JSON values are decimal strings (std::to_string), parsed via std::stoll — the same JSON-reading pattern as registerNodeStoreGauge. No new Overlay accessor or core-interface change required. 
Co-Authored-By: Claude Opus 4.8 --- src/xrpld/telemetry/MetricsRegistry.cpp | 52 +++++++++++++++++++++++++ src/xrpld/telemetry/MetricsRegistry.h | 6 +++ 2 files changed, 58 insertions(+) diff --git a/src/xrpld/telemetry/MetricsRegistry.cpp b/src/xrpld/telemetry/MetricsRegistry.cpp index ea3553f12d..8ca0c15889 100644 --- a/src/xrpld/telemetry/MetricsRegistry.cpp +++ b/src/xrpld/telemetry/MetricsRegistry.cpp @@ -436,6 +436,7 @@ MetricsRegistry::registerAsyncGauges() registerDbMetricsGauge(); registerValidatorHealthGauge(); registerPeerQualityGauge(); + registerReduceRelayGauge(); registerLedgerEconomyGauge(); registerStateTrackingGauge(); registerStorageDetailGauge(); @@ -1072,6 +1073,57 @@ MetricsRegistry::registerPeerQualityGauge() this); } +void +MetricsRegistry::registerReduceRelayGauge() +{ + // Transaction reduce-relay efficiency. Overlay::txMetrics() exposes the + // rolling averages as a JSON object with string values (std::to_string), + // so parse each field. A high suppressed:selected ratio proves the + // feature is saving bandwidth; a high not_enabled count means stale peers + // force full relay. + reduceRelayGauge_ = meter_->CreateInt64ObservableGauge( + "xrpld_reduce_relay_metrics", "Transaction reduce-relay efficiency metrics"); + reduceRelayGauge_->AddCallback( + [](opentelemetry::metrics::ObserverResult result, void* state) { + auto* self = static_cast(state); + if (self->callbacksDetached_.load(std::memory_order_acquire)) + return; + auto& app = self->app_; + + try + { + auto const tm = app.getOverlay().txMetrics(); + + auto observe = [&](char const* name, int64_t value) { + opentelemetry::nostd::get>>(result) + ->Observe(value, {{"metric", name}}); + }; + + // Each field is a decimal string; emit when present and parseable. 
+ auto observeField = [&](auto const& field, char const* name) { + if (tm.isMember(field)) + { + auto const s = tm[field].asString(); + if (!s.empty()) + observe(name, static_cast(std::stoll(s))); + } + }; + + observeField(jss::txr_selected_cnt, "selected_peers"); + observeField(jss::txr_suppressed_cnt, "suppressed_peers"); + observeField(jss::txr_not_enabled_cnt, "not_enabled_peers"); + observeField(jss::txr_missing_tx_freq, "missing_tx_freq"); + } + catch (...) // NOLINT(bugprone-empty-catch) + { + // Silently skip if services are not yet ready or a value is + // not parseable. + } + }, + this); +} + void MetricsRegistry::registerLedgerEconomyGauge() { diff --git a/src/xrpld/telemetry/MetricsRegistry.h b/src/xrpld/telemetry/MetricsRegistry.h index be623e4c53..a16b77de3d 100644 --- a/src/xrpld/telemetry/MetricsRegistry.h +++ b/src/xrpld/telemetry/MetricsRegistry.h @@ -478,6 +478,10 @@ private: /// insane peer count, version spread, upgrade recommendation). opentelemetry::nostd::shared_ptr peerQualityGauge_; + /// Observable gauge for transaction reduce-relay efficiency (selected vs + /// suppressed peers, feature-disabled peers, missing-tx frequency). + opentelemetry::nostd::shared_ptr + reduceRelayGauge_; /// Observable gauge for ledger economy metrics (base fee, reserve, /// reserve increment, ledger age). 
opentelemetry::nostd::shared_ptr @@ -563,6 +567,8 @@ private: void registerPeerQualityGauge(); // Task 7.10 void + registerReduceRelayGauge(); // Reduce-relay efficiency + void registerLedgerEconomyGauge(); // Task 7.11 void registerStateTrackingGauge(); // Task 7.12 From 6205199dc75f3bbb6a3435c0e54bcea292b9d998 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Thu, 4 Jun 2026 16:15:20 +0100 Subject: [PATCH 15/16] docs(telemetry): list new instruments in MetricsRegistry class diagram Add the new synchronous counters (ledger_history_mismatch_total{reason}, txq_expired_total, txq_dropped_total{reason}) and the reduce-relay observable gauge to the ASCII ownership diagram in the MetricsRegistry header so the documented instrument inventory matches the code. Co-Authored-By: Claude Opus 4.8 --- src/xrpld/telemetry/MetricsRegistry.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/xrpld/telemetry/MetricsRegistry.h b/src/xrpld/telemetry/MetricsRegistry.h index a16b77de3d..63a240ef75 100644 --- a/src/xrpld/telemetry/MetricsRegistry.h +++ b/src/xrpld/telemetry/MetricsRegistry.h @@ -37,6 +37,9 @@ | +-- xrpld_validations_checked_total | +-- xrpld_state_changes_total | +-- xrpld_jq_trans_overflow_total + | +-- xrpld_ledger_history_mismatch_total{reason} + | +-- xrpld_txq_expired_total + | +-- xrpld_txq_dropped_total{reason} | +-- ValidationTracker (validation agreement tracker) | @@ -53,6 +56,7 @@ +-- DB metrics (storage KB, fetch rate) +-- Validator health (amend blocked, UNL, quorum) +-- Peer quality (P90 latency, version spread) + +-- Reduce-relay efficiency (selected/suppressed peers) +-- Ledger economy (fees, reserves, age) +-- State tracking (mode value, time in state) +-- Storage detail (NuDB sizes) From 7d8e908879153eb18bee8b85406f98926f985fc9 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Thu, 4 Jun 2026 16:16:55 +0100 Subject: [PATCH 16/16] 
feat(telemetry): add dashboard panels for new T3 metrics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Visualise the metrics added in this series: - consensus-health: "Ledger History Mismatch Rate by Reason" (xrpld_ledger_history_mismatch_total by reason — fork diagnostics) - fee-market: "Queue Abandonment Rate (Expired)" and "Queue Admission Rejections (Dropped)" (xrpld_txq_expired_total / dropped_total) - peer-network: "Reduce-Relay Peer Selection" and "Reduce-Relay Missing-Tx Frequency" (xrpld_reduce_relay_metrics) - system-node-health: "Ledger Acquire Duration" and "Ledger Acquire Rate by Outcome" (ledger.acquire span) otel-collector-config.yaml: add outcome and acquire_reason spanmetrics dimensions so the ledger.acquire outcome breakdown populates. All panels follow the existing template: $node filter, exported_instance in legends, Title Case, axis labels. Co-Authored-By: Claude Opus 4.8 --- .../grafana/dashboards/consensus-health.json | 39 ++++++++ .../grafana/dashboards/peer-network.json | 92 +++++++++++++++++++ .../dashboards/system-node-health.json | 85 +++++++++++++++++ .../grafana/dashboards/xrpld-fee-market.json | 78 ++++++++++++++++ docker/telemetry/otel-collector-config.yaml | 3 + 5 files changed, 297 insertions(+) diff --git a/docker/telemetry/grafana/dashboards/consensus-health.json b/docker/telemetry/grafana/dashboards/consensus-health.json index d9bbcaeb67..20bad0543d 100644 --- a/docker/telemetry/grafana/dashboards/consensus-health.json +++ b/docker/telemetry/grafana/dashboards/consensus-health.json @@ -965,6 +965,45 @@ }, "overrides": [] } + }, + { + "title": "Ledger History Mismatch Rate by Reason", + "description": "Rate of built-vs-validated ledger mismatches broken down by reason (prior_ledger, close_time, consensus_txset, same_txset_diff_result, different_txset, unknown). 
Answers WHY the node forked \u2014 Byzantine close-time disagreement vs sync drift vs tx-processing difference.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 96 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (reason, exported_instance) (rate(xrpld_ledger_history_mismatch_total{exported_instance=~\"$node\"}[5m]))", + "legendFormat": "{{reason}} [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Mismatches / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } } ], "schemaVersion": 39, diff --git a/docker/telemetry/grafana/dashboards/peer-network.json b/docker/telemetry/grafana/dashboards/peer-network.json index ff3bd53c93..dfbc751cb8 100644 --- a/docker/telemetry/grafana/dashboards/peer-network.json +++ b/docker/telemetry/grafana/dashboards/peer-network.json @@ -150,6 +150,98 @@ }, "overrides": [] } + }, + { + "title": "Reduce-Relay Peer Selection", + "description": "Transaction reduce-relay efficiency: peers selected as relay targets vs suppressed, plus peers with the feature disabled.
A high suppressed:selected ratio proves reduce-relay is saving bandwidth; a high not_enabled count means stale peers force full relay.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "xrpld_reduce_relay_metrics{metric=\"selected_peers\",exported_instance=~\"$node\"}", + "legendFormat": "Selected [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "xrpld_reduce_relay_metrics{metric=\"suppressed_peers\",exported_instance=~\"$node\"}", + "legendFormat": "Suppressed [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "xrpld_reduce_relay_metrics{metric=\"not_enabled_peers\",exported_instance=~\"$node\"}", + "legendFormat": "Not Enabled [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "axisLabel": "Peer Count", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Reduce-Relay Missing-Tx Frequency", + "description": "Frequency of on-demand transaction fetches triggered when a peer is missing a relayed transaction. 
A rising value means the suppression is too aggressive and the on-demand fetch path is growing.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "xrpld_reduce_relay_metrics{metric=\"missing_tx_freq\",exported_instance=~\"$node\"}", + "legendFormat": "Missing Tx Freq [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "axisLabel": "Frequency", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } } ], "schemaVersion": 39, diff --git a/docker/telemetry/grafana/dashboards/system-node-health.json b/docker/telemetry/grafana/dashboards/system-node-health.json index 8ee79ee498..c52b61368d 100644 --- a/docker/telemetry/grafana/dashboards/system-node-health.json +++ b/docker/telemetry/grafana/dashboards/system-node-health.json @@ -2051,6 +2051,91 @@ }, "overrides": [] } + }, + { + "title": "Ledger Acquire Duration (Inbound Fetch)", + "description": "p95/p50 duration of the ledger.acquire span (InboundLedger): how long it takes to fetch a missing ledger from peers. A spike signals the node is falling behind or recovering from a fork. 
Populated under back-fill / sync activity.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 126 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"ledger.acquire\"}[5m])))", + "legendFormat": "P95 Acquire [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"ledger.acquire\"}[5m])))", + "legendFormat": "P50 Acquire [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Ledger Acquire Rate by Outcome", + "description": "Rate of completed ledger.acquire spans broken down by outcome (complete / failed). A rising failed rate indicates the node cannot fetch needed ledgers from its peers. 
Requires the outcome spanmetrics dimension.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 126 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (outcome, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"ledger.acquire\"}[5m]))", + "legendFormat": "{{outcome}} [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Acquisitions / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } } ], "schemaVersion": 39, diff --git a/docker/telemetry/grafana/dashboards/xrpld-fee-market.json b/docker/telemetry/grafana/dashboards/xrpld-fee-market.json index af3225a98a..dfeac283e7 100644 --- a/docker/telemetry/grafana/dashboards/xrpld-fee-market.json +++ b/docker/telemetry/grafana/dashboards/xrpld-fee-market.json @@ -375,6 +375,84 @@ }, "overrides": [] } + }, + { + "title": "Queue Abandonment Rate (Expired)", + "description": "Rate of transactions expired out of the queue (LastLedgerSequence passed). 
Rising expiry means submitters under-bid the escalating fee and gave up \u2014 a demand-frustration signal.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 32 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(xrpld_txq_expired_total{exported_instance=~\"$node\"}[5m]))", + "legendFormat": "Expired / Sec [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Expired / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Queue Admission Rejections (Dropped)", + "description": "Rate of transactions refused admission to the queue, by reason. queue_full means the queue is at capacity \u2014 admission-control backpressure distinct from expiry and from job-queue overflow.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (reason, exported_instance) (rate(xrpld_txq_dropped_total{exported_instance=~\"$node\"}[5m]))", + "legendFormat": "{{reason}} [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Dropped / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } } ], "schemaVersion": 39, diff --git a/docker/telemetry/otel-collector-config.yaml b/docker/telemetry/otel-collector-config.yaml index f282e5b5ef..988c1f3d20 100644 --- a/docker/telemetry/otel-collector-config.yaml +++ b/docker/telemetry/otel-collector-config.yaml @@ -106,6 +106,9 @@ connectors: - name: method - name: grpc_role - name: grpc_status + # ledger.acquire dimensions 
(bounded: outcome, acquire reason). + - name: outcome + - name: acquire_reason exporters: debug: