diff --git a/docker/telemetry/docker-compose.yml b/docker/telemetry/docker-compose.yml index b296a0e1d9..becbbb811b 100644 --- a/docker/telemetry/docker-compose.yml +++ b/docker/telemetry/docker-compose.yml @@ -96,6 +96,11 @@ services: environment: - GF_AUTH_ANONYMOUS_ENABLED=true # No login required for local dev - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin # Full access without auth + # Remote image rendering: point Grafana at the renderer container. + # These belong on the grafana service (the server delegates renders); + # the callback URL is how the renderer fetches the panel from grafana. + - GF_RENDERING_SERVER_URL=http://renderer:8081/render + - GF_RENDERING_CALLBACK_URL=http://grafana:3000/ ports: - "3000:3000" # Grafana web UI volumes: @@ -106,9 +111,18 @@ services: - tempo - prometheus - loki + - renderer networks: - xrpld-telemetry + # Grafana image renderer: a sidecar that renders panels/dashboards to PNG + # for image export and alerting. Grafana calls it at http://renderer:8081. + renderer: + image: grafana/grafana-image-renderer:3.11.0 # Pinned tag: reproducible pulls, no surprise upgrades + ports: + - "8081:8081" # Renderer HTTP endpoint (called by grafana) + networks: + - xrpld-telemetry # Named volume for Tempo trace storage (WAL and compacted blocks). # Data persists across container restarts. Remove with: # docker compose -f docker/telemetry/docker-compose.yml down -v diff --git a/docker/telemetry/grafana/dashboards/consensus-health.json b/docker/telemetry/grafana/dashboards/consensus-health.json index 0c6914099f..20bad0543d 100644 --- a/docker/telemetry/grafana/dashboards/consensus-health.json +++ b/docker/telemetry/grafana/dashboards/consensus-health.json @@ -744,49 +744,13 @@ } }, { - "title": "Consensus Outcome Distribution", - "description": "Distribution of consensus.accept outcomes: yes (normal), moved_on (without full agreement), expired (timeout). 
Non-yes outcomes indicate network stress.", - "type": "piechart", - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 72 - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "right", - "values": ["value", "percent"] - }, - "tooltip": { - "mode": "multi" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus" - }, - "expr": "sum by (consensus_state) (increase(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.accept\", consensus_state!=\"\"}[5m]))", - "legendFormat": "{{consensus_state}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "short" - }, - "overrides": [] - } - }, - { - "title": "Consensus Failures Over Time", - "description": "Rate of non-normal consensus outcomes (moved_on + expired). Spikes indicate consensus instability.", + "title": "Consensus Round Duration (Full Round)", + "description": "p95/p50 duration of the full consensus round. The consensus.round span (RCLConsensus.cpp startRound) wraps an entire round end-to-end. Filterable by consensus mode. 
This is the single most important consensus-health signal; rising round time precedes ledger-age alarms.", "type": "timeseries", "gridPos": { "h": 8, - "w": 16, - "x": 8, + "w": 12, + "x": 0, "y": 72 }, "options": { @@ -800,22 +764,238 @@ "datasource": { "type": "prometheus" }, - "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.accept\", consensus_state=\"moved_on\"}[5m]))", - "legendFormat": "moved_on [{{exported_instance}}]" + "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.round\"}[5m])))", + "legendFormat": "P95 Round [{{exported_instance}}]" }, { "datasource": { "type": "prometheus" }, - "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.accept\", consensus_state=\"expired\"}[5m]))", - "legendFormat": "expired [{{exported_instance}}]" + "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.round\"}[5m])))", + "legendFormat": "P50 Round [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Consensus Phase Duration (Open vs Establish)", + "description": "p95 duration of the open phase (transaction collection) vs the establish phase (proposal convergence). 
The consensus.phase.open and consensus.establish spans decompose round latency, so an operator can tell whether slowness is in collecting transactions or reaching agreement.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 72 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.phase.open\"}[5m])))", + "legendFormat": "P95 Open Phase [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.establish\"}[5m])))", + "legendFormat": "P95 Establish Phase [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Position Update Duration", + "description": "p95/p50 duration of the consensus.update_positions span, which tallies disputes and updates this node's position each round. 
Long durations indicate heavy dispute resolution or slow convergence on close time.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 80 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.update_positions\"}[5m])))", + "legendFormat": "P95 Update [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.5, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.update_positions\"}[5m])))", + "legendFormat": "P50 Update [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Consensus Stall Rate", + "description": "Rate of consensus.check spans reporting consensus_stalled=true, broken down by stall flag. A non-zero stalled rate surfaces stall conditions before they manifest as validated-ledger-age alarms. 
Requires the consensus_stalled spanmetrics dimension.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 80 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.check\", consensus_stalled=\"true\"}[5m]))", + "legendFormat": "Stalled [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.check\", consensus_stalled=\"false\"}[5m]))", + "legendFormat": "Not Stalled [{{exported_instance}}]" } ], "fieldConfig": { "defaults": { "unit": "ops", "custom": { - "axisLabel": "Failures / Sec", + "axisLabel": "Checks / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Consensus Mode-Change Rate by Target Mode", + "description": "Rate of consensus.mode_change spans broken down by the mode the node switched INTO (mode_new). Frequent switches into Wrong Ledger or Switched Ledger indicate an unstable node at fork risk. 
Requires the mode_new spanmetrics dimension.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 88 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (mode_new, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.mode_change\"}[5m]))", + "legendFormat": "{{mode_new}} [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Mode Changes / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Ledger History Mismatch Rate by Reason", + "description": "Rate of built-vs-validated ledger mismatches broken down by reason (prior_ledger, close_time, consensus_txset, same_txset_diff_result, different_txset, unknown). Answers WHY the node forked \u2014 Byzantine close-time disagreement vs sync drift vs tx-processing difference.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 96 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (reason, exported_instance) (rate(xrpld_ledger_history_mismatch_total{exported_instance=~\"$node\"}[5m]))", + "legendFormat": "{{reason}} [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Mismatches / Sec", "spanNulls": true, "insertNulls": false, "showPoints": "auto", @@ -827,7 +1007,7 @@ } ], "schemaVersion": 39, - "tags": ["rippled", "consensus", "telemetry"], + "tags": ["xrpld", "consensus"], "templating": { "list": [ { diff --git a/docker/telemetry/grafana/dashboards/ledger-operations.json b/docker/telemetry/grafana/dashboards/ledger-operations.json index 2ae55fe875..6bbd6d17d0 100644 --- 
a/docker/telemetry/grafana/dashboards/ledger-operations.json +++ b/docker/telemetry/grafana/dashboards/ledger-operations.json @@ -319,7 +319,7 @@ } ], "schemaVersion": 39, - "tags": ["rippled", "ledger", "telemetry"], + "tags": ["xrpld", "ledger"], "templating": { "list": [ { diff --git a/docker/telemetry/grafana/dashboards/peer-network.json b/docker/telemetry/grafana/dashboards/peer-network.json index 1ebea03c72..dfbc751cb8 100644 --- a/docker/telemetry/grafana/dashboards/peer-network.json +++ b/docker/telemetry/grafana/dashboards/peer-network.json @@ -150,10 +150,102 @@ }, "overrides": [] } + }, + { + "title": "Reduce-Relay Peer Selection", + "description": "Transaction reduce-relay efficiency: peers selected as relay sources vs suppressed, plus peers with the feature disabled. A high suppressed:selected ratio proves reduce-relay is saving bandwidth; a high not_enabled count means stale peers force full relay.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "xrpld_reduce_relay_metrics{metric=\"selected_peers\",exported_instance=~\"$node\"}", + "legendFormat": "Selected [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "xrpld_reduce_relay_metrics{metric=\"suppressed_peers\",exported_instance=~\"$node\"}", + "legendFormat": "Suppressed [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "xrpld_reduce_relay_metrics{metric=\"not_enabled_peers\",exported_instance=~\"$node\"}", + "legendFormat": "Not Enabled [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "axisLabel": "Peer Count", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Reduce-Relay Missing-Tx Frequency", 
+ "description": "Frequency of on-demand transaction fetches triggered when a peer is missing a relayed transaction. A rising value means the suppression is too aggressive and the on-demand fetch path is growing.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "xrpld_reduce_relay_metrics{metric=\"missing_tx_freq\",exported_instance=~\"$node\"}", + "legendFormat": "Missing Tx Freq [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "axisLabel": "Frequency", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } } ], "schemaVersion": 39, - "tags": ["rippled", "peer", "telemetry"], + "tags": ["xrpld", "peer"], "templating": { "list": [ { diff --git a/docker/telemetry/grafana/dashboards/rpc-performance.json b/docker/telemetry/grafana/dashboards/rpc-performance.json index d21e68b7be..45074c0eac 100644 --- a/docker/telemetry/grafana/dashboards/rpc-performance.json +++ b/docker/telemetry/grafana/dashboards/rpc-performance.json @@ -418,7 +418,7 @@ } ], "schemaVersion": 39, - "tags": ["rippled", "rpc", "telemetry"], + "tags": ["xrpld", "rpc"], "templating": { "list": [ { diff --git a/docker/telemetry/grafana/dashboards/system-ledger-data-sync.json b/docker/telemetry/grafana/dashboards/system-ledger-data-sync.json index bdb62487b9..006155f672 100644 --- a/docker/telemetry/grafana/dashboards/system-ledger-data-sync.json +++ b/docker/telemetry/grafana/dashboards/system-ledger-data-sync.json @@ -493,7 +493,7 @@ } ], "schemaVersion": 39, - "tags": ["rippled", "statsd", "ledger", "sync", "telemetry"], + "tags": ["xrpld", "statsd", "ledger", "sync"], "templating": { "list": [ { diff --git a/docker/telemetry/grafana/dashboards/system-network-traffic.json 
b/docker/telemetry/grafana/dashboards/system-network-traffic.json index ea4d708c74..9fb061b5a5 100644 --- a/docker/telemetry/grafana/dashboards/system-network-traffic.json +++ b/docker/telemetry/grafana/dashboards/system-network-traffic.json @@ -771,7 +771,7 @@ } ], "schemaVersion": 39, - "tags": ["rippled", "statsd", "network", "telemetry"], + "tags": ["xrpld", "statsd", "network"], "templating": { "list": [ { diff --git a/docker/telemetry/grafana/dashboards/system-node-health.json b/docker/telemetry/grafana/dashboards/system-node-health.json index 9247c33745..c52b61368d 100644 --- a/docker/telemetry/grafana/dashboards/system-node-health.json +++ b/docker/telemetry/grafana/dashboards/system-node-health.json @@ -405,7 +405,7 @@ } }, { - "title": "--- OTel: NodeStore I/O ---", + "title": "OTel: NodeStore I/O", "type": "row", "gridPos": { "h": 1, @@ -646,7 +646,7 @@ } }, { - "title": "--- OTel: Cache Hit Rates ---", + "title": "OTel: Cache Hit Rates", "type": "row", "gridPos": { "h": 1, @@ -781,7 +781,7 @@ } }, { - "title": "--- OTel: Object Instance Counts ---", + "title": "OTel: Object Instance Counts", "type": "row", "gridPos": { "h": 1, @@ -839,7 +839,7 @@ } }, { - "title": "--- OTel: Server Info ---", + "title": "OTel: Server Info", "type": "row", "gridPos": { "h": 1, @@ -1185,7 +1185,7 @@ } }, { - "title": "--- OTel: Complete Ledgers & DB ---", + "title": "OTel: Complete Ledgers & DB", "type": "row", "gridPos": { "h": 1, @@ -1357,7 +1357,7 @@ } }, { - "title": "--- OTel: Ledger Economy ---", + "title": "OTel: Ledger Economy", "type": "row", "gridPos": { "h": 1, @@ -1562,7 +1562,7 @@ } }, { - "title": "--- Extended Metrics (Recovered from Phase 6) ---", + "title": "Extended Metrics (Recovered from Phase 6)", "type": "row", "gridPos": { "h": 1, @@ -2051,10 +2051,95 @@ }, "overrides": [] } + }, + { + "title": "Ledger Acquire Duration (Inbound Fetch)", + "description": "p95/p50 duration of the ledger.acquire span (InboundLedger): how long it takes to fetch a 
missing ledger from peers. A spike signals the node is falling behind or recovering from a fork. Populated under back-fill / sync activity.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 126 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"ledger.acquire\"}[5m])))", + "legendFormat": "P95 Acquire [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"ledger.acquire\"}[5m])))", + "legendFormat": "P50 Acquire [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Ledger Acquire Rate by Outcome", + "description": "Rate of completed ledger.acquire spans broken down by outcome (complete / failed). A rising failed rate indicates the node cannot fetch needed ledgers from its peers. 
Requires the outcome spanmetrics dimension.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 126 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (outcome, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"ledger.acquire\"}[5m]))", + "legendFormat": "{{outcome}} [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Acquisitions / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } } ], "schemaVersion": 39, - "tags": ["rippled", "statsd", "otel", "node-health", "telemetry"], + "tags": ["xrpld", "statsd", "node-health"], "templating": { "list": [ { diff --git a/docker/telemetry/grafana/dashboards/system-overlay-traffic-detail.json b/docker/telemetry/grafana/dashboards/system-overlay-traffic-detail.json index 5009364ddc..496ec56c22 100644 --- a/docker/telemetry/grafana/dashboards/system-overlay-traffic-detail.json +++ b/docker/telemetry/grafana/dashboards/system-overlay-traffic-detail.json @@ -553,7 +553,7 @@ } ], "schemaVersion": 39, - "tags": ["rippled", "statsd", "overlay", "network", "telemetry"], + "tags": ["xrpld", "statsd", "overlay", "network"], "templating": { "list": [ { diff --git a/docker/telemetry/grafana/dashboards/system-rpc-pathfinding.json b/docker/telemetry/grafana/dashboards/system-rpc-pathfinding.json index 73cdeeae9e..198eceae42 100644 --- a/docker/telemetry/grafana/dashboards/system-rpc-pathfinding.json +++ b/docker/telemetry/grafana/dashboards/system-rpc-pathfinding.json @@ -380,10 +380,219 @@ }, "overrides": [] } + }, + { + "title": "gRPC Request Rate by Method (Spans)", + "description": "Per-method gRPC call rate derived from the grpc.{Method} spans (GRPCServer.cpp). Covers the gRPC API used by reporting/Clio. 
Populated only when the node serves gRPC traffic.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 32 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (method, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", method=~\"$grpc_method\", span_name=~\"grpc\\\\..*\"}[5m]))", + "legendFormat": "{{method}} [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Calls / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "gRPC Latency P95 by Method (Spans)", + "description": "p95 latency per gRPC method from grpc.{Method} span durations. Identifies slow gRPC read paths.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, method, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", method=~\"$grpc_method\", span_name=~\"grpc\\\\..*\"}[5m])))", + "legendFormat": "{{method}} [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "gRPC Error Rate by Status (Spans)", + "description": "Rate of gRPC spans broken down by grpc_status (success/error/resource_exhausted/failed_precondition). 
A rising error or resource_exhausted rate indicates gRPC clients hitting limits.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 40 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (grpc_status, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", method=~\"$grpc_method\", span_name=~\"grpc\\\\..*\", grpc_status!=\"\"}[5m]))", + "legendFormat": "{{grpc_status}} [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Calls / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Pathfinding Compute Duration (Spans)", + "description": "p95/p50 of the pathfind.compute span, the per-request path computation. Complements the StatsD pathfind_fast/full timers with span-level visibility. Populated under pathfinding (book/path) RPC load.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 40 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"pathfind.compute\"}[5m])))", + "legendFormat": "P95 Compute [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"pathfind.compute\"}[5m])))", + "legendFormat": "P50 Compute [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + 
"pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Pathfinding Request & Discovery Rate (Spans)", + "description": "Rate of pathfind.request (client path requests) and pathfind.discover (path-discovery passes) spans. Shows pathfinding demand and the discovery cost driver for subscription-heavy nodes.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 48 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"pathfind.request\"}[5m]))", + "legendFormat": "Requests / Sec [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"pathfind.discover\"}[5m]))", + "legendFormat": "Discoveries / Sec [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Operations / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } } ], "schemaVersion": 39, - "tags": ["rippled", "statsd", "rpc", "pathfinding", "telemetry"], + "tags": ["xrpld", "statsd", "rpc", "pathfinding"], "templating": { "list": [ { @@ -405,6 +614,26 @@ "multi": true, "refresh": 2, "sort": 1 + }, + { + "name": "grpc_method", + "label": "gRPC Method", + "description": "Filter by gRPC method (GetLedger, GetLedgerData, GetLedgerDiff, GetLedgerEntry)", + "type": "query", + "query": "label_values(traces_span_metrics_calls_total{span_name=~\"grpc\\\\..*\"}, method)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "includeAll": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "multi": true, + "refresh": 2, + "sort": 1 } ] }, diff --git 
a/docker/telemetry/grafana/dashboards/transaction-overview.json b/docker/telemetry/grafana/dashboards/transaction-overview.json index 292efeaede..02da67e9ea 100644 --- a/docker/telemetry/grafana/dashboards/transaction-overview.json +++ b/docker/telemetry/grafana/dashboards/transaction-overview.json @@ -506,10 +506,173 @@ }, "overrides": [] } + }, + { + "title": "TxQ Enqueue Rate by Transaction Type", + "description": "Rate of txq.enqueue spans broken down by transaction type (tx_type). Shows what share of inbound demand is Payment vs OfferCreate vs other transactors, and how the mix shifts as the queue fills. A spam burst of one type is a leading indicator of fee escalation.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 48 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (tx_type, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"txq.enqueue\"}[5m]))", + "legendFormat": "{{tx_type}} [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Enqueues / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Queue Bypass Ratio (Direct Apply vs Enqueue)", + "description": "Ratio of transactions that applied directly to the open ledger (txq.apply_direct) versus those that had to be queued (txq.enqueue). 
A falling bypass ratio is the cleanest single signal the network has entered sustained fee escalation.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 48 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"txq.apply_direct\"}[5m])) / ((sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"txq.apply_direct\"}[5m])) + sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"txq.enqueue\"}[5m]))) > 0)", + "legendFormat": "Direct-Apply Fraction [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "custom": { + "axisLabel": "Bypass Fraction", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Queue Accept (Drain) Duration per Ledger", + "description": "p95/p50 duration of the txq.accept span, which drains queued transactions into a newly closed ledger. 
Rising drain time signals queue pressure at ledger close.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 56 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"txq.accept\"}[5m])))", + "legendFormat": "P95 Drain [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"txq.accept\"}[5m])))", + "legendFormat": "P50 Drain [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Queue Cleanup Rate (Expired Entries)", + "description": "Rate of txq.cleanup spans, which remove expired transactions from the queue each ledger. 
A rising rate means submitters under-bid the escalating fee and abandoned their transactions \u2014 a demand-frustration signal distinct from acceptance throughput.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 56 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"txq.cleanup\"}[5m]))", + "legendFormat": "Cleanups / Sec [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Cleanups / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } } ], "schemaVersion": 39, - "tags": ["rippled", "transactions", "telemetry"], + "tags": ["xrpld", "transactions"], "templating": { "list": [ { diff --git a/docker/telemetry/grafana/dashboards/xrpld-fee-market.json b/docker/telemetry/grafana/dashboards/xrpld-fee-market.json index 474853d396..dfeac283e7 100644 --- a/docker/telemetry/grafana/dashboards/xrpld-fee-market.json +++ b/docker/telemetry/grafana/dashboards/xrpld-fee-market.json @@ -303,10 +303,160 @@ }, "overrides": [] } + }, + { + "title": "Load Factor Attribution (Stacked Components)", + "description": "Stacked contribution of each load-factor component (fee escalation, queue, local, net, cluster) to the effective transaction cost. 
Shows WHICH component is driving the fee at any moment, which the aggregate load_factor hides.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 24 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "xrpld_load_factor_metrics{metric=\"load_factor_fee_escalation\",exported_instance=~\"$node\"}", + "legendFormat": "Fee Escalation [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "xrpld_load_factor_metrics{metric=\"load_factor_fee_queue\",exported_instance=~\"$node\"}", + "legendFormat": "Fee Queue [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "xrpld_load_factor_metrics{metric=\"load_factor_local\",exported_instance=~\"$node\"}", + "legendFormat": "Local [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "xrpld_load_factor_metrics{metric=\"load_factor_net\",exported_instance=~\"$node\"}", + "legendFormat": "Net [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "xrpld_load_factor_metrics{metric=\"load_factor_cluster\",exported_instance=~\"$node\"}", + "legendFormat": "Cluster [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "axisLabel": "Load Factor Multiplier", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3, + "stacking": { + "mode": "normal", + "group": "A" + }, + "fillOpacity": 30 + } + }, + "overrides": [] + } + }, + { + "title": "Queue Abandonment Rate (Expired)", + "description": "Rate of transactions expired out of the queue (LastLedgerSequence passed). 
Rising expiry means submitters under-bid the escalating fee and gave up \u2014 a demand-frustration signal.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 32 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(xrpld_txq_expired_total{exported_instance=~\"$node\"}[5m]))", + "legendFormat": "Expired / Sec [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Expired / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Queue Admission Rejections (Dropped)", + "description": "Rate of transactions refused admission to the queue, by reason. queue_full means the queue is at capacity \u2014 admission-control backpressure distinct from expiry and from job-queue overflow.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (reason, exported_instance) (rate(xrpld_txq_dropped_total{exported_instance=~\"$node\"}[5m]))", + "legendFormat": "{{reason}} [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Dropped / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } } ], "schemaVersion": 39, - "tags": ["rippled", "otel", "fee-market"], + "tags": ["xrpld", "fee-market"], "templating": { "list": [ { diff --git a/docker/telemetry/grafana/dashboards/xrpld-job-queue.json b/docker/telemetry/grafana/dashboards/xrpld-job-queue.json index e99ae06f22..019f3c208b 100644 --- a/docker/telemetry/grafana/dashboards/xrpld-job-queue.json +++ 
b/docker/telemetry/grafana/dashboards/xrpld-job-queue.json @@ -446,7 +446,7 @@ } ], "schemaVersion": 39, - "tags": ["rippled", "otel", "job-queue"], + "tags": ["xrpld", "job-queue"], "templating": { "list": [ { diff --git a/docker/telemetry/grafana/dashboards/xrpld-peer-quality.json b/docker/telemetry/grafana/dashboards/xrpld-peer-quality.json index 394066a92a..6ffc83bcd9 100644 --- a/docker/telemetry/grafana/dashboards/xrpld-peer-quality.json +++ b/docker/telemetry/grafana/dashboards/xrpld-peer-quality.json @@ -363,7 +363,7 @@ } ], "schemaVersion": 39, - "tags": ["rippled", "otel", "peer", "network", "telemetry"], + "tags": ["xrpld", "peer", "network"], "templating": { "list": [ { diff --git a/docker/telemetry/grafana/dashboards/xrpld-rpc-perf-otel.json b/docker/telemetry/grafana/dashboards/xrpld-rpc-perf-otel.json index a6686de72c..b7fc34fde1 100644 --- a/docker/telemetry/grafana/dashboards/xrpld-rpc-perf-otel.json +++ b/docker/telemetry/grafana/dashboards/xrpld-rpc-perf-otel.json @@ -394,7 +394,7 @@ } ], "schemaVersion": 39, - "tags": ["rippled", "otel", "rpc"], + "tags": ["xrpld", "rpc"], "templating": { "list": [ { diff --git a/docker/telemetry/grafana/dashboards/xrpld-validator-health.json b/docker/telemetry/grafana/dashboards/xrpld-validator-health.json index 55bd9cedac..439eb1bf43 100644 --- a/docker/telemetry/grafana/dashboards/xrpld-validator-health.json +++ b/docker/telemetry/grafana/dashboards/xrpld-validator-health.json @@ -10,7 +10,7 @@ "links": [], "panels": [ { - "title": "--- Validation Agreement ---", + "title": "Validation Agreement", "type": "row", "gridPos": { "h": 1, @@ -256,7 +256,7 @@ } }, { - "title": "--- Validation Rates ---", + "title": "Validation Rates", "type": "row", "gridPos": { "h": 1, @@ -597,7 +597,7 @@ } }, { - "title": "--- Server State & Consensus ---", + "title": "Server State & Consensus", "type": "row", "gridPos": { "h": 1, @@ -816,10 +816,107 @@ }, "overrides": [] } + }, + { + "title": "Agreement % (7d)", + "description": 
"Validation agreement percentage over the trailing 7 days \u2014 the long-term reliability window used by external validator dashboards. Complements the 1h/24h stats.", + "type": "stat", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 43 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "xrpld_validation_agreement{metric=\"agreement_pct_7d\",exported_instance=~\"$node\"}", + "legendFormat": "Agreement 7d [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "thresholds": { + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 80 + }, + { + "color": "green", + "value": 95 + } + ] + }, + "custom": {} + }, + "overrides": [] + } + }, + { + "title": "Agreements vs Missed (7d)", + "description": "Agreed vs missed validation counts over the trailing 7 days. A rising missed trend signals sustained validator unreliability.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 43 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "xrpld_validation_agreement{metric=\"agreements_7d\",exported_instance=~\"$node\"}", + "legendFormat": "Agreements 7d [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "xrpld_validation_agreement{metric=\"missed_7d\",exported_instance=~\"$node\"}", + "legendFormat": "Missed 7d [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "axisLabel": "Count", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } } ], "schemaVersion": 39, - "tags": ["rippled", "otel", "validator", "health", "telemetry"], + "tags": ["xrpld", "validator", "health"], "templating": { "list": [ 
{ diff --git a/docker/telemetry/otel-collector-config.yaml b/docker/telemetry/otel-collector-config.yaml index 656c7d0c8e..988c1f3d20 100644 --- a/docker/telemetry/otel-collector-config.yaml +++ b/docker/telemetry/otel-collector-config.yaml @@ -85,7 +85,7 @@ connectors: dimensions: - name: command - name: rpc_status - - name: xrpl.consensus.mode + - name: consensus_mode - name: close_time_correct - name: consensus_state - name: local @@ -97,6 +97,18 @@ connectors: - name: txq_status - name: load_type - name: is_batch + # Consensus lifecycle dimensions (low cardinality, bounded value sets). + - name: mode_new + - name: consensus_stalled + - name: consensus_phase + - name: consensus_result + # gRPC surface dimensions (bounded: method names, role, status). + - name: method + - name: grpc_role + - name: grpc_status + # ledger.acquire dimensions (bounded: outcome, acquire reason). + - name: outcome + - name: acquire_reason exporters: debug: diff --git a/docker/telemetry/xrpld-telemetry-mainnet.cfg b/docker/telemetry/xrpld-telemetry-mainnet.cfg index 5a6a4ee639..4393ad398a 100644 --- a/docker/telemetry/xrpld-telemetry-mainnet.cfg +++ b/docker/telemetry/xrpld-telemetry-mainnet.cfg @@ -111,6 +111,11 @@ docker/telemetry/data server=otel endpoint=http://localhost:4318/v1/metrics prefix=xrpld +# Sets the OTel service.instance.id resource attribute, which Prometheus +# exposes as the `exported_instance` label. Dashboards filter on it via the +# $node template variable, so without this every insight-backed panel is +# empty. Matches [telemetry] service_instance_id for a single node identity. 
+service_instance_id=xrpld-mainnet # --- OpenTelemetry tracing -------------------------------------------------- diff --git a/docker/telemetry/xrpld-telemetry.cfg b/docker/telemetry/xrpld-telemetry.cfg index 453fe50439..a302faa2f5 100644 --- a/docker/telemetry/xrpld-telemetry.cfg +++ b/docker/telemetry/xrpld-telemetry.cfg @@ -109,6 +109,11 @@ docker/telemetry/data server=otel endpoint=http://localhost:4318/v1/metrics prefix=xrpld +# Sets the OTel service.instance.id resource attribute, which Prometheus +# exposes as the `exported_instance` label. Dashboards filter on it via the +# $node template variable, so without this every insight-backed panel is +# empty. Matches [telemetry] service_instance_id for a single node identity. +service_instance_id=xrpld-devnet # --- OpenTelemetry tracing -------------------------------------------------- diff --git a/docs/telemetry-runbook.md b/docs/telemetry-runbook.md index 196390d0fa..8545748851 100644 --- a/docs/telemetry-runbook.md +++ b/docs/telemetry-runbook.md @@ -75,47 +75,47 @@ All spans instrumented in xrpld, grouped by subsystem: ### Transaction Spans (Phase 3) -| Span Name | Source File | Attributes | Description | -| ------------ | --------------- | -------------------------------------------------------------------------------------- | ------------------------------------- | -| `tx.process` | NetworkOPs.cpp | `xrpl.tx.hash`, `local`, `path`, `tx_type`, `fee`, `sequence`, `ter_result`, `applied` | Transaction submission and processing | -| `tx.receive` | PeerImp.cpp | `xrpl.peer.id`, `xrpl.tx.hash`, `tx_type`, `peer_version`, `suppressed`, `tx_status` | Transaction received from peer relay | -| `tx.apply` | BuildLedger.cpp | `xrpl.ledger.seq`, `tx_count`, `tx_failed` | Transaction set applied per ledger | +| Span Name | Source File | Attributes | Description | +| ------------ | --------------- | --------------------------------------------------------------------------------- | ------------------------------------- | 
+| `tx.process` | NetworkOPs.cpp | `tx_hash`, `local`, `path`, `tx_type`, `fee`, `sequence`, `ter_result`, `applied` | Transaction submission and processing | +| `tx.receive` | PeerImp.cpp | `peer_id`, `tx_hash`, `tx_type`, `peer_version`, `suppressed`, `tx_status` | Transaction received from peer relay | +| `tx.apply` | BuildLedger.cpp | `ledger_seq`, `tx_count`, `tx_failed` | Transaction set applied per ledger | ### Transaction Queue Spans (Phase 3) -| Span Name | Source File | Attributes | Description | -| ------------------ | ----------- | ------------------------------------------------------------- | -------------------------------------------------- | -| `txq.enqueue` | TxQ.cpp | `xrpl.tx.hash`, `tx_type` | Transaction enqueue decision (child of tx.process) | -| `txq.apply_direct` | TxQ.cpp | -- | Direct apply attempt (bypassing queue) | -| `txq.batch_clear` | TxQ.cpp | -- | Batch clear of queued transactions for an account | -| `txq.accept` | TxQ.cpp | `queue_size`, `ledger_changed` | Ledger-close accept loop over queued transactions | -| `txq.accept_tx` | TxQ.cpp | `xrpl.tx.hash`, `retries_remaining`, `ter_code`, `txq_status` | Per-transaction apply during accept | -| `txq.cleanup` | TxQ.cpp | `xrpl.ledger.seq` | Post-close cleanup of expired queue entries | +| Span Name | Source File | Attributes | Description | +| ------------------ | ----------- | -------------------------------------------------------- | -------------------------------------------------- | +| `txq.enqueue` | TxQ.cpp | `tx_hash`, `tx_type` | Transaction enqueue decision (child of tx.process) | +| `txq.apply_direct` | TxQ.cpp | -- | Direct apply attempt (bypassing queue) | +| `txq.batch_clear` | TxQ.cpp | -- | Batch clear of queued transactions for an account | +| `txq.accept` | TxQ.cpp | `queue_size`, `ledger_changed` | Ledger-close accept loop over queued transactions | +| `txq.accept_tx` | TxQ.cpp | `tx_hash`, `retries_remaining`, `ter_code`, `txq_status` | Per-transaction apply 
during accept | +| `txq.cleanup` | TxQ.cpp | `ledger_seq` | Post-close cleanup of expired queue entries | ### Consensus Spans (Phase 4) -| Span Name | Source File | Attributes | Description | -| ------------------------------ | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------- | -| `consensus.round` | RCLConsensus.cpp | `xrpl.consensus.ledger_id`, `xrpl.ledger.seq`, `xrpl.consensus.mode`, `trace_strategy`, `xrpl.consensus.round_id` | Root span for a consensus round (deterministic or random trace ID) | -| `consensus.phase.open` | Consensus.h | -- | Open phase duration (child of round) | -| `consensus.proposal.send` | RCLConsensus.cpp | `xrpl.consensus.round`, `is_bow_out` | Consensus proposal broadcast | -| `consensus.ledger_close` | RCLConsensus.cpp | `xrpl.ledger.seq`, `xrpl.consensus.mode` | Ledger close event | -| `consensus.establish` | Consensus.h | `converge_percent`, `establish_count`, `proposers` | Establish phase duration (child of round) | -| `consensus.update_positions` | Consensus.h | `converge_percent`, `proposers`, `disputes_count` | Position update and dispute resolution (see Events below) | -| `consensus.check` | Consensus.h | `agree_count`, `disagree_count`, `converge_percent`, `have_close_time_consensus`, `threshold_percent`, `consensus_result` | Consensus threshold check | -| `consensus.accept` | RCLConsensus.cpp | `proposers`, `round_time_ms`, `quorum`, `disputes_count`, `consensus_state` | Ledger accepted by consensus | -| `consensus.accept.apply` | RCLConsensus.cpp | `xrpl.ledger.seq`, `close_time`, `close_time_correct`, `close_resolution_ms`, `consensus_state`, `proposing`, `round_time_ms`, `parent_close_time`, 
`close_time_self`, `close_time_vote_bins`, `resolution_direction`, `tx_count` | Ledger application with close time details (see Events below) | -| `consensus.validation.send` | RCLConsensus.cpp | `xrpl.ledger.seq`, `proposing`, `ledger_hash`, `full_validation`, `validation_sign_time` | Validation sent after accept (follows-from link) | -| `consensus.mode_change` | RCLConsensus.cpp | `mode_old`, `mode_new` | Consensus mode transition | -| `consensus.proposal.receive` | PeerImp.cpp | `trusted`, `xrpl.consensus.round` | Proposal received from peer (extracts parent context from TraceContext when present; falls back to standalone span for older peers) | -| `consensus.validation.receive` | PeerImp.cpp | `trusted`, `xrpl.ledger.seq` | Validation received from peer (extracts parent context from TraceContext when present; falls back to standalone span for older peers) | +| Span Name | Source File | Attributes | Description | +| ------------------------------ | ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------- | +| `consensus.round` | RCLConsensus.cpp | `consensus_ledger_id`, `ledger_seq`, `consensus_mode`, `trace_strategy`, `consensus_round_id` | Root span for a consensus round (deterministic or random trace ID) | +| `consensus.phase.open` | Consensus.h | -- | Open phase duration (child of round) | +| `consensus.proposal.send` | RCLConsensus.cpp | `consensus_round`, `is_bow_out` | Consensus proposal broadcast | +| `consensus.ledger_close` | RCLConsensus.cpp | `ledger_seq`, `consensus_mode` | Ledger close event | +| `consensus.establish` | Consensus.h | `converge_percent`, `establish_count`, `proposers` | Establish phase duration (child of round) | +| 
`consensus.update_positions` | Consensus.h | `converge_percent`, `proposers`, `disputes_count` | Position update and dispute resolution (see Events below) | +| `consensus.check` | Consensus.h | `agree_count`, `disagree_count`, `converge_percent`, `have_close_time_consensus`, `threshold_percent`, `consensus_result` | Consensus threshold check | +| `consensus.accept` | RCLConsensus.cpp | `proposers`, `round_time_ms`, `quorum`, `disputes_count`, `consensus_state` | Ledger accepted by consensus | +| `consensus.accept.apply` | RCLConsensus.cpp | `ledger_seq`, `close_time`, `close_time_correct`, `close_resolution_ms`, `consensus_state`, `proposing`, `round_time_ms`, `parent_close_time`, `close_time_self`, `close_time_vote_bins`, `resolution_direction`, `tx_count` | Ledger application with close time details (see Events below) | +| `consensus.validation.send` | RCLConsensus.cpp | `ledger_seq`, `proposing`, `ledger_hash`, `full_validation`, `validation_sign_time` | Validation sent after accept (follows-from link) | +| `consensus.mode_change` | RCLConsensus.cpp | `mode_old`, `mode_new` | Consensus mode transition | +| `consensus.proposal.receive` | PeerImp.cpp | `trusted`, `consensus_round` | Proposal received from peer (extracts parent context from TraceContext when present; falls back to standalone span for older peers) | +| `consensus.validation.receive` | PeerImp.cpp | `trusted`, `ledger_seq` | Validation received from peer (extracts parent context from TraceContext when present; falls back to standalone span for older peers) | #### Consensus Span Events -| Parent Span | Event Name | Event Attributes | Description | -| ---------------------------- | ----------------- | ---------------------------------------------------------------- | ------------------------------------------------------- | -| `consensus.update_positions` | `dispute.resolve` | `xrpl.tx.id`, `dispute_our_vote`, `dispute_yays`, `dispute_nays` | Emitted per dispute when votes are tallied | -| 
`consensus.accept.apply` | `tx.included` | `xrpl.tx.id` | Emitted per transaction included in the accepted ledger | +| Parent Span | Event Name | Event Attributes | Description | +| ---------------------------- | ----------------- | ----------------------------------------------------------- | ------------------------------------------------------- | +| `consensus.update_positions` | `dispute.resolve` | `tx_id`, `dispute_our_vote`, `dispute_yays`, `dispute_nays` | Emitted per dispute when votes are tallied | +| `consensus.accept.apply` | `tx.included` | `tx_id` | Emitted per transaction included in the accepted ledger | #### Close Time Queries (Tempo TraceQL) @@ -130,10 +130,10 @@ All spans instrumented in xrpld, grouped by subsystem: {name="consensus.accept.apply"} | duration > 5s # Find specific ledger's consensus details -{name="consensus.accept.apply"} | xrpl.ledger.seq = 92345678 +{name="consensus.accept.apply"} | ledger_seq = 92345678 # Find all spans in a consensus round (deterministic trace strategy) -{name="consensus.round"} | xrpl.consensus.round_id = "" +{name="consensus.round"} | consensus_round_id = <round_id> # Find dispute resolutions {name="consensus.update_positions"} >> {event:name="dispute.resolve"} @@ -141,18 +141,18 @@ All spans instrumented in xrpld, grouped by subsystem: ### Ledger Spans (Phase 6) -| Span Name | Source File | Attributes | Description | -| ----------------- | -------------------- | ------------------------------------------ | ----------------------------- | -| `ledger.build` | BuildLedger.cpp:31 | `xrpl.ledger.seq`, `tx_count`, `tx_failed` | Ledger build during consensus | -| `ledger.validate` | LedgerMaster.cpp:915 | `xrpl.ledger.seq`, `validations` | Ledger promoted to validated | -| `ledger.store` | LedgerMaster.cpp:409 | `xrpl.ledger.seq` | Ledger stored in history | +| Span Name | Source File | Attributes | Description | +| ----------------- | -------------------- | ------------------------------------- | 
----------------------------- | +| `ledger.build` | BuildLedger.cpp:31 | `ledger_seq`, `tx_count`, `tx_failed` | Ledger build during consensus | +| `ledger.validate` | LedgerMaster.cpp:915 | `ledger_seq`, `validations` | Ledger promoted to validated | +| `ledger.store` | LedgerMaster.cpp:409 | `ledger_seq` | Ledger stored in history | ### Peer Spans (Phase 6) -| Span Name | Source File | Attributes | Description | -| ------------------------- | ---------------- | ------------------------------------ | ----------------------------- | -| `peer.proposal.receive` | PeerImp.cpp:1667 | `xrpl.peer.id`, `proposal_trusted` | Proposal received from peer | -| `peer.validation.receive` | PeerImp.cpp:2264 | `xrpl.peer.id`, `validation_trusted` | Validation received from peer | +| Span Name | Source File | Attributes | Description | +| ------------------------- | ---------------- | ------------------------------- | ----------------------------- | +| `peer.proposal.receive` | PeerImp.cpp:1667 | `peer_id`, `proposal_trusted` | Proposal received from peer | +| `peer.validation.receive` | PeerImp.cpp:2264 | `peer_id`, `validation_trusted` | Validation received from peer | --- @@ -210,7 +210,7 @@ This section shows what questions you can answer using the span attributes, with {name="rpc.http_request"} | request_payload_size > 100000 # Find resource-heavy RPC commands (by load_type) -{name=~"rpc.command.*"} | load_type = "exception_rpc" +{name=~"rpc.command.*"} | load_type = "exceptioned RPC" # Find a specific WebSocket command {name="rpc.ws_message"} | command = "subscribe" @@ -356,10 +356,10 @@ all its normal attributes, it just lacks a cross-node parent link. 
{name="consensus.proposal.receive"} && nestedSetParent > 0 # Trace a transaction across the network by its hash -{name=~"tx\\..*"} | xrpl.tx.hash = "" +{name=~"tx\\..*"} | tx_hash = "" # Find all spans in a cross-node consensus trace -{rootServiceName="xrpld"} | xrpl.consensus.round_id = "" +{rootServiceName="xrpld"} | consensus_round_id = <round_id> # Compare latency between sender and receiver for validations {name="consensus.validation.send" || name="consensus.validation.receive"} @@ -389,16 +389,16 @@ Every metric carries these standard labels: | `service_name` | Resource attribute | `xrpld` | | `span_kind` | Span kind | `SPAN_KIND_INTERNAL` | -Additionally, span attributes configured as dimensions in the collector become metric labels (dots → underscores): +Additionally, span attributes configured as dimensions in the collector become metric labels. The collector dimensions use the bare attribute keys emitted by the code, so the label name equals the attribute name: -| Span Attribute | Metric Label | Applies To | -| --------------------- | ------------------------------ | ------------------------------- | -| `command` | `xrpl_rpc_command` | `rpc.command.*` spans | -| `rpc_status` | `xrpl_rpc_status` | `rpc.command.*` spans | -| `xrpl.consensus.mode` | `xrpl_consensus_mode` | `consensus.ledger_close` spans | -| `local` | `xrpl_tx_local` | `tx.process` spans | -| `proposal_trusted` | `xrpl_peer_proposal_trusted` | `peer.proposal.receive` spans | -| `validation_trusted` | `xrpl_peer_validation_trusted` | `peer.validation.receive` spans | +| Span Attribute | Metric Label | Applies To | +| -------------------- | -------------------- | ------------------------------- | +| `command` | `command` | `rpc.command.*` spans | +| `rpc_status` | `rpc_status` | `rpc.command.*` spans | +| `consensus_mode` | `consensus_mode` | `consensus.ledger_close` spans | +| `local` | `local` | `tx.process` spans | +| `proposal_trusted` | `proposal_trusted` | `peer.proposal.receive` spans | +| 
`validation_trusted` | `validation_trusted` | `peer.validation.receive` spans | ### Histogram Buckets @@ -496,44 +496,44 @@ Ten dashboards are pre-provisioned in `docker/telemetry/grafana/dashboards/`: ### RPC Performance (`xrpld-rpc-perf`) -| Panel | Type | PromQL | Labels Used | -| --------------------------- | ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------- | -| RPC Request Rate by Command | timeseries | `sum by (xrpl_rpc_command) (rate(traces_span_metrics_calls_total{span_name=~"rpc.command.*"}[5m]))` | `xrpl_rpc_command` | -| RPC Latency p95 by Command | timeseries | `histogram_quantile(0.95, sum by (le, xrpl_rpc_command) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=~"rpc.command.*"}[5m])))` | `xrpl_rpc_command` | -| RPC Error Rate | bargauge | Error spans / total spans × 100, grouped by `xrpl_rpc_command` | `xrpl_rpc_command`, `status_code` | -| RPC Latency Heatmap | heatmap | `sum(increase(traces_span_metrics_duration_milliseconds_bucket{span_name=~"rpc.command.*"}[5m])) by (le)` | `le` (bucket boundaries) | -| Overall RPC Throughput | timeseries | `rpc.request` + `rpc.process` rate | — | -| RPC Success vs Error | timeseries | by `status_code` (UNSET vs ERROR) | `status_code` | -| Top Commands by Volume | bargauge | `topk(10, ...)` by `xrpl_rpc_command` | `xrpl_rpc_command` | -| WebSocket Message Rate | stat | `rpc.ws_message` rate | — | +| Panel | Type | PromQL | Labels Used | +| --------------------------- | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------- | ------------------------ | +| RPC Request Rate by Command | timeseries | `sum by (command) (rate(traces_span_metrics_calls_total{span_name=~"rpc.command.*"}[5m]))` | `command` | +| RPC Latency p95 by Command | timeseries | 
`histogram_quantile(0.95, sum by (le, command) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=~"rpc.command.*"}[5m])))` | `command` | +| RPC Error Rate | bargauge | Error spans / total spans × 100, grouped by `command` | `command`, `status_code` | +| RPC Latency Heatmap | heatmap | `sum(increase(traces_span_metrics_duration_milliseconds_bucket{span_name=~"rpc.command.*"}[5m])) by (le)` | `le` (bucket boundaries) | +| Overall RPC Throughput | timeseries | `rpc.request` + `rpc.process` rate | — | +| RPC Success vs Error | timeseries | by `status_code` (UNSET vs ERROR) | `status_code` | +| Top Commands by Volume | bargauge | `topk(10, ...)` by `command` | `command` | +| WebSocket Message Rate | stat | `rpc.ws_message` rate | — | ### Transaction Overview (`xrpld-transactions`) -| Panel | Type | PromQL | Labels Used | -| --------------------------------- | ---------- | -------------------------------------------------------------------------------------------- | --------------- | -| Transaction Processing Rate | timeseries | `rate(traces_span_metrics_calls_total{span_name="tx.process"}[5m])` and `tx.receive` | `span_name` | -| Transaction Processing Latency | timeseries | `histogram_quantile(0.95 / 0.50, ... 
{span_name="tx.process"})` | — | -| Transaction Path Distribution | piechart | `sum by (xrpl_tx_local) (rate(traces_span_metrics_calls_total{span_name="tx.process"}[5m]))` | `xrpl_tx_local` | -| Transaction Receive vs Suppressed | timeseries | `rate(traces_span_metrics_calls_total{span_name="tx.receive"}[5m])` | — | -| TX Processing Duration Heatmap | heatmap | `tx.process` histogram buckets | `le` | -| TX Apply Duration per Ledger | timeseries | p95/p50 of `tx.apply` | — | -| Peer TX Receive Rate | timeseries | `tx.receive` rate | — | -| TX Apply Failed Rate | stat | `tx.apply` with `STATUS_CODE_ERROR` | `status_code` | +| Panel | Type | PromQL | Labels Used | +| --------------------------------- | ---------- | ------------------------------------------------------------------------------------ | ------------- | +| Transaction Processing Rate | timeseries | `rate(traces_span_metrics_calls_total{span_name="tx.process"}[5m])` and `tx.receive` | `span_name` | +| Transaction Processing Latency | timeseries | `histogram_quantile(0.95 / 0.50, ... {span_name="tx.process"})` | — | +| Transaction Path Distribution | piechart | `sum by (local) (rate(traces_span_metrics_calls_total{span_name="tx.process"}[5m]))` | `local` | +| Transaction Receive vs Suppressed | timeseries | `rate(traces_span_metrics_calls_total{span_name="tx.receive"}[5m])` | — | +| TX Processing Duration Heatmap | heatmap | `tx.process` histogram buckets | `le` | +| TX Apply Duration per Ledger | timeseries | p95/p50 of `tx.apply` | — | +| Peer TX Receive Rate | timeseries | `tx.receive` rate | — | +| TX Apply Failed Rate | stat | `tx.apply` with `STATUS_CODE_ERROR` | `status_code` | ### Consensus Health (`xrpld-consensus`) -| Panel | Type | PromQL | Labels Used | -| ----------------------------- | ---------- | ---------------------------------------------------------------------------------- | --------------------- | -| Consensus Round Duration | timeseries | `histogram_quantile(0.95 / 0.50, ... 
{span_name="consensus.accept"})` | — | -| Consensus Proposals Sent Rate | timeseries | `rate(traces_span_metrics_calls_total{span_name="consensus.proposal.send"}[5m])` | — | -| Ledger Close Duration | timeseries | `histogram_quantile(0.95, ... {span_name="consensus.ledger_close"})` | — | -| Validation Send Rate | stat | `rate(traces_span_metrics_calls_total{span_name="consensus.validation.send"}[5m])` | — | -| Ledger Apply Duration | timeseries | `histogram_quantile(0.95 / 0.50, ... {span_name="consensus.accept.apply"})` | — | -| Close Time Agreement | timeseries | `rate(traces_span_metrics_calls_total{span_name="consensus.accept.apply"}[5m])` | — | -| Consensus Mode Over Time | timeseries | `consensus.ledger_close` by `xrpl_consensus_mode` | `xrpl_consensus_mode` | -| Accept vs Close Rate | timeseries | `consensus.accept` vs `consensus.ledger_close` rate | — | -| Validation vs Close Rate | timeseries | `consensus.validation.send` vs `consensus.ledger_close` | — | -| Accept Duration Heatmap | heatmap | `consensus.accept` histogram buckets | `le` | +| Panel | Type | PromQL | Labels Used | +| ----------------------------- | ---------- | ---------------------------------------------------------------------------------- | ---------------- | +| Consensus Round Duration | timeseries | `histogram_quantile(0.95 / 0.50, ... {span_name="consensus.accept"})` | — | +| Consensus Proposals Sent Rate | timeseries | `rate(traces_span_metrics_calls_total{span_name="consensus.proposal.send"}[5m])` | — | +| Ledger Close Duration | timeseries | `histogram_quantile(0.95, ... {span_name="consensus.ledger_close"})` | — | +| Validation Send Rate | stat | `rate(traces_span_metrics_calls_total{span_name="consensus.validation.send"}[5m])` | — | +| Ledger Apply Duration | timeseries | `histogram_quantile(0.95 / 0.50, ... 
{span_name="consensus.accept.apply"})` | — | +| Close Time Agreement | timeseries | `rate(traces_span_metrics_calls_total{span_name="consensus.accept.apply"}[5m])` | — | +| Consensus Mode Over Time | timeseries | `consensus.ledger_close` by `consensus_mode` | `consensus_mode` | +| Accept vs Close Rate | timeseries | `consensus.accept` vs `consensus.ledger_close` rate | — | +| Validation vs Close Rate | timeseries | `consensus.validation.send` vs `consensus.ledger_close` | — | +| Accept Duration Heatmap | heatmap | `consensus.accept` histogram buckets | `le` | ### Ledger Operations (`xrpld-ledger-ops`) @@ -552,12 +552,12 @@ Ten dashboards are pre-provisioned in `docker/telemetry/grafana/dashboards/`: Requires `trace_peer=1` in the `[telemetry]` config section. -| Panel | Type | PromQL | Labels Used | -| -------------------------------- | ---------- | --------------------------------- | ------------------------------ | -| Proposal Receive Rate | timeseries | `peer.proposal.receive` rate | — | -| Validation Receive Rate | timeseries | `peer.validation.receive` rate | — | -| Proposals Trusted vs Untrusted | piechart | by `xrpl_peer_proposal_trusted` | `xrpl_peer_proposal_trusted` | -| Validations Trusted vs Untrusted | piechart | by `xrpl_peer_validation_trusted` | `xrpl_peer_validation_trusted` | +| Panel | Type | PromQL | Labels Used | +| -------------------------------- | ---------- | ------------------------------ | -------------------- | +| Proposal Receive Rate | timeseries | `peer.proposal.receive` rate | — | +| Validation Receive Rate | timeseries | `peer.validation.receive` rate | — | +| Proposals Trusted vs Untrusted | piechart | by `proposal_trusted` | `proposal_trusted` | +| Validations Trusted vs Untrusted | piechart | by `validation_trusted` | `validation_trusted` | ### Node Health -- System Metrics (`xrpld-system-node-health`) diff --git a/src/xrpld/app/ledger/InboundLedger.h b/src/xrpld/app/ledger/InboundLedger.h index d155c5902c..d59091d6e0 100644 --- 
a/src/xrpld/app/ledger/InboundLedger.h +++ b/src/xrpld/app/ledger/InboundLedger.h @@ -6,8 +6,10 @@ #include #include +#include #include +#include #include #include @@ -170,6 +172,12 @@ private: receivedData_; bool receiveDispatched_{false}; std::unique_ptr peerSet_; + + /// Spans the acquire lifecycle: started in init(), finalized in done() + /// with the outcome (complete/failed), timeout count, and peer count. + /// Gives operators visibility into back-fill / fork-recovery cost, which + /// previously emitted no span or metric. + std::optional acquireSpan_; }; } // namespace xrpl diff --git a/src/xrpld/app/ledger/LedgerHistory.cpp b/src/xrpld/app/ledger/LedgerHistory.cpp index 8520fc941f..77c542fb16 100644 --- a/src/xrpld/app/ledger/LedgerHistory.cpp +++ b/src/xrpld/app/ledger/LedgerHistory.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -323,11 +324,19 @@ LedgerHistory::handleMismatch( auto builtLedger = getLedgerByHash(built); auto validLedger = getLedgerByHash(valid); + // Records the classified mismatch reason as a labeled OTel counter so + // fork diagnosis is a queryable time series, not just a log grep. 
+ auto recordReason = [this](std::string_view reason) { + if (auto* mr = app_.getMetricsRegistry()) + mr->incrementLedgerHistoryMismatch(reason); + }; + if (!builtLedger || !validLedger) { JLOG(j_.error()) << "MISMATCH cannot be analyzed:" << " builtLedger: " << to_string(built) << " -> " << builtLedger << " validLedger: " << to_string(valid) << " -> " << validLedger; + recordReason("unknown"); return; } @@ -349,6 +358,7 @@ LedgerHistory::handleMismatch( if (builtLedger->header().parentHash != validLedger->header().parentHash) { JLOG(j_.error()) << "MISMATCH on prior ledger"; + recordReason("prior_ledger"); return; } @@ -356,6 +366,7 @@ LedgerHistory::handleMismatch( if (builtLedger->header().closeTime != validLedger->header().closeTime) { JLOG(j_.error()) << "MISMATCH on close time"; + recordReason("close_time"); return; } @@ -366,6 +377,7 @@ LedgerHistory::handleMismatch( JLOG(j_.error()) << "MISMATCH on consensus transaction set " << " built: " << to_string(*builtConsensusHash) << " validated: " << to_string(*validatedConsensusHash); + recordReason("consensus_txset"); } else JLOG(j_.error()) << "MISMATCH with same consensus transaction set: " @@ -379,10 +391,14 @@ LedgerHistory::handleMismatch( if (builtTx == validTx) { JLOG(j_.error()) << "MISMATCH with same " << builtTx.size() << " transactions"; + recordReason("same_txset_diff_result"); } else + { JLOG(j_.error()) << "MISMATCH with " << builtTx.size() << " built and " << validTx.size() << " valid transactions."; + recordReason("different_txset"); + } JLOG(j_.error()) << "built\n" << getJson({*builtLedger, {}}); JLOG(j_.error()) << "valid\n" << getJson({*validLedger, {}}); diff --git a/src/xrpld/app/ledger/detail/InboundLedger.cpp b/src/xrpld/app/ledger/detail/InboundLedger.cpp index 9ba7bdf22e..423e586069 100644 --- a/src/xrpld/app/ledger/detail/InboundLedger.cpp +++ b/src/xrpld/app/ledger/detail/InboundLedger.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -30,6 +31,8 
@@
 #include
 #include
 #include
+#include
+#include
 #include
@@ -46,6 +49,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -95,6 +99,23 @@ InboundLedger::init(ScopedLockType& collectionLock)
     ScopedLockType sl(mtx_);
     collectionLock.unlock();
 
+    // Span the acquire lifecycle so back-fill / fork-recovery cost is observable.
+    // Finalized in done() with the outcome, timeout count, and peer count.
+    {
+        using namespace telemetry;
+        acquireSpan_.emplace(
+            SpanGuard::span(TraceCategory::Ledger, seg::ledger, ledger_span::op::acquire));
+        if (*acquireSpan_)
+        {
+            acquireSpan_->setAttribute(ledger_span::attr::ledgerSeq, static_cast(seq_));
+            std::string_view const reasonVal = reason_ == Reason::HISTORY
+                ? std::string_view(ledger_span::val::history)
+                : reason_ == Reason::CONSENSUS ? std::string_view(ledger_span::val::consensus)
+                                               : std::string_view(ledger_span::val::generic);
+            acquireSpan_->setAttribute(ledger_span::attr::acquireReason, reasonVal);
+        }
+    }
+
     tryDB(app_.getNodeFamily().db());
     if (failed_)
         return;
@@ -416,6 +437,21 @@ InboundLedger::done()
     signaled_ = true;
     touch();
 
+    // Finalize the acquire span with the outcome, timeout count, and peer
+    // count, then end it (reset) so its duration is exported.
+    if (acquireSpan_ && *acquireSpan_)
+    {
+        using namespace telemetry;
+        acquireSpan_->setAttribute(
+            ledger_span::attr::outcome,
+            failed_ ? std::string_view(ledger_span::val::failed)
+                    : std::string_view(ledger_span::val::complete));
+        acquireSpan_->setAttribute(ledger_span::attr::timeouts, static_cast(timeouts_));
+        acquireSpan_->setAttribute(
+            ledger_span::attr::peerCount, static_cast(getPeerCount()));
+    }
+    acquireSpan_.reset();
+
     JLOG(journal_.debug()) << "Acquire " << hash_ << (failed_ ? " fail " : " ")
                            << ((timeouts_ == 0) ? std::string()
diff --git a/src/xrpld/app/ledger/detail/LedgerSpanNames.h b/src/xrpld/app/ledger/detail/LedgerSpanNames.h
index a359e5d2c7..6dc057915f 100644
--- a/src/xrpld/app/ledger/detail/LedgerSpanNames.h
+++ b/src/xrpld/app/ledger/detail/LedgerSpanNames.h
@@ -10,6 +10,7 @@
  * ledger.build (BuildLedger — ledger construction)
  * ledger.store (LedgerMaster — ledger storage)
  * ledger.validate (LedgerMaster — ledger validation acceptance)
+ * ledger.acquire (InboundLedger — fetch a missing ledger from peers)
  * tx.apply (BuildLedger — transaction application)
  */
@@ -24,6 +25,7 @@ inline constexpr auto build = makeStr("build");
 inline constexpr auto store = makeStr("store");
 inline constexpr auto validate = makeStr("validate");
 inline constexpr auto apply = makeStr("apply");
+inline constexpr auto acquire = makeStr("acquire");
 } // namespace op
 
 // ===== Attribute keys ========================================================
@@ -40,6 +42,24 @@ using ::xrpl::telemetry::attr::ledgerSeq;
 inline constexpr auto txCount = makeStr("tx_count");
 inline constexpr auto txFailed = makeStr("tx_failed");
 inline constexpr auto validations = makeStr("validations");
+
+/// ledger.acquire attrs: acquire_reason is set at span start, the rest at finalization.
+inline constexpr auto acquireReason = makeStr("acquire_reason");
+inline constexpr auto timeouts = makeStr("timeouts");
+inline constexpr auto peerCount = makeStr("peer_count");
+inline constexpr auto outcome = makeStr("outcome");
 } // namespace attr
 
+// ===== Attribute values ======================================================
+
+namespace val {
+/// ledger.acquire outcome values.
+inline constexpr auto complete = makeStr("complete");
+inline constexpr auto failed = makeStr("failed");
+/// ledger.acquire reason values (mirror InboundLedger::Reason).
+inline constexpr auto history = makeStr("history");
+inline constexpr auto consensus = makeStr("consensus");
+inline constexpr auto generic = makeStr("generic");
+} // namespace val
+
 } // namespace xrpl::telemetry::ledger_span
diff --git a/src/xrpld/app/misc/detail/TxQ.cpp b/src/xrpld/app/misc/detail/TxQ.cpp
index 352bef6bd9..ad14d6ae5f 100644
--- a/src/xrpld/app/misc/detail/TxQ.cpp
+++ b/src/xrpld/app/misc/detail/TxQ.cpp
@@ -3,6 +3,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -1254,6 +1255,8 @@ TxQ::apply(
                     JLOG(j_.info()) << "Queue is full, and transaction " << transactionID
                                     << " would kick a transaction from the same account ("
                                     << account << ") out of the queue.";
+                    if (auto* const metrics = app.getMetricsRegistry(); metrics != nullptr)
+                        metrics->incrementTxqDropped("queue_full");
                     return {telCAN_NOT_QUEUE_FULL, false};
                 }
                 auto const& endAccount = byAccount_.at(lastRIter->account);
@@ -1297,6 +1300,8 @@ TxQ::apply(
                 {
                     JLOG(j_.info()) << "Queue is full, and transaction " << transactionID
                                     << " fee is lower than end item's account average fee";
+                    if (auto* const metrics = app.getMetricsRegistry(); metrics != nullptr)
+                        metrics->incrementTxqDropped("queue_full");
                     return {telCAN_NOT_QUEUE_FULL, false};
                 }
             }
@@ -1366,12 +1371,17 @@ TxQ::processClosedLedger(Application& app, ReadView const& view, bool timeLeap)
     maxSize_ = std::max(snapshot.txnsExpected * setup_.ledgersInQueue, setup_.queueSizeMin);
 
     // Remove any queued candidates whose LastLedgerSequence has gone by.
+    auto* const metrics = app.getMetricsRegistry();
     for (auto candidateIter = byFee_.begin(); candidateIter != byFee_.end();)
     {
         if (candidateIter->lastValid && *candidateIter->lastValid <= ledgerSeq)
         {
             byAccount_.at(candidateIter->account).dropPenalty = true;
             candidateIter = erase(candidateIter);
+            // Count each expired transaction: it was still queued when its
+            // LastLedgerSequence passed (typically an under-bid escalating fee).
+            if (metrics != nullptr)
+                metrics->incrementTxqExpired();
         }
         else
         {
diff --git a/src/xrpld/telemetry/MetricsRegistry.cpp b/src/xrpld/telemetry/MetricsRegistry.cpp
index 79e76ea737..8ca0c15889 100644
--- a/src/xrpld/telemetry/MetricsRegistry.cpp
+++ b/src/xrpld/telemetry/MetricsRegistry.cpp
@@ -237,6 +237,13 @@ MetricsRegistry::start(std::string const& endpoint, std::string const& instanceI
         meter_->CreateUInt64Counter("xrpld_state_changes_total", "Total operating mode changes");
     jqTransOverflowCounter_ = meter_->CreateUInt64Counter(
         "xrpld_jq_trans_overflow_total", "Total job queue transaction overflows");
+    ledgerHistoryMismatchCounter_ = meter_->CreateUInt64Counter(
+        "xrpld_ledger_history_mismatch_total",
+        "Total built-vs-validated ledger mismatches by reason");
+    txqExpiredCounter_ = meter_->CreateUInt64Counter(
+        "xrpld_txq_expired_total", "Total transactions expired out of the transaction queue");
+    txqDroppedCounter_ = meter_->CreateUInt64Counter(
+        "xrpld_txq_dropped_total", "Total transactions refused admission to the queue by reason");
     validationAgreementsCounter_ = meter_->CreateUInt64Counter(
         "xrpld_validation_agreements_total", "Total validation agreements");
     validationMissedCounter_ =
@@ -429,6 +436,7 @@ MetricsRegistry::registerAsyncGauges()
     registerDbMetricsGauge();
     registerValidatorHealthGauge();
     registerPeerQualityGauge();
+    registerReduceRelayGauge();
     registerLedgerEconomyGauge();
     registerStateTrackingGauge();
     registerStorageDetailGauge();
@@ -1000,10 +1008,12 @@ MetricsRegistry::registerPeerQualityGauge()
                         ->Observe(value, {{"metric", name}});
                 };
 
-                // Collect latencies and version info from each peer's JSON.
+                // Collect latencies, version info, and tracking state from
+                // each peer's JSON.
std::vector latencies;
                 int higherVersionCount = 0;
                 int totalPeers = 0;
+                int divergedCount = 0;
                 auto const ownVersion = std::string(BuildInfo::getVersionString());
 
                 app.getOverlay().foreach([&](std::shared_ptr const& peer) {
@@ -1019,6 +1029,11 @@ MetricsRegistry::registerPeerQualityGauge()
                         if (!pv.empty() && pv > ownVersion)
                             ++higherVersionCount;
                     }
+                    // PeerImp::json() sets "track" to "diverged" when the peer's
+                    // tracking state is Tracking::Diverged (i.e. it is following
+                    // a different ledger chain than us).
+                    if (pj.isMember(jss::track) && pj[jss::track].asString() == "diverged")
+                        ++divergedCount;
                 });
 
                 // P90 latency across connected peers.
@@ -1041,13 +1056,11 @@ MetricsRegistry::registerPeerQualityGauge()
                     : 0.0;
                 observe("peers_higher_version_pct", higherPct);
 
-                // Count peers that are insane/diverged (tracking ==
-                // Tracking::diverged). Not directly available from the Peer
-                // interface, so we count peers with negative or zero latency
-                // as a proxy for unreachable/diverged state.
-                // TODO: expose PeerImp::tracking_ via the Peer interface for
-                // a precise count.
-                observe("peers_insane_count", 0.0);
+                // Count peers diverged from our ledger chain, read from the
+                // peer's "track" JSON field (set by PeerImp::json()). Diverged
+                // peers are following a different chain and are a leading
+                // indicator of local sync trouble.
+                observe("peers_insane_count", static_cast(divergedCount));
 
                 // Binary flag: recommend upgrade if >60% run a newer version.
                 observe("upgrade_recommended", higherPct > 60.0 ? 1.0 : 0.0);
@@ -1060,6 +1073,57 @@ MetricsRegistry::registerPeerQualityGauge()
         this);
 }
 
+void
+MetricsRegistry::registerReduceRelayGauge()
+{
+    // Transaction reduce-relay efficiency. Overlay::txMetrics() exposes the
+    // rolling averages as a JSON object with string values (std::to_string),
+    // so parse each field. A high suppressed:selected ratio proves the
+    // feature is saving bandwidth; a high not_enabled count means stale peers
+    // force full relay.
+    reduceRelayGauge_ = meter_->CreateInt64ObservableGauge(
+        "xrpld_reduce_relay_metrics", "Transaction reduce-relay efficiency metrics");
+    reduceRelayGauge_->AddCallback(
+        [](opentelemetry::metrics::ObserverResult result, void* state) {
+            auto* self = static_cast(state);
+            if (self->callbacksDetached_.load(std::memory_order_acquire))
+                return;
+            auto& app = self->app_;
+
+            try
+            {
+                auto const tm = app.getOverlay().txMetrics();
+
+                auto observe = [&](char const* name, int64_t value) {
+                    opentelemetry::nostd::get>>(result)
+                        ->Observe(value, {{"metric", name}});
+                };
+
+                // Decimal-string fields, emitted when present; a std::stoll failure skips the rest.
+                auto observeField = [&](auto const& field, char const* name) {
+                    if (tm.isMember(field))
+                    {
+                        auto const s = tm[field].asString();
+                        if (!s.empty())
+                            observe(name, static_cast(std::stoll(s)));
+                    }
+                };
+
+                observeField(jss::txr_selected_cnt, "selected_peers");
+                observeField(jss::txr_suppressed_cnt, "suppressed_peers");
+                observeField(jss::txr_not_enabled_cnt, "not_enabled_peers");
+                observeField(jss::txr_missing_tx_freq, "missing_tx_freq");
+            }
+            catch (...) // NOLINT(bugprone-empty-catch)
+            {
+                // Silently skip if services are not yet ready or a value is
+                // not parseable.
+            }
+        },
+        this);
+}
+
 void
 MetricsRegistry::registerLedgerEconomyGauge()
 {
@@ -1321,4 +1385,31 @@ MetricsRegistry::incrementJqTransOverflow()
 #endif
 }
 
+void
+MetricsRegistry::incrementLedgerHistoryMismatch(std::string_view reason)
+{
+#ifdef XRPL_ENABLE_TELEMETRY
+    if (enabled_ && ledgerHistoryMismatchCounter_)
+        ledgerHistoryMismatchCounter_->Add(1, {{"reason", std::string(reason)}});
+#endif
+}
+
+void
+MetricsRegistry::incrementTxqExpired()
+{
+#ifdef XRPL_ENABLE_TELEMETRY
+    if (enabled_ && txqExpiredCounter_)
+        txqExpiredCounter_->Add(1);
+#endif
+}
+
+void
+MetricsRegistry::incrementTxqDropped(std::string_view reason)
+{
+#ifdef XRPL_ENABLE_TELEMETRY
+    if (enabled_ && txqDroppedCounter_)
+        txqDroppedCounter_->Add(1, {{"reason", std::string(reason)}});
+#endif
+}
+
 } // namespace xrpl::telemetry
diff --git a/src/xrpld/telemetry/MetricsRegistry.h b/src/xrpld/telemetry/MetricsRegistry.h
index 1d84932022..63a240ef75 100644
--- a/src/xrpld/telemetry/MetricsRegistry.h
+++ b/src/xrpld/telemetry/MetricsRegistry.h
@@ -37,6 +37,9 @@
     |     +-- xrpld_validations_checked_total
     |     +-- xrpld_state_changes_total
     |     +-- xrpld_jq_trans_overflow_total
+    |     +-- xrpld_ledger_history_mismatch_total{reason}
+    |     +-- xrpld_txq_expired_total
+    |     +-- xrpld_txq_dropped_total{reason}
     |
     +-- ValidationTracker (validation agreement tracker)
     |
@@ -53,6 +56,7 @@
           +-- DB metrics (storage KB, fetch rate)
           +-- Validator health (amend blocked, UNL, quorum)
           +-- Peer quality (P90 latency, version spread)
+          +-- Reduce-relay efficiency (selected/suppressed peers)
           +-- Ledger economy (fees, reserves, age)
           +-- State tracking (mode value, time in state)
           +-- Storage detail (NuDB sizes)
@@ -349,6 +353,34 @@ public:
     void
     incrementJqTransOverflow();
 
+    /** Increment the ledger_history_mismatch_total counter for a reason.
+        Called from LedgerHistory::handleMismatch() once the mismatch has
+        been classified. The reason label turns fork diagnosis from a
+        log-grep into a queryable time series.
+        @param reason Classified mismatch cause (e.g. "prior_ledger",
+               "close_time", "consensus_txset", "same_txset_diff_result",
+               "different_txset", "unknown").
+    */
+    void
+    incrementLedgerHistoryMismatch(std::string_view reason);
+
+    /** Increment the txq_expired_total counter.
+        Called from TxQ::processClosedLedger() for each queued transaction
+        removed because its LastLedgerSequence has passed — submitters who
+        under-bid the escalating fee and were never included.
+    */
+    void
+    incrementTxqExpired();
+
+    /** Increment the txq_dropped_total{reason} counter.
+        Called from TxQ::apply() when a transaction is refused admission to
+        the queue (e.g. the queue is full). Distinct from expiry (already
+        queued) and from jq_trans_overflow (job queue, not TxQ).
+        @param reason Admission-control rejection cause (e.g. "queue_full").
+    */
+    void
+    incrementTxqDropped(std::string_view reason);
+
     /** Access the validation agreement tracker.
         Used by consensus and ledger hooks to record our validations and
         network validations so the tracker can compute agreement percentages.
@@ -450,6 +482,10 @@ private:
     /// insane peer count, version spread, upgrade recommendation).
     opentelemetry::nostd::shared_ptr
         peerQualityGauge_;
+    /// Observable gauge for transaction reduce-relay efficiency (selected vs
+    /// suppressed peers, feature-disabled peers, missing-tx frequency).
+    opentelemetry::nostd::shared_ptr
+        reduceRelayGauge_;
     /// Observable gauge for ledger economy metrics (base fee, reserve,
     /// reserve increment, ledger age).
     opentelemetry::nostd::shared_ptr
@@ -483,6 +519,16 @@ private:
     /// Counter: xrpld_jq_trans_overflow_total — incremented on job queue transaction overflows.
     opentelemetry::nostd::unique_ptr>
         jqTransOverflowCounter_;
+    /// Counter: xrpld_ledger_history_mismatch_total{reason} — incremented per classified
+    /// built-vs-validated ledger mismatch.
+    opentelemetry::nostd::unique_ptr>
+        ledgerHistoryMismatchCounter_;
+    /// Counter: xrpld_txq_expired_total — incremented per transaction expired out of the
+    /// transaction queue.
+    opentelemetry::nostd::unique_ptr> txqExpiredCounter_;
+    /// Counter: xrpld_txq_dropped_total{reason} — incremented when a transaction is refused
+    /// admission to the queue.
+    opentelemetry::nostd::unique_ptr> txqDroppedCounter_;
     /// Counter: xrpld_validation_agreements_total — incremented by ValidationTracker on
     /// agreement.
     opentelemetry::nostd::unique_ptr>
@@ -525,6 +571,8 @@ private:
     void
     registerPeerQualityGauge(); // Task 7.10
     void
+    registerReduceRelayGauge(); // Reduce-relay efficiency
+    void
     registerLedgerEconomyGauge(); // Task 7.11
     void
     registerStateTrackingGauge(); // Task 7.12