From 9e27120a15ff1703fafe82429ce147e4613be863 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Wed, 13 May 2026 16:16:30 +0100 Subject: [PATCH] refactor(telemetry): simplify ledger/peer attr naming on phase-6, update dashboards - Add canonical ledgerHash (xrpl.ledger.hash) to SpanNames.h. - LedgerSpanNames: reuse shared canonicals (ledgerSeq, closeTime, closeTimeCorrect, closeResolutionMs, ledgerHash); bare names for tx_count, tx_failed, validations. - PeerSpanNames: reuse shared canonicals (peerId, ledgerHash); bare names for proposal_trusted, validation_full, validation_trusted. - Update call sites in BuildLedger.cpp, LedgerMaster.cpp, PeerImp.cpp. - Update 5 Grafana dashboards: strip the xrpl.{domain}. prefix (e.g. xrpl.consensus., xrpl.peer., xrpl.rpc., xrpl.tx., and the bare xrpl.ledger. attrs) from per-span attr refs in PromQL/TraceQL queries. Keep rule-5 entries. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../grafana/dashboards/consensus-health.json | 32 ++++++++--------- .../grafana/dashboards/ledger-operations.json | 4 +-- .../grafana/dashboards/peer-network.json | 20 +++++------ .../grafana/dashboards/rpc-performance.json | 34 +++++++++---------- .../dashboards/transaction-overview.json | 10 +++--- include/xrpl/telemetry/SpanNames.h | 1 + src/xrpld/app/ledger/detail/BuildLedger.cpp | 2 +- src/xrpld/app/ledger/detail/LedgerMaster.cpp | 4 +-- src/xrpld/app/ledger/detail/LedgerSpanNames.h | 25 ++++++-------- src/xrpld/overlay/detail/PeerImp.cpp | 7 ++-- src/xrpld/overlay/detail/PeerSpanNames.h | 22 ++++-------- 11 files changed, 74 insertions(+), 87 deletions(-) diff --git a/docker/telemetry/grafana/dashboards/consensus-health.json b/docker/telemetry/grafana/dashboards/consensus-health.json index 0ba69c139e..d40f42cc58 100644 --- a/docker/telemetry/grafana/dashboards/consensus-health.json +++ b/docker/telemetry/grafana/dashboards/consensus-health.json @@ -10,7 +10,7 @@ "panels": [ { "title": "Consensus Round Duration", - "description": "p95 and p50 duration of consensus accept rounds. 
The consensus.accept span (RCLConsensus.cpp:395) measures the time to process an accepted ledger including transaction application and state finalization. The span carries xrpl.consensus.proposers and xrpl.consensus.round_time_ms attributes. Normal range is 3-6 seconds on mainnet.", + "description": "p95 and p50 duration of consensus accept rounds. The consensus.accept span (RCLConsensus.cpp:395) measures the time to process an accepted ledger including transaction application and state finalization. The span carries proposers and round_time_ms attributes. Normal range is 3-6 seconds on mainnet.", "type": "timeseries", "gridPos": { "h": 8, @@ -95,7 +95,7 @@ }, { "title": "Ledger Close Duration", - "description": "p95 duration of the ledger close event. The consensus.ledger_close span (RCLConsensus.cpp:282) measures the time from when consensus triggers a ledger close to completion. Carries xrpl.consensus.ledger.seq and xrpl.consensus.mode attributes. Compare with Consensus Round Duration to understand how close timing relates to overall round time.", + "description": "p95 duration of the ledger close event. The consensus.ledger_close span (RCLConsensus.cpp:282) measures the time from when consensus triggers a ledger close to completion. Carries xrpl.ledger.seq and xrpl.consensus.mode attributes. Compare with Consensus Round Duration to understand how close timing relates to overall round time.", "type": "timeseries", "gridPos": { "h": 8, @@ -134,7 +134,7 @@ }, { "title": "Validation Send Rate", - "description": "Rate at which this node sends ledger validations to the network. Sourced from the consensus.validation.send span (RCLConsensus.cpp:753). Each validation confirms the node has fully validated a ledger. The span carries xrpl.consensus.ledger.seq and xrpl.consensus.proposing. Should closely track the ledger close rate when the node is healthy.", + "description": "Rate at which this node sends ledger validations to the network. 
Sourced from the consensus.validation.send span (RCLConsensus.cpp:753). Each validation confirms the node has fully validated a ledger. The span carries xrpl.ledger.seq and proposing. Should closely track the ledger close rate when the node is healthy.", "type": "stat", "gridPos": { "h": 8, @@ -206,7 +206,7 @@ }, { "title": "Close Time Agreement", - "description": "Rate of close time agreement vs disagreement across consensus rounds. Based on xrpl.consensus.close_time_correct attribute (true = validators agreed, false = agreed to disagree per avCT_CONSENSUS_PCT).", + "description": "Rate of close time agreement vs disagreement across consensus rounds. Based on close_time_correct attribute (true = validators agreed, false = agreed to disagree per avCT_CONSENSUS_PCT).", "type": "timeseries", "gridPos": { "h": 8, @@ -219,8 +219,8 @@ "datasource": { "type": "prometheus" }, - "expr": "sum by (xrpl_consensus_close_time_correct, exported_instance) (rate(traces_span_metrics_calls_total{span_name=\"consensus.accept.apply\", xrpl_consensus_mode=~\"$consensus_mode\", exported_instance=~\"$node\"}[$__rate_interval]))", - "legendFormat": "Close Time Correct={{xrpl_consensus_close_time_correct}} [{{exported_instance}}]" + "expr": "sum by (close_time_correct, exported_instance) (rate(traces_span_metrics_calls_total{span_name=\"consensus.accept.apply\", xrpl_consensus_mode=~\"$consensus_mode\", exported_instance=~\"$node\"}[$__rate_interval]))", + "legendFormat": "Close Time Correct={{close_time_correct}} [{{exported_instance}}]" } ], "fieldConfig": { @@ -400,7 +400,7 @@ }, { "title": "Close Time: Raw Proposals (Per Node)", - "description": "Each node's raw proposed close time (xrpl.consensus.close_time_self) \u2014 the unrounded wall clock value at the moment the node closed its ledger. Compare across nodes to see clock drift.", + "description": "Each node's raw proposed close time (close_time_self) \u2014 the unrounded wall clock value at the moment the node closed its ledger. 
Compare across nodes to see clock drift.", "type": "timeseries", "gridPos": { "h": 8, @@ -436,14 +436,14 @@ "type": "tempo" }, "queryType": "traceql", - "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.xrpl.consensus.close_time_correct=~\"$close_time_correct\"} | select(span.xrpl.consensus.close_time_self)", + "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\"} | select(span.close_time_self)", "refId": "A" } ] }, { "title": "Close Time: Effective / Quantized", - "description": "The consensus-agreed close time after rounding to the current resolution bin (xrpl.consensus.close_time). This is the value written to the ledger header. All nodes in agreement produce the same value.", + "description": "The consensus-agreed close time after rounding to the current resolution bin (close_time). This is the value written to the ledger header. All nodes in agreement produce the same value.", "type": "timeseries", "gridPos": { "h": 8, @@ -479,14 +479,14 @@ "type": "tempo" }, "queryType": "traceql", - "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.xrpl.consensus.close_time_correct=~\"$close_time_correct\"} | select(span.xrpl.consensus.close_time)", + "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\"} | select(span.close_time)", "refId": "A" } ] }, { "title": "Close Time Vote Bins & Resolution", - "description": "Number of distinct close time vote bins (xrpl.consensus.close_time_vote_bins) and the bin size / resolution in ms (xrpl.consensus.close_resolution_ms). More bins = more clock disagreement. 
Resolution adapts: finer (10s) when validators agree, coarser (120s) when they disagree.", + "description": "Number of distinct close time vote bins (close_time_vote_bins) and the bin size / resolution in ms (close_resolution_ms). More bins = more clock disagreement. Resolution adapts: finer (10s) when validators agree, coarser (120s) when they disagree.", "type": "timeseries", "gridPos": { "h": 8, @@ -555,7 +555,7 @@ "type": "tempo" }, "queryType": "traceql", - "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.xrpl.consensus.close_time_correct=~\"$close_time_correct\"} | select(span.xrpl.consensus.close_time_vote_bins)", + "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\"} | select(span.close_time_vote_bins)", "refId": "A" }, { @@ -563,14 +563,14 @@ "type": "tempo" }, "queryType": "traceql", - "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.xrpl.consensus.close_time_correct=~\"$close_time_correct\"} | select(span.xrpl.consensus.close_resolution_ms)", + "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\"} | select(span.close_resolution_ms)", "refId": "B" } ] }, { "title": "Close Time Resolution Direction", - "description": "Whether close time resolution increased (coarser bins, more disagreement), decreased (finer bins, better agreement), or stayed unchanged relative to the previous ledger. Based on xrpl.consensus.resolution_direction attribute.", + "description": "Whether close time resolution increased (coarser bins, more disagreement), decreased (finer bins, better agreement), or stayed unchanged relative to the previous ledger. 
Based on resolution_direction attribute.", "type": "timeseries", "gridPos": { "h": 8, @@ -606,7 +606,7 @@ "type": "tempo" }, "queryType": "traceql", - "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.xrpl.consensus.close_time_correct=~\"$close_time_correct\" && span.xrpl.consensus.resolution_direction=~\"$resolution_direction\"} | select(span.xrpl.consensus.resolution_direction)", + "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\" && span.resolution_direction=~\"$resolution_direction\"} | select(span.resolution_direction)", "refId": "A" } ] @@ -650,7 +650,7 @@ "type": "tempo" }, "queryType": "traceql", - "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.xrpl.consensus.close_time_correct=~\"$close_time_correct\"} | select(span.xrpl.consensus.close_time, span.xrpl.consensus.close_time_vote_bins)", + "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\"} | select(span.close_time, span.close_time_vote_bins)", "refId": "A" } ] diff --git a/docker/telemetry/grafana/dashboards/ledger-operations.json b/docker/telemetry/grafana/dashboards/ledger-operations.json index 67711e4fa8..c9c8c5efc3 100644 --- a/docker/telemetry/grafana/dashboards/ledger-operations.json +++ b/docker/telemetry/grafana/dashboards/ledger-operations.json @@ -88,7 +88,7 @@ }, { "title": "Ledger Validation Rate", - "description": "Rate at which ledgers pass the validation threshold and are accepted as fully validated. The ledger.validate span (LedgerMaster.cpp:915) fires in checkAccept() only after the ledger receives sufficient trusted validations (>= quorum). 
Records xrpl.ledger.seq and xrpl.ledger.validations (the number of validations received).", + "description": "Rate at which ledgers pass the validation threshold and are accepted as fully validated. The ledger.validate span (LedgerMaster.cpp:915) fires in checkAccept() only after the ledger receives sufficient trusted validations (>= quorum). Records xrpl.ledger.seq and validations (the number of validations received).", "type": "stat", "gridPos": { "h": 8, @@ -156,7 +156,7 @@ }, { "title": "Transaction Apply Duration", - "description": "p95 and p50 duration of applying the consensus transaction set during ledger building. The tx.apply span (BuildLedger.cpp:88) wraps applyTransactions() which iterates through the CanonicalTXSet with multiple retry passes. Records xrpl.ledger.tx_count (successful) and xrpl.ledger.tx_failed (failed) as attributes.", + "description": "p95 and p50 duration of applying the consensus transaction set during ledger building. The tx.apply span (BuildLedger.cpp:88) wraps applyTransactions() which iterates through the CanonicalTXSet with multiple retry passes. Records tx_count (successful) and tx_failed (failed) as attributes.", "type": "timeseries", "gridPos": { "h": 8, diff --git a/docker/telemetry/grafana/dashboards/peer-network.json b/docker/telemetry/grafana/dashboards/peer-network.json index 9740b04366..0fd6e6048f 100644 --- a/docker/telemetry/grafana/dashboards/peer-network.json +++ b/docker/telemetry/grafana/dashboards/peer-network.json @@ -11,7 +11,7 @@ "panels": [ { "title": "Peer Proposal Receive Rate", - "description": "Rate of consensus proposals received from network peers. The peer.proposal.receive span (PeerImp.cpp:1667) fires in onMessage(TMProposeSet) for each incoming proposal. Records xrpl.peer.id (sending peer) and xrpl.peer.proposal.trusted (whether the proposer is in our UNL). Requires trace_peer=1 in the telemetry config.", + "description": "Rate of consensus proposals received from network peers. 
The peer.proposal.receive span (PeerImp.cpp:1667) fires in onMessage(TMProposeSet) for each incoming proposal. Records xrpl.peer.id (sending peer) and proposal_trusted (whether the proposer is in our UNL). Requires trace_peer=1 in the telemetry config.", "type": "timeseries", "gridPos": { "h": 8, @@ -50,7 +50,7 @@ }, { "title": "Peer Validation Receive Rate", - "description": "Rate of ledger validations received from network peers. The peer.validation.receive span (PeerImp.cpp:2264) fires in onMessage(TMValidation) for each incoming validation message. Records xrpl.peer.id (sending peer) and xrpl.peer.validation.trusted (whether the validator is trusted). Requires trace_peer=1 in the telemetry config.", + "description": "Rate of ledger validations received from network peers. The peer.validation.receive span (PeerImp.cpp:2264) fires in onMessage(TMValidation) for each incoming validation message. Records xrpl.peer.id (sending peer) and validation_trusted (whether the validator is trusted). Requires trace_peer=1 in the telemetry config.", "type": "timeseries", "gridPos": { "h": 8, @@ -89,7 +89,7 @@ }, { "title": "Proposals Trusted vs Untrusted", - "description": "Pie chart showing the ratio of proposals received from trusted validators (in our UNL) vs untrusted validators. Grouped by the xrpl.peer.proposal.trusted span attribute (true/false). A healthy node connected to a well-configured UNL should see a significant portion of trusted proposals. Note: proposals that fail early validation may not have the trusted attribute set.", + "description": "Pie chart showing the ratio of proposals received from trusted validators (in our UNL) vs untrusted validators. Grouped by the proposal_trusted span attribute (true/false). A healthy node connected to a well-configured UNL should see a significant portion of trusted proposals. 
Note: proposals that fail early validation may not have the trusted attribute set.", "type": "piechart", "gridPos": { "h": 8, @@ -108,8 +108,8 @@ "datasource": { "type": "prometheus" }, - "expr": "sum by (xrpl_peer_proposal_trusted, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_peer_proposal_trusted=~\"$proposal_trusted\", span_name=\"peer.proposal.receive\"}[5m]))", - "legendFormat": "Trusted = {{xrpl_peer_proposal_trusted}} [{{exported_instance}}]" + "expr": "sum by (proposal_trusted, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", proposal_trusted=~\"$proposal_trusted\", span_name=\"peer.proposal.receive\"}[5m]))", + "legendFormat": "Trusted = {{proposal_trusted}} [{{exported_instance}}]" } ], "fieldConfig": { @@ -121,7 +121,7 @@ }, { "title": "Validations Trusted vs Untrusted", - "description": "Pie chart showing the ratio of validations received from trusted validators (in our UNL) vs untrusted validators. Grouped by the xrpl.peer.validation.trusted span attribute (true/false). Monitoring this helps detect if the node is receiving validations from the expected set of trusted validators. Note: validations that fail early checks may not have the trusted attribute set.", + "description": "Pie chart showing the ratio of validations received from trusted validators (in our UNL) vs untrusted validators. Grouped by the validation_trusted span attribute (true/false). Monitoring this helps detect if the node is receiving validations from the expected set of trusted validators. 
Note: validations that fail early checks may not have the trusted attribute set.", "type": "piechart", "gridPos": { "h": 8, @@ -140,8 +140,8 @@ "datasource": { "type": "prometheus" }, - "expr": "sum by (xrpl_peer_validation_trusted, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_peer_validation_trusted=~\"$validation_trusted\", span_name=\"peer.validation.receive\"}[5m]))", - "legendFormat": "Trusted = {{xrpl_peer_validation_trusted}} [{{exported_instance}}]" + "expr": "sum by (validation_trusted, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", validation_trusted=~\"$validation_trusted\", span_name=\"peer.validation.receive\"}[5m]))", + "legendFormat": "Trusted = {{validation_trusted}} [{{exported_instance}}]" } ], "fieldConfig": { @@ -181,7 +181,7 @@ "label": "Proposal Trusted", "description": "Filter by proposal trust status (true = from trusted validator)", "type": "query", - "query": "label_values(traces_span_metrics_calls_total{span_name=\"peer.proposal.receive\"}, xrpl_peer_proposal_trusted)", + "query": "label_values(traces_span_metrics_calls_total{span_name=\"peer.proposal.receive\"}, proposal_trusted)", "datasource": { "type": "prometheus", "uid": "prometheus" @@ -201,7 +201,7 @@ "label": "Validation Trusted", "description": "Filter by validation trust status (true = from trusted validator)", "type": "query", - "query": "label_values(traces_span_metrics_calls_total{span_name=\"peer.validation.receive\"}, xrpl_peer_validation_trusted)", + "query": "label_values(traces_span_metrics_calls_total{span_name=\"peer.validation.receive\"}, validation_trusted)", "datasource": { "type": "prometheus", "uid": "prometheus" diff --git a/docker/telemetry/grafana/dashboards/rpc-performance.json b/docker/telemetry/grafana/dashboards/rpc-performance.json index dec11c506d..7834ec4029 100644 --- a/docker/telemetry/grafana/dashboards/rpc-performance.json +++ 
b/docker/telemetry/grafana/dashboards/rpc-performance.json @@ -10,7 +10,7 @@ "panels": [ { "title": "RPC Request Rate by Command", - "description": "Per-second rate of RPC command executions, broken down by command name (e.g. server_info, submit). Calculated as rate(traces_span_metrics_calls_total{span_name=~\"rpc.command.*\"}) over a 5m window, grouped by the xrpl.rpc.command span attribute.", + "description": "Per-second rate of RPC command executions, broken down by command name (e.g. server_info, submit). Calculated as rate(traces_span_metrics_calls_total{span_name=~\"rpc.command.*\"}) over a 5m window, grouped by the command span attribute.", "type": "timeseries", "gridPos": { "h": 8, @@ -29,8 +29,8 @@ "datasource": { "type": "prometheus" }, - "expr": "sum by (xrpl_rpc_command, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_rpc_command=~\"$command\", span_name=~\"rpc.command.*\"}[5m]))", - "legendFormat": "{{xrpl_rpc_command}} [{{exported_instance}}]" + "expr": "sum by (command, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", command=~\"$command\", span_name=~\"rpc.command.*\"}[5m]))", + "legendFormat": "{{command}} [{{exported_instance}}]" } ], "fieldConfig": { @@ -49,7 +49,7 @@ }, { "title": "RPC Latency P95 by Command", - "description": "95th percentile response time for each RPC command. Computed from the spanmetrics duration histogram using histogram_quantile(0.95) over rpc.command.* spans, grouped by xrpl.rpc.command. High values indicate slow commands that may need optimization.", + "description": "95th percentile response time for each RPC command. Computed from the spanmetrics duration histogram using histogram_quantile(0.95) over rpc.command.* spans, grouped by command. 
High values indicate slow commands that may need optimization.", "type": "timeseries", "gridPos": { "h": 8, @@ -68,8 +68,8 @@ "datasource": { "type": "prometheus" }, - "expr": "histogram_quantile(0.95, sum by (le, xrpl_rpc_command, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", xrpl_rpc_command=~\"$command\", span_name=~\"rpc.command.*\"}[5m])))", - "legendFormat": "P95 {{xrpl_rpc_command}} [{{exported_instance}}]" + "expr": "histogram_quantile(0.95, sum by (le, command, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", command=~\"$command\", span_name=~\"rpc.command.*\"}[5m])))", + "legendFormat": "P95 {{command}} [{{exported_instance}}]" } ], "fieldConfig": { @@ -107,8 +107,8 @@ "datasource": { "type": "prometheus" }, - "expr": "sum by (xrpl_rpc_command, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_rpc_command=~\"$command\", span_name=~\"rpc.command.*\", status_code=\"STATUS_CODE_ERROR\"}[5m])) / sum by (xrpl_rpc_command, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_rpc_command=~\"$command\", span_name=~\"rpc.command.*\"}[5m])) * 100", - "legendFormat": "{{xrpl_rpc_command}} [{{exported_instance}}]" + "expr": "sum by (command, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", command=~\"$command\", span_name=~\"rpc.command.*\", status_code=\"STATUS_CODE_ERROR\"}[5m])) / sum by (command, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", command=~\"$command\", span_name=~\"rpc.command.*\"}[5m])) * 100", + "legendFormat": "{{command}} [{{exported_instance}}]" } ], "fieldConfig": { @@ -158,7 +158,7 @@ "datasource": { "type": "prometheus" }, - "expr": "sum(increase(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", xrpl_rpc_command=~\"$command\", 
span_name=~\"rpc.command.*\"}[5m])) by (le)", + "expr": "sum(increase(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", command=~\"$command\", span_name=~\"rpc.command.*\"}[5m])) by (le)", "legendFormat": "{{le}}", "format": "heatmap" } @@ -185,14 +185,14 @@ "datasource": { "type": "prometheus" }, - "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_rpc_command=~\"$command\", span_name=\"rpc.request\"}[5m]))", + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", command=~\"$command\", span_name=\"rpc.request\"}[5m]))", "legendFormat": "rpc.request / Sec [{{exported_instance}}]" }, { "datasource": { "type": "prometheus" }, - "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_rpc_command=~\"$command\", span_name=\"rpc.process\"}[5m]))", + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", command=~\"$command\", span_name=\"rpc.process\"}[5m]))", "legendFormat": "rpc.process / Sec [{{exported_instance}}]" } ], @@ -231,14 +231,14 @@ "datasource": { "type": "prometheus" }, - "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_rpc_command=~\"$command\", span_name=~\"rpc.command.*\", status_code=\"STATUS_CODE_UNSET\"}[5m]))", + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", command=~\"$command\", span_name=~\"rpc.command.*\", status_code=\"STATUS_CODE_UNSET\"}[5m]))", "legendFormat": "Success [{{exported_instance}}]" }, { "datasource": { "type": "prometheus" }, - "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_rpc_command=~\"$command\", span_name=~\"rpc.command.*\", status_code=\"STATUS_CODE_ERROR\"}[5m]))", + "expr": "sum by (exported_instance) 
(rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", command=~\"$command\", span_name=~\"rpc.command.*\", status_code=\"STATUS_CODE_ERROR\"}[5m]))", "legendFormat": "Error [{{exported_instance}}]" } ], @@ -277,8 +277,8 @@ "datasource": { "type": "prometheus" }, - "expr": "topk(10, sum by (xrpl_rpc_command, exported_instance) (increase(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_rpc_command=~\"$command\", span_name=~\"rpc.command.*\"}[5m])))", - "legendFormat": "{{xrpl_rpc_command}} [{{exported_instance}}]" + "expr": "topk(10, sum by (command, exported_instance) (increase(traces_span_metrics_calls_total{exported_instance=~\"$node\", command=~\"$command\", span_name=~\"rpc.command.*\"}[5m])))", + "legendFormat": "{{command}} [{{exported_instance}}]" } ], "fieldConfig": { @@ -309,7 +309,7 @@ "datasource": { "type": "prometheus" }, - "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_rpc_command=~\"$command\", span_name=\"rpc.ws_message\"}[5m]))", + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", command=~\"$command\", span_name=\"rpc.ws_message\"}[5m]))", "legendFormat": "WS Messages / Sec [{{exported_instance}}]" } ], @@ -350,7 +350,7 @@ "label": "RPC Command", "description": "Filter by RPC command name (e.g., server_info, submit)", "type": "query", - "query": "label_values(traces_span_metrics_calls_total{span_name=~\"rpc.command.*\"}, xrpl_rpc_command)", + "query": "label_values(traces_span_metrics_calls_total{span_name=~\"rpc.command.*\"}, command)", "datasource": { "type": "prometheus", "uid": "prometheus" diff --git a/docker/telemetry/grafana/dashboards/transaction-overview.json b/docker/telemetry/grafana/dashboards/transaction-overview.json index d8a281988a..1d6a4c0dd0 100644 --- a/docker/telemetry/grafana/dashboards/transaction-overview.json +++ b/docker/telemetry/grafana/dashboards/transaction-overview.json @@ 
-102,7 +102,7 @@ }, { "title": "Transaction Path Distribution", - "description": "Breakdown of transactions by origin path. The xrpl.tx.local attribute indicates whether the transaction was submitted locally (true) or received from a peer (false). Helps understand the ratio of locally-originated vs relayed transactions.", + "description": "Breakdown of transactions by origin path. The local attribute indicates whether the transaction was submitted locally (true) or received from a peer (false). Helps understand the ratio of locally-originated vs relayed transactions.", "type": "piechart", "gridPos": { "h": 8, @@ -121,8 +121,8 @@ "datasource": { "type": "prometheus" }, - "expr": "sum by (xrpl_tx_local, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_tx_local=~\"$tx_origin\", span_name=\"tx.process\"}[5m]))", - "legendFormat": "Local = {{xrpl_tx_local}} [{{exported_instance}}]" + "expr": "sum by (local, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", local=~\"$tx_origin\", span_name=\"tx.process\"}[5m]))", + "legendFormat": "Local = {{local}} [{{exported_instance}}]" } ] }, @@ -282,7 +282,7 @@ }, { "title": "Transaction Apply Failed Rate", - "description": "Rate of tx.apply spans completing with error status, indicating transaction application failures during ledger building. The span records xrpl.ledger.tx_failed as an attribute. Thresholds: green < 0.1/sec, yellow 0.1-1/sec, red > 1/sec. Some failures are normal (e.g. conflicting offers) but sustained high rates may indicate issues.", + "description": "Rate of tx.apply spans completing with error status, indicating transaction application failures during ledger building. The span records tx_failed as an attribute. Thresholds: green < 0.1/sec, yellow 0.1-1/sec, red > 1/sec. Some failures are normal (e.g. 
conflicting offers) but sustained high rates may indicate issues.", "type": "stat", "gridPos": { "h": 8, @@ -358,7 +358,7 @@ "label": "TX Origin", "description": "Filter by transaction origin (true = local submit, false = peer relay)", "type": "query", - "query": "label_values(traces_span_metrics_calls_total{span_name=\"tx.process\"}, xrpl_tx_local)", + "query": "label_values(traces_span_metrics_calls_total{span_name=\"tx.process\"}, local)", "datasource": { "type": "prometheus", "uid": "prometheus" diff --git a/include/xrpl/telemetry/SpanNames.h b/include/xrpl/telemetry/SpanNames.h index 88373b5f85..d62b7d1146 100644 --- a/include/xrpl/telemetry/SpanNames.h +++ b/include/xrpl/telemetry/SpanNames.h @@ -122,6 +122,7 @@ inline constexpr auto ledgerSeq = join(join(seg::xrpl, seg::ledger), makeStr("se inline constexpr auto closeTime = makeStr("close_time"); inline constexpr auto closeTimeCorrect = makeStr("close_time_correct"); inline constexpr auto closeResolutionMs = makeStr("close_resolution_ms"); +inline constexpr auto ledgerHash = join(join(seg::xrpl, seg::ledger), makeStr("hash")); } // namespace attr // ===== Shared attribute values ============================================= diff --git a/src/xrpld/app/ledger/detail/BuildLedger.cpp b/src/xrpld/app/ledger/detail/BuildLedger.cpp index 95f72bde15..8548b6a30b 100644 --- a/src/xrpld/app/ledger/detail/BuildLedger.cpp +++ b/src/xrpld/app/ledger/detail/BuildLedger.cpp @@ -82,7 +82,7 @@ buildLedgerImpl( built->header().seq < XRP_LEDGER_EARLIEST_FEES || built->read(keylet::fees()), "xrpl::buildLedgerImpl : valid ledger fees"); built->setAccepted(closeTime, closeResolution, closeTimeCorrect); - buildSpan.setAttribute(ledger_span::attr::seq, static_cast(built->header().seq)); + buildSpan.setAttribute(ledger_span::attr::ledgerSeq, static_cast(built->header().seq)); buildSpan.setAttribute( ledger_span::attr::closeTime, static_cast(closeTime.time_since_epoch().count())); 
buildSpan.setAttribute(ledger_span::attr::closeTimeCorrect, closeTimeCorrect); diff --git a/src/xrpld/app/ledger/detail/LedgerMaster.cpp b/src/xrpld/app/ledger/detail/LedgerMaster.cpp index 0305ce7c4e..16d35a4709 100644 --- a/src/xrpld/app/ledger/detail/LedgerMaster.cpp +++ b/src/xrpld/app/ledger/detail/LedgerMaster.cpp @@ -454,7 +454,7 @@ LedgerMaster::storeLedger(std::shared_ptr ledger) { using namespace telemetry; auto span = SpanGuard::span(TraceCategory::Ledger, seg::ledger, ledger_span::op::store); - span.setAttribute(ledger_span::attr::seq, static_cast(ledger->header().seq)); + span.setAttribute(ledger_span::attr::ledgerSeq, static_cast(ledger->header().seq)); bool const validated = ledger->header().validated; // Returns true if we already had the ledger @@ -974,7 +974,7 @@ LedgerMaster::checkAccept(std::shared_ptr const& ledger) using namespace telemetry; auto valSpan = SpanGuard::span(TraceCategory::Ledger, seg::ledger, ledger_span::op::validate); - valSpan.setAttribute(ledger_span::attr::seq, static_cast(ledger->header().seq)); + valSpan.setAttribute(ledger_span::attr::ledgerSeq, static_cast(ledger->header().seq)); valSpan.setAttribute(ledger_span::attr::validations, static_cast(tvc)); JLOG(m_journal.info()) << "Advancing accepted ledger to " << ledger->header().seq diff --git a/src/xrpld/app/ledger/detail/LedgerSpanNames.h b/src/xrpld/app/ledger/detail/LedgerSpanNames.h index 4d24a60b2e..a359e5d2c7 100644 --- a/src/xrpld/app/ledger/detail/LedgerSpanNames.h +++ b/src/xrpld/app/ledger/detail/LedgerSpanNames.h @@ -29,22 +29,17 @@ inline constexpr auto apply = makeStr("apply"); // ===== Attribute keys ======================================================== namespace attr { -inline constexpr auto xrplLedger = join(seg::xrpl, seg::ledger); +/// Canonical shared constants (defined in SpanNames.h). 
+using ::xrpl::telemetry::attr::closeResolutionMs; +using ::xrpl::telemetry::attr::closeTime; +using ::xrpl::telemetry::attr::closeTimeCorrect; +using ::xrpl::telemetry::attr::ledgerHash; +using ::xrpl::telemetry::attr::ledgerSeq; -/// "xrpl.ledger.seq" -inline constexpr auto seq = join(xrplLedger, makeStr("seq")); -/// "xrpl.ledger.close_time" -inline constexpr auto closeTime = join(xrplLedger, makeStr("close_time")); -/// "xrpl.ledger.close_time_correct" -inline constexpr auto closeTimeCorrect = join(xrplLedger, makeStr("close_time_correct")); -/// "xrpl.ledger.close_resolution_ms" -inline constexpr auto closeResolutionMs = join(xrplLedger, makeStr("close_resolution_ms")); -/// "xrpl.ledger.tx_count" -inline constexpr auto txCount = join(xrplLedger, makeStr("tx_count")); -/// "xrpl.ledger.tx_failed" -inline constexpr auto txFailed = join(xrplLedger, makeStr("tx_failed")); -/// "xrpl.ledger.validations" -inline constexpr auto validations = join(xrplLedger, makeStr("validations")); +/// Domain-owned bare attrs. 
+inline constexpr auto txCount = makeStr("tx_count"); +inline constexpr auto txFailed = makeStr("tx_failed"); +inline constexpr auto validations = makeStr("validations"); } // namespace attr } // namespace xrpl::telemetry::ledger_span diff --git a/src/xrpld/overlay/detail/PeerImp.cpp b/src/xrpld/overlay/detail/PeerImp.cpp index fffa71a9c3..8f74dbe511 100644 --- a/src/xrpld/overlay/detail/PeerImp.cpp +++ b/src/xrpld/overlay/detail/PeerImp.cpp @@ -1875,7 +1875,7 @@ PeerImp::onMessage(std::shared_ptr const& m) { using namespace telemetry; auto span = SpanGuard::span(TraceCategory::Peer, seg::peer, peer_span::op::proposalReceive); - span.setAttribute(peer_span::attr::id, static_cast(id_)); + span.setAttribute(peer_span::attr::peerId, static_cast(id_)); protocol::TMProposeSet const& set = *m; @@ -2484,7 +2484,7 @@ PeerImp::onMessage(std::shared_ptr const& m) using namespace telemetry; auto valSpan = SpanGuard::span(TraceCategory::Peer, seg::peer, peer_span::op::validationReceive); - valSpan.setAttribute(peer_span::attr::id, static_cast(id_)); + valSpan.setAttribute(peer_span::attr::peerId, static_cast(id_)); if (m->validation().size() < 50) { @@ -2508,8 +2508,7 @@ PeerImp::onMessage(std::shared_ptr const& m) false); val->setSeen(closeTime); } - valSpan.setAttribute( - peer_span::attr::validationLedgerHash, to_string(val->getLedgerHash()).c_str()); + valSpan.setAttribute(peer_span::attr::ledgerHash, to_string(val->getLedgerHash()).c_str()); valSpan.setAttribute(peer_span::attr::validationFull, val->isFull()); if (!isCurrent( diff --git a/src/xrpld/overlay/detail/PeerSpanNames.h b/src/xrpld/overlay/detail/PeerSpanNames.h index 9697ea3fa4..fd2081f778 100644 --- a/src/xrpld/overlay/detail/PeerSpanNames.h +++ b/src/xrpld/overlay/detail/PeerSpanNames.h @@ -25,22 +25,14 @@ inline constexpr auto validationReceive = makeStr("validation.receive"); // ===== Attribute keys ======================================================== namespace attr { -inline constexpr auto xrplPeer = 
join(seg::xrpl, seg::peer); +/// Canonical shared constants (defined in SpanNames.h). +using ::xrpl::telemetry::attr::ledgerHash; +using ::xrpl::telemetry::attr::peerId; -/// "xrpl.peer.id" -inline constexpr auto id = join(xrplPeer, makeStr("id")); -/// "xrpl.peer.proposal.trusted" -inline constexpr auto proposalTrusted = - join(join(xrplPeer, makeStr("proposal")), makeStr("trusted")); - -/// "xrpl.peer.validation.ledger_hash" -inline constexpr auto validationLedgerHash = - join(join(xrplPeer, makeStr("validation")), makeStr("ledger_hash")); -/// "xrpl.peer.validation.full" -inline constexpr auto validationFull = join(join(xrplPeer, makeStr("validation")), makeStr("full")); -/// "xrpl.peer.validation.trusted" -inline constexpr auto validationTrusted = - join(join(xrplPeer, makeStr("validation")), makeStr("trusted")); +/// Domain-owned bare attrs. +inline constexpr auto proposalTrusted = makeStr("proposal_trusted"); +inline constexpr auto validationFull = makeStr("validation_full"); +inline constexpr auto validationTrusted = makeStr("validation_trusted"); } // namespace attr } // namespace xrpl::telemetry::peer_span