diff --git a/docker/telemetry/grafana/dashboards/consensus-health.json b/docker/telemetry/grafana/dashboards/consensus-health.json index 0ba69c139e..d40f42cc58 100644 --- a/docker/telemetry/grafana/dashboards/consensus-health.json +++ b/docker/telemetry/grafana/dashboards/consensus-health.json @@ -10,7 +10,7 @@ "panels": [ { "title": "Consensus Round Duration", - "description": "p95 and p50 duration of consensus accept rounds. The consensus.accept span (RCLConsensus.cpp:395) measures the time to process an accepted ledger including transaction application and state finalization. The span carries xrpl.consensus.proposers and xrpl.consensus.round_time_ms attributes. Normal range is 3-6 seconds on mainnet.", + "description": "p95 and p50 duration of consensus accept rounds. The consensus.accept span (RCLConsensus.cpp:395) measures the time to process an accepted ledger including transaction application and state finalization. The span carries proposers and round_time_ms attributes. Normal range is 3-6 seconds on mainnet.", "type": "timeseries", "gridPos": { "h": 8, @@ -95,7 +95,7 @@ }, { "title": "Ledger Close Duration", - "description": "p95 duration of the ledger close event. The consensus.ledger_close span (RCLConsensus.cpp:282) measures the time from when consensus triggers a ledger close to completion. Carries xrpl.consensus.ledger.seq and xrpl.consensus.mode attributes. Compare with Consensus Round Duration to understand how close timing relates to overall round time.", + "description": "p95 duration of the ledger close event. The consensus.ledger_close span (RCLConsensus.cpp:282) measures the time from when consensus triggers a ledger close to completion. Carries xrpl.ledger.seq and xrpl.consensus.mode attributes. 
Compare with Consensus Round Duration to understand how close timing relates to overall round time.", "type": "timeseries", "gridPos": { "h": 8, @@ -134,7 +134,7 @@ }, { "title": "Validation Send Rate", - "description": "Rate at which this node sends ledger validations to the network. Sourced from the consensus.validation.send span (RCLConsensus.cpp:753). Each validation confirms the node has fully validated a ledger. The span carries xrpl.consensus.ledger.seq and xrpl.consensus.proposing. Should closely track the ledger close rate when the node is healthy.", + "description": "Rate at which this node sends ledger validations to the network. Sourced from the consensus.validation.send span (RCLConsensus.cpp:753). Each validation confirms the node has fully validated a ledger. The span carries xrpl.ledger.seq and proposing. Should closely track the ledger close rate when the node is healthy.", "type": "stat", "gridPos": { "h": 8, @@ -206,7 +206,7 @@ }, { "title": "Close Time Agreement", - "description": "Rate of close time agreement vs disagreement across consensus rounds. Based on xrpl.consensus.close_time_correct attribute (true = validators agreed, false = agreed to disagree per avCT_CONSENSUS_PCT).", + "description": "Rate of close time agreement vs disagreement across consensus rounds. 
Based on close_time_correct attribute (true = validators agreed, false = agreed to disagree per avCT_CONSENSUS_PCT).", "type": "timeseries", "gridPos": { "h": 8, @@ -219,8 +219,8 @@ "datasource": { "type": "prometheus" }, - "expr": "sum by (xrpl_consensus_close_time_correct, exported_instance) (rate(traces_span_metrics_calls_total{span_name=\"consensus.accept.apply\", xrpl_consensus_mode=~\"$consensus_mode\", exported_instance=~\"$node\"}[$__rate_interval]))", - "legendFormat": "Close Time Correct={{xrpl_consensus_close_time_correct}} [{{exported_instance}}]" + "expr": "sum by (close_time_correct, exported_instance) (rate(traces_span_metrics_calls_total{span_name=\"consensus.accept.apply\", xrpl_consensus_mode=~\"$consensus_mode\", exported_instance=~\"$node\"}[$__rate_interval]))", + "legendFormat": "Close Time Correct={{close_time_correct}} [{{exported_instance}}]" } ], "fieldConfig": { @@ -400,7 +400,7 @@ }, { "title": "Close Time: Raw Proposals (Per Node)", - "description": "Each node's raw proposed close time (xrpl.consensus.close_time_self) \u2014 the unrounded wall clock value at the moment the node closed its ledger. Compare across nodes to see clock drift.", + "description": "Each node's raw proposed close time (close_time_self) \u2014 the unrounded wall clock value at the moment the node closed its ledger. 
Compare across nodes to see clock drift.", "type": "timeseries", "gridPos": { "h": 8, @@ -436,14 +436,14 @@ "type": "tempo" }, "queryType": "traceql", - "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.xrpl.consensus.close_time_correct=~\"$close_time_correct\"} | select(span.xrpl.consensus.close_time_self)", + "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\"} | select(span.close_time_self)", "refId": "A" } ] }, { "title": "Close Time: Effective / Quantized", - "description": "The consensus-agreed close time after rounding to the current resolution bin (xrpl.consensus.close_time). This is the value written to the ledger header. All nodes in agreement produce the same value.", + "description": "The consensus-agreed close time after rounding to the current resolution bin (close_time). This is the value written to the ledger header. All nodes in agreement produce the same value.", "type": "timeseries", "gridPos": { "h": 8, @@ -479,14 +479,14 @@ "type": "tempo" }, "queryType": "traceql", - "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.xrpl.consensus.close_time_correct=~\"$close_time_correct\"} | select(span.xrpl.consensus.close_time)", + "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\"} | select(span.close_time)", "refId": "A" } ] }, { "title": "Close Time Vote Bins & Resolution", - "description": "Number of distinct close time vote bins (xrpl.consensus.close_time_vote_bins) and the bin size / resolution in ms (xrpl.consensus.close_resolution_ms). More bins = more clock disagreement. 
Resolution adapts: finer (10s) when validators agree, coarser (120s) when they disagree.", + "description": "Number of distinct close time vote bins (close_time_vote_bins) and the bin size / resolution in ms (close_resolution_ms). More bins = more clock disagreement. Resolution adapts: finer (10s) when validators agree, coarser (120s) when they disagree.", "type": "timeseries", "gridPos": { "h": 8, @@ -555,7 +555,7 @@ "type": "tempo" }, "queryType": "traceql", - "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.xrpl.consensus.close_time_correct=~\"$close_time_correct\"} | select(span.xrpl.consensus.close_time_vote_bins)", + "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\"} | select(span.close_time_vote_bins)", "refId": "A" }, { @@ -563,14 +563,14 @@ "type": "tempo" }, "queryType": "traceql", - "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.xrpl.consensus.close_time_correct=~\"$close_time_correct\"} | select(span.xrpl.consensus.close_resolution_ms)", + "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\"} | select(span.close_resolution_ms)", "refId": "B" } ] }, { "title": "Close Time Resolution Direction", - "description": "Whether close time resolution increased (coarser bins, more disagreement), decreased (finer bins, better agreement), or stayed unchanged relative to the previous ledger. Based on xrpl.consensus.resolution_direction attribute.", + "description": "Whether close time resolution increased (coarser bins, more disagreement), decreased (finer bins, better agreement), or stayed unchanged relative to the previous ledger. 
Based on resolution_direction attribute.", "type": "timeseries", "gridPos": { "h": 8, @@ -606,7 +606,7 @@ "type": "tempo" }, "queryType": "traceql", - "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.xrpl.consensus.close_time_correct=~\"$close_time_correct\" && span.xrpl.consensus.resolution_direction=~\"$resolution_direction\"} | select(span.xrpl.consensus.resolution_direction)", + "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\" && span.resolution_direction=~\"$resolution_direction\"} | select(span.resolution_direction)", "refId": "A" } ] @@ -650,7 +650,7 @@ "type": "tempo" }, "queryType": "traceql", - "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.xrpl.consensus.close_time_correct=~\"$close_time_correct\"} | select(span.xrpl.consensus.close_time, span.xrpl.consensus.close_time_vote_bins)", + "query": "{name=\"consensus.accept.apply\" && resource.service.instance.id=~\"$node\" && span.close_time_correct=~\"$close_time_correct\"} | select(span.close_time, span.close_time_vote_bins)", "refId": "A" } ] diff --git a/docker/telemetry/grafana/dashboards/ledger-operations.json b/docker/telemetry/grafana/dashboards/ledger-operations.json index 67711e4fa8..c9c8c5efc3 100644 --- a/docker/telemetry/grafana/dashboards/ledger-operations.json +++ b/docker/telemetry/grafana/dashboards/ledger-operations.json @@ -88,7 +88,7 @@ }, { "title": "Ledger Validation Rate", - "description": "Rate at which ledgers pass the validation threshold and are accepted as fully validated. The ledger.validate span (LedgerMaster.cpp:915) fires in checkAccept() only after the ledger receives sufficient trusted validations (>= quorum). 
Records xrpl.ledger.seq and xrpl.ledger.validations (the number of validations received).", + "description": "Rate at which ledgers pass the validation threshold and are accepted as fully validated. The ledger.validate span (LedgerMaster.cpp:915) fires in checkAccept() only after the ledger receives sufficient trusted validations (>= quorum). Records xrpl.ledger.seq and validations (the number of validations received).", "type": "stat", "gridPos": { "h": 8, @@ -156,7 +156,7 @@ }, { "title": "Transaction Apply Duration", - "description": "p95 and p50 duration of applying the consensus transaction set during ledger building. The tx.apply span (BuildLedger.cpp:88) wraps applyTransactions() which iterates through the CanonicalTXSet with multiple retry passes. Records xrpl.ledger.tx_count (successful) and xrpl.ledger.tx_failed (failed) as attributes.", + "description": "p95 and p50 duration of applying the consensus transaction set during ledger building. The tx.apply span (BuildLedger.cpp:88) wraps applyTransactions() which iterates through the CanonicalTXSet with multiple retry passes. Records tx_count (successful) and tx_failed (failed) as attributes.", "type": "timeseries", "gridPos": { "h": 8, diff --git a/docker/telemetry/grafana/dashboards/peer-network.json b/docker/telemetry/grafana/dashboards/peer-network.json index 9740b04366..0fd6e6048f 100644 --- a/docker/telemetry/grafana/dashboards/peer-network.json +++ b/docker/telemetry/grafana/dashboards/peer-network.json @@ -11,7 +11,7 @@ "panels": [ { "title": "Peer Proposal Receive Rate", - "description": "Rate of consensus proposals received from network peers. The peer.proposal.receive span (PeerImp.cpp:1667) fires in onMessage(TMProposeSet) for each incoming proposal. Records xrpl.peer.id (sending peer) and xrpl.peer.proposal.trusted (whether the proposer is in our UNL). Requires trace_peer=1 in the telemetry config.", + "description": "Rate of consensus proposals received from network peers. 
The peer.proposal.receive span (PeerImp.cpp:1667) fires in onMessage(TMProposeSet) for each incoming proposal. Records xrpl.peer.id (sending peer) and proposal_trusted (whether the proposer is in our UNL). Requires trace_peer=1 in the telemetry config.", "type": "timeseries", "gridPos": { "h": 8, @@ -50,7 +50,7 @@ }, { "title": "Peer Validation Receive Rate", - "description": "Rate of ledger validations received from network peers. The peer.validation.receive span (PeerImp.cpp:2264) fires in onMessage(TMValidation) for each incoming validation message. Records xrpl.peer.id (sending peer) and xrpl.peer.validation.trusted (whether the validator is trusted). Requires trace_peer=1 in the telemetry config.", + "description": "Rate of ledger validations received from network peers. The peer.validation.receive span (PeerImp.cpp:2264) fires in onMessage(TMValidation) for each incoming validation message. Records xrpl.peer.id (sending peer) and validation_trusted (whether the validator is trusted). Requires trace_peer=1 in the telemetry config.", "type": "timeseries", "gridPos": { "h": 8, @@ -89,7 +89,7 @@ }, { "title": "Proposals Trusted vs Untrusted", - "description": "Pie chart showing the ratio of proposals received from trusted validators (in our UNL) vs untrusted validators. Grouped by the xrpl.peer.proposal.trusted span attribute (true/false). A healthy node connected to a well-configured UNL should see a significant portion of trusted proposals. Note: proposals that fail early validation may not have the trusted attribute set.", + "description": "Pie chart showing the ratio of proposals received from trusted validators (in our UNL) vs untrusted validators. Grouped by the proposal_trusted span attribute (true/false). A healthy node connected to a well-configured UNL should see a significant portion of trusted proposals. 
Note: proposals that fail early validation may not have the trusted attribute set.", "type": "piechart", "gridPos": { "h": 8, @@ -108,8 +108,8 @@ "datasource": { "type": "prometheus" }, - "expr": "sum by (xrpl_peer_proposal_trusted, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_peer_proposal_trusted=~\"$proposal_trusted\", span_name=\"peer.proposal.receive\"}[5m]))", - "legendFormat": "Trusted = {{xrpl_peer_proposal_trusted}} [{{exported_instance}}]" + "expr": "sum by (proposal_trusted, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", proposal_trusted=~\"$proposal_trusted\", span_name=\"peer.proposal.receive\"}[5m]))", + "legendFormat": "Trusted = {{proposal_trusted}} [{{exported_instance}}]" } ], "fieldConfig": { @@ -121,7 +121,7 @@ }, { "title": "Validations Trusted vs Untrusted", - "description": "Pie chart showing the ratio of validations received from trusted validators (in our UNL) vs untrusted validators. Grouped by the xrpl.peer.validation.trusted span attribute (true/false). Monitoring this helps detect if the node is receiving validations from the expected set of trusted validators. Note: validations that fail early checks may not have the trusted attribute set.", + "description": "Pie chart showing the ratio of validations received from trusted validators (in our UNL) vs untrusted validators. Grouped by the validation_trusted span attribute (true/false). Monitoring this helps detect if the node is receiving validations from the expected set of trusted validators. 
Note: validations that fail early checks may not have the trusted attribute set.", "type": "piechart", "gridPos": { "h": 8, @@ -140,8 +140,8 @@ "datasource": { "type": "prometheus" }, - "expr": "sum by (xrpl_peer_validation_trusted, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_peer_validation_trusted=~\"$validation_trusted\", span_name=\"peer.validation.receive\"}[5m]))", - "legendFormat": "Trusted = {{xrpl_peer_validation_trusted}} [{{exported_instance}}]" + "expr": "sum by (validation_trusted, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", validation_trusted=~\"$validation_trusted\", span_name=\"peer.validation.receive\"}[5m]))", + "legendFormat": "Trusted = {{validation_trusted}} [{{exported_instance}}]" } ], "fieldConfig": { @@ -181,7 +181,7 @@ "label": "Proposal Trusted", "description": "Filter by proposal trust status (true = from trusted validator)", "type": "query", - "query": "label_values(traces_span_metrics_calls_total{span_name=\"peer.proposal.receive\"}, xrpl_peer_proposal_trusted)", + "query": "label_values(traces_span_metrics_calls_total{span_name=\"peer.proposal.receive\"}, proposal_trusted)", "datasource": { "type": "prometheus", "uid": "prometheus" @@ -201,7 +201,7 @@ "label": "Validation Trusted", "description": "Filter by validation trust status (true = from trusted validator)", "type": "query", - "query": "label_values(traces_span_metrics_calls_total{span_name=\"peer.validation.receive\"}, xrpl_peer_validation_trusted)", + "query": "label_values(traces_span_metrics_calls_total{span_name=\"peer.validation.receive\"}, validation_trusted)", "datasource": { "type": "prometheus", "uid": "prometheus" diff --git a/docker/telemetry/grafana/dashboards/rpc-performance.json b/docker/telemetry/grafana/dashboards/rpc-performance.json index dec11c506d..7834ec4029 100644 --- a/docker/telemetry/grafana/dashboards/rpc-performance.json +++ 
b/docker/telemetry/grafana/dashboards/rpc-performance.json @@ -10,7 +10,7 @@ "panels": [ { "title": "RPC Request Rate by Command", - "description": "Per-second rate of RPC command executions, broken down by command name (e.g. server_info, submit). Calculated as rate(traces_span_metrics_calls_total{span_name=~\"rpc.command.*\"}) over a 5m window, grouped by the xrpl.rpc.command span attribute.", + "description": "Per-second rate of RPC command executions, broken down by command name (e.g. server_info, submit). Calculated as rate(traces_span_metrics_calls_total{span_name=~\"rpc.command.*\"}) over a 5m window, grouped by the command span attribute.", "type": "timeseries", "gridPos": { "h": 8, @@ -29,8 +29,8 @@ "datasource": { "type": "prometheus" }, - "expr": "sum by (xrpl_rpc_command, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_rpc_command=~\"$command\", span_name=~\"rpc.command.*\"}[5m]))", - "legendFormat": "{{xrpl_rpc_command}} [{{exported_instance}}]" + "expr": "sum by (command, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", command=~\"$command\", span_name=~\"rpc.command.*\"}[5m]))", + "legendFormat": "{{command}} [{{exported_instance}}]" } ], "fieldConfig": { @@ -49,7 +49,7 @@ }, { "title": "RPC Latency P95 by Command", - "description": "95th percentile response time for each RPC command. Computed from the spanmetrics duration histogram using histogram_quantile(0.95) over rpc.command.* spans, grouped by xrpl.rpc.command. High values indicate slow commands that may need optimization.", + "description": "95th percentile response time for each RPC command. Computed from the spanmetrics duration histogram using histogram_quantile(0.95) over rpc.command.* spans, grouped by command. 
High values indicate slow commands that may need optimization.", "type": "timeseries", "gridPos": { "h": 8, @@ -68,8 +68,8 @@ "datasource": { "type": "prometheus" }, - "expr": "histogram_quantile(0.95, sum by (le, xrpl_rpc_command, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", xrpl_rpc_command=~\"$command\", span_name=~\"rpc.command.*\"}[5m])))", - "legendFormat": "P95 {{xrpl_rpc_command}} [{{exported_instance}}]" + "expr": "histogram_quantile(0.95, sum by (le, command, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", command=~\"$command\", span_name=~\"rpc.command.*\"}[5m])))", + "legendFormat": "P95 {{command}} [{{exported_instance}}]" } ], "fieldConfig": { @@ -107,8 +107,8 @@ "datasource": { "type": "prometheus" }, - "expr": "sum by (xrpl_rpc_command, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_rpc_command=~\"$command\", span_name=~\"rpc.command.*\", status_code=\"STATUS_CODE_ERROR\"}[5m])) / sum by (xrpl_rpc_command, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_rpc_command=~\"$command\", span_name=~\"rpc.command.*\"}[5m])) * 100", - "legendFormat": "{{xrpl_rpc_command}} [{{exported_instance}}]" + "expr": "sum by (command, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", command=~\"$command\", span_name=~\"rpc.command.*\", status_code=\"STATUS_CODE_ERROR\"}[5m])) / sum by (command, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", command=~\"$command\", span_name=~\"rpc.command.*\"}[5m])) * 100", + "legendFormat": "{{command}} [{{exported_instance}}]" } ], "fieldConfig": { @@ -158,7 +158,7 @@ "datasource": { "type": "prometheus" }, - "expr": "sum(increase(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", xrpl_rpc_command=~\"$command\", 
span_name=~\"rpc.command.*\"}[5m])) by (le)", + "expr": "sum(increase(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", command=~\"$command\", span_name=~\"rpc.command.*\"}[5m])) by (le)", "legendFormat": "{{le}}", "format": "heatmap" } @@ -185,14 +185,14 @@ "datasource": { "type": "prometheus" }, - "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_rpc_command=~\"$command\", span_name=\"rpc.request\"}[5m]))", + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", command=~\"$command\", span_name=\"rpc.request\"}[5m]))", "legendFormat": "rpc.request / Sec [{{exported_instance}}]" }, { "datasource": { "type": "prometheus" }, - "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_rpc_command=~\"$command\", span_name=\"rpc.process\"}[5m]))", + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", command=~\"$command\", span_name=\"rpc.process\"}[5m]))", "legendFormat": "rpc.process / Sec [{{exported_instance}}]" } ], @@ -231,14 +231,14 @@ "datasource": { "type": "prometheus" }, - "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_rpc_command=~\"$command\", span_name=~\"rpc.command.*\", status_code=\"STATUS_CODE_UNSET\"}[5m]))", + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", command=~\"$command\", span_name=~\"rpc.command.*\", status_code=\"STATUS_CODE_UNSET\"}[5m]))", "legendFormat": "Success [{{exported_instance}}]" }, { "datasource": { "type": "prometheus" }, - "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_rpc_command=~\"$command\", span_name=~\"rpc.command.*\", status_code=\"STATUS_CODE_ERROR\"}[5m]))", + "expr": "sum by (exported_instance) 
(rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", command=~\"$command\", span_name=~\"rpc.command.*\", status_code=\"STATUS_CODE_ERROR\"}[5m]))", "legendFormat": "Error [{{exported_instance}}]" } ], @@ -277,8 +277,8 @@ "datasource": { "type": "prometheus" }, - "expr": "topk(10, sum by (xrpl_rpc_command, exported_instance) (increase(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_rpc_command=~\"$command\", span_name=~\"rpc.command.*\"}[5m])))", - "legendFormat": "{{xrpl_rpc_command}} [{{exported_instance}}]" + "expr": "topk(10, sum by (command, exported_instance) (increase(traces_span_metrics_calls_total{exported_instance=~\"$node\", command=~\"$command\", span_name=~\"rpc.command.*\"}[5m])))", + "legendFormat": "{{command}} [{{exported_instance}}]" } ], "fieldConfig": { @@ -309,7 +309,7 @@ "datasource": { "type": "prometheus" }, - "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_rpc_command=~\"$command\", span_name=\"rpc.ws_message\"}[5m]))", + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", command=~\"$command\", span_name=\"rpc.ws_message\"}[5m]))", "legendFormat": "WS Messages / Sec [{{exported_instance}}]" } ], @@ -350,7 +350,7 @@ "label": "RPC Command", "description": "Filter by RPC command name (e.g., server_info, submit)", "type": "query", - "query": "label_values(traces_span_metrics_calls_total{span_name=~\"rpc.command.*\"}, xrpl_rpc_command)", + "query": "label_values(traces_span_metrics_calls_total{span_name=~\"rpc.command.*\"}, command)", "datasource": { "type": "prometheus", "uid": "prometheus" diff --git a/docker/telemetry/grafana/dashboards/transaction-overview.json b/docker/telemetry/grafana/dashboards/transaction-overview.json index d8a281988a..1d6a4c0dd0 100644 --- a/docker/telemetry/grafana/dashboards/transaction-overview.json +++ b/docker/telemetry/grafana/dashboards/transaction-overview.json @@ 
-102,7 +102,7 @@ }, { "title": "Transaction Path Distribution", - "description": "Breakdown of transactions by origin path. The xrpl.tx.local attribute indicates whether the transaction was submitted locally (true) or received from a peer (false). Helps understand the ratio of locally-originated vs relayed transactions.", + "description": "Breakdown of transactions by origin path. The local attribute indicates whether the transaction was submitted locally (true) or received from a peer (false). Helps understand the ratio of locally-originated vs relayed transactions.", "type": "piechart", "gridPos": { "h": 8, @@ -121,8 +121,8 @@ "datasource": { "type": "prometheus" }, - "expr": "sum by (xrpl_tx_local, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_tx_local=~\"$tx_origin\", span_name=\"tx.process\"}[5m]))", - "legendFormat": "Local = {{xrpl_tx_local}} [{{exported_instance}}]" + "expr": "sum by (local, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", local=~\"$tx_origin\", span_name=\"tx.process\"}[5m]))", + "legendFormat": "Local = {{local}} [{{exported_instance}}]" } ] }, @@ -282,7 +282,7 @@ }, { "title": "Transaction Apply Failed Rate", - "description": "Rate of tx.apply spans completing with error status, indicating transaction application failures during ledger building. The span records xrpl.ledger.tx_failed as an attribute. Thresholds: green < 0.1/sec, yellow 0.1-1/sec, red > 1/sec. Some failures are normal (e.g. conflicting offers) but sustained high rates may indicate issues.", + "description": "Rate of tx.apply spans completing with error status, indicating transaction application failures during ledger building. The span records tx_failed as an attribute. Thresholds: green < 0.1/sec, yellow 0.1-1/sec, red > 1/sec. Some failures are normal (e.g. 
conflicting offers) but sustained high rates may indicate issues.", "type": "stat", "gridPos": { "h": 8, @@ -358,7 +358,7 @@ "label": "TX Origin", "description": "Filter by transaction origin (true = local submit, false = peer relay)", "type": "query", - "query": "label_values(traces_span_metrics_calls_total{span_name=\"tx.process\"}, xrpl_tx_local)", + "query": "label_values(traces_span_metrics_calls_total{span_name=\"tx.process\"}, local)", "datasource": { "type": "prometheus", "uid": "prometheus" diff --git a/docs/build/telemetry.md b/docs/build/telemetry.md index e8d7fb2dd5..1e6e715353 100644 --- a/docs/build/telemetry.md +++ b/docs/build/telemetry.md @@ -185,15 +185,15 @@ Traced RPC operations produce a span hierarchy like: ``` rpc.request - └── rpc.command.server_info (xrpl.rpc.command=server_info, xrpl.rpc.status=success) + └── rpc.command.server_info (command=server_info, rpc_status=success) ``` Each span includes attributes: -- `xrpl.rpc.command` — the RPC method name -- `xrpl.rpc.version` — API version -- `xrpl.rpc.role` — `admin` or `user` -- `xrpl.rpc.status` — `success` or `error` +- `command` — the RPC method name +- `version` — API version +- `rpc_role` — `admin` or `user` +- `rpc_status` — `success` or `error` ## Running Tests diff --git a/docs/telemetry-runbook.md b/docs/telemetry-runbook.md index e4a486ca00..175154d686 100644 --- a/docs/telemetry-runbook.md +++ b/docs/telemetry-runbook.md @@ -65,71 +65,71 @@ All spans instrumented in xrpld, grouped by subsystem: ### RPC Spans (Phase 2) -| Span Name | Source File | Attributes | Description | -| -------------------- | --------------------- | ---------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------- | -| `rpc.request` | ServerHandler.cpp:271 | — | Top-level HTTP RPC request | -| `rpc.process` | ServerHandler.cpp:573 | — | RPC processing (child of rpc.request) | -| `rpc.ws_message` | 
ServerHandler.cpp:384 | — | WebSocket RPC message | -| `rpc.command.` | RPCHandler.cpp:161 | `xrpl.rpc.command`, `xrpl.rpc.version`, `xrpl.rpc.role`, `xrpl.rpc.status`, `xrpl.rpc.duration_ms`, `xrpl.rpc.error_message` | Per-command span (e.g., `rpc.command.server_info`) | +| Span Name | Source File | Attributes | Description | +| -------------------- | --------------------- | ------------------------------------------------------------------------------ | -------------------------------------------------- | +| `rpc.request` | ServerHandler.cpp:271 | — | Top-level HTTP RPC request | +| `rpc.process` | ServerHandler.cpp:573 | — | RPC processing (child of rpc.request) | +| `rpc.ws_message` | ServerHandler.cpp:384 | — | WebSocket RPC message | +| `rpc.command.` | RPCHandler.cpp:161 | `command`, `version`, `rpc_role`, `rpc_status`, `duration_ms`, `error_message` | Per-command span (e.g., `rpc.command.server_info`) | ### Transaction Spans (Phase 3) -| Span Name | Source File | Attributes | Description | -| ------------ | ------------------- | ------------------------------------------------------------------------------------------- | ------------------------------------- | -| `tx.process` | NetworkOPs.cpp:1227 | `xrpl.tx.hash`, `xrpl.tx.local`, `xrpl.tx.path` | Transaction submission and processing | -| `tx.receive` | PeerImp.cpp:1273 | `xrpl.peer.id`, `xrpl.tx.hash`, `xrpl.peer.version`, `xrpl.tx.suppressed`, `xrpl.tx.status` | Transaction received from peer relay | -| `tx.apply` | BuildLedger.cpp:88 | `xrpl.ledger.seq`, `xrpl.ledger.tx_count`, `xrpl.ledger.tx_failed` | Transaction set applied per ledger | +| Span Name | Source File | Attributes | Description | +| ------------ | ------------------- | ------------------------------------------------------------------------- | ------------------------------------- | +| `tx.process` | NetworkOPs.cpp:1227 | `xrpl.tx.hash`, `local`, `path` | Transaction submission and processing | +| `tx.receive` | PeerImp.cpp:1273 | 
`xrpl.peer.id`, `xrpl.tx.hash`, `peer_version`, `suppressed`, `tx_status` | Transaction received from peer relay | +| `tx.apply` | BuildLedger.cpp:88 | `xrpl.ledger.seq`, `tx_count`, `tx_failed` | Transaction set applied per ledger | ### Transaction Queue Spans (Phase 3) -| Span Name | Source File | Attributes | Description | -| ------------------ | ----------- | --------------------------------------------------------------------- | -------------------------------------------------- | -| `txq.enqueue` | TxQ.cpp | `xrpl.txq.tx_hash` | Transaction enqueue decision (child of tx.process) | -| `txq.apply_direct` | TxQ.cpp | -- | Direct apply attempt (bypassing queue) | -| `txq.batch_clear` | TxQ.cpp | -- | Batch clear of queued transactions for an account | -| `txq.accept` | TxQ.cpp | `xrpl.txq.queue_size` | Ledger-close accept loop over queued transactions | -| `txq.accept_tx` | TxQ.cpp | `xrpl.txq.tx_hash`, `xrpl.txq.retries_remaining`, `xrpl.txq.ter_code` | Per-transaction apply during accept | -| `txq.cleanup` | TxQ.cpp | `xrpl.txq.ledger_seq` | Post-close cleanup of expired queue entries | +| Span Name | Source File | Attributes | Description | +| ------------------ | ----------- | ----------------------------------------------- | -------------------------------------------------- | +| `txq.enqueue` | TxQ.cpp | `xrpl.tx.hash` | Transaction enqueue decision (child of tx.process) | +| `txq.apply_direct` | TxQ.cpp | -- | Direct apply attempt (bypassing queue) | +| `txq.batch_clear` | TxQ.cpp | -- | Batch clear of queued transactions for an account | +| `txq.accept` | TxQ.cpp | `queue_size` | Ledger-close accept loop over queued transactions | +| `txq.accept_tx` | TxQ.cpp | `xrpl.tx.hash`, `retries_remaining`, `ter_code` | Per-transaction apply during accept | +| `txq.cleanup` | TxQ.cpp | `xrpl.ledger.seq` | Post-close cleanup of expired queue entries | ### Consensus Spans (Phase 4) -| Span Name | Source File | Attributes | Description | -| 
------------------------------ | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------- | -| `consensus.round` | RCLConsensus.cpp | `xrpl.consensus.ledger_id`, `xrpl.consensus.ledger.seq`, `xrpl.consensus.mode`, `xrpl.consensus.trace_strategy`, `xrpl.consensus.round_id` | Root span for a consensus round (deterministic or random trace ID) | -| `consensus.phase.open` | Consensus.h | -- | Open phase duration (child of round) | -| `consensus.proposal.send` | RCLConsensus.cpp | `xrpl.consensus.round` | Consensus proposal broadcast | -| `consensus.ledger_close` | RCLConsensus.cpp | `xrpl.consensus.ledger.seq`, `xrpl.consensus.mode` | Ledger close event | -| `consensus.establish` | Consensus.h | `xrpl.consensus.converge_percent`, `xrpl.consensus.establish_count`, `xrpl.consensus.proposers` | Establish phase duration (child of round) | -| `consensus.update_positions` | Consensus.h | `xrpl.consensus.converge_percent`, `xrpl.consensus.proposers`, `xrpl.consensus.disputes_count` | Position update and dispute resolution (see Events below) | -| `consensus.check` | Consensus.h | `xrpl.consensus.agree_count`, `xrpl.consensus.disagree_count`, `xrpl.consensus.converge_percent`, `xrpl.consensus.have_close_time_consensus`, `xrpl.consensus.threshold_percent`, `xrpl.consensus.result` | Consensus threshold check | -| `consensus.accept` | RCLConsensus.cpp | `xrpl.consensus.proposers`, `xrpl.consensus.round_time_ms`, `xrpl.consensus.quorum` | Ledger accepted by consensus | -| `consensus.accept.apply` | 
RCLConsensus.cpp | `xrpl.consensus.ledger.seq`, `xrpl.consensus.close_time`, `xrpl.consensus.close_time_correct`, `xrpl.consensus.close_resolution_ms`, `xrpl.consensus.state`, `xrpl.consensus.proposing`, `xrpl.consensus.round_time_ms`, `xrpl.consensus.parent_close_time`, `xrpl.consensus.close_time_self`, `xrpl.consensus.close_time_vote_bins`, `xrpl.consensus.resolution_direction`, `xrpl.consensus.tx_count` | Ledger application with close time details (see Events below) | -| `consensus.validation.send` | RCLConsensus.cpp | `xrpl.consensus.ledger.seq`, `xrpl.consensus.proposing` | Validation sent after accept (follows-from link) | -| `consensus.mode_change` | RCLConsensus.cpp | `xrpl.consensus.mode.old`, `xrpl.consensus.mode.new` | Consensus mode transition | -| `consensus.proposal.receive` | PeerImp.cpp | `xrpl.consensus.trusted`, `xrpl.consensus.round` | Proposal received from peer (extracts parent context from TraceContext when present; falls back to standalone span for older peers) | -| `consensus.validation.receive` | PeerImp.cpp | `xrpl.consensus.trusted`, `xrpl.consensus.ledger.seq` | Validation received from peer (extracts parent context from TraceContext when present; falls back to standalone span for older peers) | +| Span Name | Source File | Attributes | Description | +| ------------------------------ | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------- | +| `consensus.round` | RCLConsensus.cpp | `xrpl.consensus.ledger_id`, `xrpl.ledger.seq`, `xrpl.consensus.mode`, `trace_strategy`, `xrpl.consensus.round_id` | Root span for a consensus round (deterministic or random trace ID) | +| `consensus.phase.open` | Consensus.h | -- | Open 
phase duration (child of round) | +| `consensus.proposal.send` | RCLConsensus.cpp | `xrpl.consensus.round` | Consensus proposal broadcast | +| `consensus.ledger_close` | RCLConsensus.cpp | `xrpl.ledger.seq`, `xrpl.consensus.mode` | Ledger close event | +| `consensus.establish` | Consensus.h | `converge_percent`, `establish_count`, `proposers` | Establish phase duration (child of round) | +| `consensus.update_positions` | Consensus.h | `converge_percent`, `proposers`, `disputes_count` | Position update and dispute resolution (see Events below) | +| `consensus.check` | Consensus.h | `agree_count`, `disagree_count`, `converge_percent`, `have_close_time_consensus`, `threshold_percent`, `consensus_result` | Consensus threshold check | +| `consensus.accept` | RCLConsensus.cpp | `proposers`, `round_time_ms`, `quorum` | Ledger accepted by consensus | +| `consensus.accept.apply` | RCLConsensus.cpp | `xrpl.ledger.seq`, `close_time`, `close_time_correct`, `close_resolution_ms`, `consensus_state`, `proposing`, `round_time_ms`, `parent_close_time`, `close_time_self`, `close_time_vote_bins`, `resolution_direction`, `tx_count` | Ledger application with close time details (see Events below) | +| `consensus.validation.send` | RCLConsensus.cpp | `xrpl.ledger.seq`, `proposing` | Validation sent after accept (follows-from link) | +| `consensus.mode_change` | RCLConsensus.cpp | `mode_old`, `mode_new` | Consensus mode transition | +| `consensus.proposal.receive` | PeerImp.cpp | `trusted`, `xrpl.consensus.round` | Proposal received from peer (extracts parent context from TraceContext when present; falls back to standalone span for older peers) | +| `consensus.validation.receive` | PeerImp.cpp | `trusted`, `xrpl.ledger.seq` | Validation received from peer (extracts parent context from TraceContext when present; falls back to standalone span for older peers) | #### Consensus Span Events -| Parent Span | Event Name | Event Attributes | Description | -| ---------------------------- | 
----------------- | ------------------------------------------------------------------------------- | ------------------------------------------------------- | -| `consensus.update_positions` | `dispute.resolve` | `xrpl.tx.id`, `xrpl.dispute.our_vote`, `xrpl.dispute.yays`, `xrpl.dispute.nays` | Emitted per dispute when votes are tallied | -| `consensus.accept.apply` | `tx.included` | `xrpl.tx.id` | Emitted per transaction included in the accepted ledger | +| Parent Span | Event Name | Event Attributes | Description | +| ---------------------------- | ----------------- | ---------------------------------------------------------------- | ------------------------------------------------------- | +| `consensus.update_positions` | `dispute.resolve` | `xrpl.tx.id`, `dispute_our_vote`, `dispute_yays`, `dispute_nays` | Emitted per dispute when votes are tallied | +| `consensus.accept.apply` | `tx.included` | `xrpl.tx.id` | Emitted per transaction included in the accepted ledger | #### Close Time Queries (Tempo TraceQL) ``` # Find rounds where validators disagreed on close time -{name="consensus.accept.apply"} | xrpl.consensus.close_time_correct = false +{name="consensus.accept.apply"} | close_time_correct = false # Find consensus failures (moved_on) -{name="consensus.accept.apply"} | xrpl.consensus.state = "moved_on" +{name="consensus.accept.apply"} | consensus_state = "moved_on" # Find slow ledger applications (>5s) {name="consensus.accept.apply"} | duration > 5s # Find specific ledger's consensus details -{name="consensus.accept.apply"} | xrpl.consensus.ledger.seq = 92345678 +{name="consensus.accept.apply"} | xrpl.ledger.seq = 92345678 # Find all spans in a consensus round (deterministic trace strategy) {name="consensus.round"} | xrpl.consensus.round_id = "" @@ -140,18 +140,18 @@ All spans instrumented in xrpld, grouped by subsystem: ### Ledger Spans (Phase 5) -| Span Name | Source File | Attributes | Description | -| ----------------- | -------------------- | 
------------------------------------------------------------------ | ----------------------------- | -| `ledger.build` | BuildLedger.cpp:31 | `xrpl.ledger.seq`, `xrpl.ledger.tx_count`, `xrpl.ledger.tx_failed` | Ledger build during consensus | -| `ledger.validate` | LedgerMaster.cpp:915 | `xrpl.ledger.seq`, `xrpl.ledger.validations` | Ledger promoted to validated | -| `ledger.store` | LedgerMaster.cpp:409 | `xrpl.ledger.seq` | Ledger stored in history | +| Span Name | Source File | Attributes | Description | +| ----------------- | -------------------- | ------------------------------------------ | ----------------------------- | +| `ledger.build` | BuildLedger.cpp:31 | `xrpl.ledger.seq`, `tx_count`, `tx_failed` | Ledger build during consensus | +| `ledger.validate` | LedgerMaster.cpp:915 | `xrpl.ledger.seq`, `validations` | Ledger promoted to validated | +| `ledger.store` | LedgerMaster.cpp:409 | `xrpl.ledger.seq` | Ledger stored in history | ### Peer Spans (Phase 5) -| Span Name | Source File | Attributes | Description | -| ------------------------- | ---------------- | ---------------------------------------------- | ----------------------------- | -| `peer.proposal.receive` | PeerImp.cpp:1667 | `xrpl.peer.id`, `xrpl.peer.proposal.trusted` | Proposal received from peer | -| `peer.validation.receive` | PeerImp.cpp:2264 | `xrpl.peer.id`, `xrpl.peer.validation.trusted` | Validation received from peer | +| Span Name | Source File | Attributes | Description | +| ------------------------- | ---------------- | ------------------------------------ | ----------------------------- | +| `peer.proposal.receive` | PeerImp.cpp:1667 | `xrpl.peer.id`, `proposal_trusted` | Proposal received from peer | +| `peer.validation.receive` | PeerImp.cpp:2264 | `xrpl.peer.id`, `validation_trusted` | Validation received from peer | ## Cross-Node Trace Propagation @@ -260,14 +260,14 @@ Every metric carries these standard labels: Additionally, span attributes configured as dimensions in the 
collector become metric labels (dots → underscores): -| Span Attribute | Metric Label | Applies To | -| ------------------------------ | ------------------------------ | ------------------------------- | -| `xrpl.rpc.command` | `xrpl_rpc_command` | `rpc.command.*` spans | -| `xrpl.rpc.status` | `xrpl_rpc_status` | `rpc.command.*` spans | -| `xrpl.consensus.mode` | `xrpl_consensus_mode` | `consensus.ledger_close` spans | -| `xrpl.tx.local` | `xrpl_tx_local` | `tx.process` spans | -| `xrpl.peer.proposal.trusted` | `xrpl_peer_proposal_trusted` | `peer.proposal.receive` spans | -| `xrpl.peer.validation.trusted` | `xrpl_peer_validation_trusted` | `peer.validation.receive` spans | +| Span Attribute | Metric Label | Applies To | +| --------------------- | ------------------------------ | ------------------------------- | +| `command` | `xrpl_rpc_command` | `rpc.command.*` spans | +| `rpc_status` | `xrpl_rpc_status` | `rpc.command.*` spans | +| `xrpl.consensus.mode` | `xrpl_consensus_mode` | `consensus.ledger_close` spans | +| `local` | `xrpl_tx_local` | `tx.process` spans | +| `proposal_trusted` | `xrpl_peer_proposal_trusted` | `peer.proposal.receive` spans | +| `validation_trusted` | `xrpl_peer_validation_trusted` | `peer.validation.receive` spans | ### Histogram Buckets diff --git a/include/xrpl/telemetry/SpanGuard.h b/include/xrpl/telemetry/SpanGuard.h index 097eae2312..d54ce408f9 100644 --- a/include/xrpl/telemetry/SpanGuard.h +++ b/include/xrpl/telemetry/SpanGuard.h @@ -53,7 +53,7 @@ auto span = SpanGuard::span( TraceCategory::Rpc, rpc_span::prefix::command, "submit"); span.setAttribute(rpc_span::attr::command, "submit"); - span.setAttribute(rpc_span::attr::status, rpc_span::val::success); + span.setAttribute(rpc_span::attr::rpcStatus, rpc_span::val::success); // span ended automatically on scope exit @endcode @@ -86,7 +86,7 @@ TraceCategory::Rpc, rpc_span::prefix::rpc, "request"); if (span) { // expensive attribute computation only when active - 
span.setAttribute(rpc_span::attr::payloadSize, computeSize()); + span.setAttribute(rpc_span::attr::requestPayloadSize, computeSize()); } @endcode diff --git a/include/xrpl/telemetry/SpanNames.h b/include/xrpl/telemetry/SpanNames.h index 9bcae7bde1..d62b7d1146 100644 --- a/include/xrpl/telemetry/SpanNames.h +++ b/include/xrpl/telemetry/SpanNames.h @@ -16,9 +16,12 @@ * concatenation support. boost::static_string is not constexpr. * StaticStr exists specifically for compile-time dot-join composition. * - * Naming conventions follow OpenTelemetry semantic conventions: - * - Attribute keys: "xrpl.<domain>.<field>" - * - Span prefixes: "<domain>[.<subdomain>]" + * Naming conventions (see spec 2026-05-13-span-attr-naming-design): + * - Per-span attribute keys: bare field name (span name carries the domain). + * - Collision qualifier: <domain>_<field> when bare name collides across + * domains or with OTel reserved `status` (e.g. rpc_status, grpc_status). + * - Resource attribute keys: xrpl.<domain>.<field> (process-identity). + * - Span prefixes: <domain>[.<subdomain>]. */ #include @@ -98,14 +101,28 @@ inline constexpr auto link = makeStr("link"); namespace attr { inline constexpr auto networkId = join(join(seg::xrpl, seg::network), makeStr("id")); inline constexpr auto networkType = join(join(seg::xrpl, seg::network), makeStr("type")); -inline constexpr auto linkType = join(join(seg::xrpl, seg::link), makeStr("type")); +inline constexpr auto linkType = makeStr("link_type"); -/// Node health attributes (cross-cutting, used by RPC/consensus/tx spans). +/// Node health attributes — RESOURCE-ONLY (process identity, not per-span). +/// Set at Tracer init via resource::Resource::Create and refreshed on state +/// transitions. Do NOT use with span.setAttribute(). inline constexpr auto xrplNode = join(seg::xrpl, makeStr("node")); -/// "xrpl.node.amendment_blocked" +/// "xrpl.node.amendment_blocked" — resource attribute key. 
inline constexpr auto nodeAmendmentBlocked = join(xrplNode, makeStr("amendment_blocked")); -/// "xrpl.node.server_state" +/// "xrpl.node.server_state" — resource attribute key. inline constexpr auto nodeServerState = join(xrplNode, makeStr("server_state")); + +/// Canonical shared attrs (rule 5 — kept xrpl.<domain>.* form). +/// Defined once here, aliased by domain-specific headers. +inline constexpr auto txHash = join(join(seg::xrpl, seg::tx), makeStr("hash")); +inline constexpr auto peerId = join(join(seg::xrpl, seg::peer), makeStr("id")); +inline constexpr auto ledgerSeq = join(join(seg::xrpl, seg::ledger), makeStr("seq")); + +/// Shared close-time attrs — bare names, reused by consensus and ledger. +inline constexpr auto closeTime = makeStr("close_time"); +inline constexpr auto closeTimeCorrect = makeStr("close_time_correct"); +inline constexpr auto closeResolutionMs = makeStr("close_resolution_ms"); +inline constexpr auto ledgerHash = join(join(seg::xrpl, seg::ledger), makeStr("hash")); } // namespace attr // ===== Shared attribute values ============================================= diff --git a/src/libxrpl/beast/insight/StatsDCollector.cpp b/src/libxrpl/beast/insight/StatsDCollector.cpp index d11e85830f..6116987447 100644 --- a/src/libxrpl/beast/insight/StatsDCollector.cpp +++ b/src/libxrpl/beast/insight/StatsDCollector.cpp @@ -166,7 +166,7 @@ private: std::string m_name; GaugeImpl::value_type m_last_value{0}; GaugeImpl::value_type m_value{0}; - bool m_dirty{false}; + bool m_dirty{true}; }; //------------------------------------------------------------------------------ @@ -583,6 +583,9 @@ StatsDEventImpl::do_notify(EventImpl::value_type const& value) StatsDGaugeImpl::StatsDGaugeImpl(std::string name, std::shared_ptr const& impl) : m_impl(impl), m_name(std::move(name)) { + // Start dirty so the initial value (0) is emitted on the first flush. + // Without this, gauges whose value never changes from 0 would never + // appear in downstream metric stores (e.g. 
Prometheus via StatsD). m_impl->add(*this); } diff --git a/src/xrpld/app/consensus/RCLConsensus.cpp b/src/xrpld/app/consensus/RCLConsensus.cpp index 231f94a2a7..fad5971374 100644 --- a/src/xrpld/app/consensus/RCLConsensus.cpp +++ b/src/xrpld/app/consensus/RCLConsensus.cpp @@ -569,7 +569,8 @@ RCLConsensus::Adaptor::doAccept( static_cast( std::chrono::duration_cast(closeResolution).count())); doAcceptSpan.setAttribute( - telemetry::cons_span::attr::state, std::string(consensusFail ? "moved_on" : "finished")); + telemetry::cons_span::attr::consensusState, + std::string(consensusFail ? "moved_on" : "finished")); doAcceptSpan.setAttribute(telemetry::cons_span::attr::proposing, proposing); doAcceptSpan.setAttribute( telemetry::cons_span::attr::roundTimeMs, diff --git a/src/xrpld/app/ledger/detail/BuildLedger.cpp b/src/xrpld/app/ledger/detail/BuildLedger.cpp index 95f72bde15..8548b6a30b 100644 --- a/src/xrpld/app/ledger/detail/BuildLedger.cpp +++ b/src/xrpld/app/ledger/detail/BuildLedger.cpp @@ -82,7 +82,7 @@ buildLedgerImpl( built->header().seq < XRP_LEDGER_EARLIEST_FEES || built->read(keylet::fees()), "xrpl::buildLedgerImpl : valid ledger fees"); built->setAccepted(closeTime, closeResolution, closeTimeCorrect); - buildSpan.setAttribute(ledger_span::attr::seq, static_cast(built->header().seq)); + buildSpan.setAttribute(ledger_span::attr::ledgerSeq, static_cast(built->header().seq)); buildSpan.setAttribute( ledger_span::attr::closeTime, static_cast(closeTime.time_since_epoch().count())); buildSpan.setAttribute(ledger_span::attr::closeTimeCorrect, closeTimeCorrect); diff --git a/src/xrpld/app/ledger/detail/LedgerMaster.cpp b/src/xrpld/app/ledger/detail/LedgerMaster.cpp index a8fb0dfa66..545793a816 100644 --- a/src/xrpld/app/ledger/detail/LedgerMaster.cpp +++ b/src/xrpld/app/ledger/detail/LedgerMaster.cpp @@ -460,7 +460,7 @@ LedgerMaster::storeLedger(std::shared_ptr ledger) { using namespace telemetry; auto span = SpanGuard::span(TraceCategory::Ledger, seg::ledger, 
ledger_span::op::store); - span.setAttribute(ledger_span::attr::seq, static_cast(ledger->header().seq)); + span.setAttribute(ledger_span::attr::ledgerSeq, static_cast(ledger->header().seq)); bool const validated = ledger->header().validated; // Returns true if we already had the ledger @@ -980,7 +980,7 @@ LedgerMaster::checkAccept(std::shared_ptr const& ledger) using namespace telemetry; auto valSpan = SpanGuard::span(TraceCategory::Ledger, seg::ledger, ledger_span::op::validate); - valSpan.setAttribute(ledger_span::attr::seq, static_cast(ledger->header().seq)); + valSpan.setAttribute(ledger_span::attr::ledgerSeq, static_cast(ledger->header().seq)); valSpan.setAttribute(ledger_span::attr::validations, static_cast(tvc)); JLOG(m_journal.info()) << "Advancing accepted ledger to " << ledger->header().seq diff --git a/src/xrpld/app/ledger/detail/LedgerSpanNames.h b/src/xrpld/app/ledger/detail/LedgerSpanNames.h index 4d24a60b2e..a359e5d2c7 100644 --- a/src/xrpld/app/ledger/detail/LedgerSpanNames.h +++ b/src/xrpld/app/ledger/detail/LedgerSpanNames.h @@ -29,22 +29,17 @@ inline constexpr auto apply = makeStr("apply"); // ===== Attribute keys ======================================================== namespace attr { -inline constexpr auto xrplLedger = join(seg::xrpl, seg::ledger); +/// Canonical shared constants (defined in SpanNames.h). 
+using ::xrpl::telemetry::attr::closeResolutionMs; +using ::xrpl::telemetry::attr::closeTime; +using ::xrpl::telemetry::attr::closeTimeCorrect; +using ::xrpl::telemetry::attr::ledgerHash; +using ::xrpl::telemetry::attr::ledgerSeq; -/// "xrpl.ledger.seq" -inline constexpr auto seq = join(xrplLedger, makeStr("seq")); -/// "xrpl.ledger.close_time" -inline constexpr auto closeTime = join(xrplLedger, makeStr("close_time")); -/// "xrpl.ledger.close_time_correct" -inline constexpr auto closeTimeCorrect = join(xrplLedger, makeStr("close_time_correct")); -/// "xrpl.ledger.close_resolution_ms" -inline constexpr auto closeResolutionMs = join(xrplLedger, makeStr("close_resolution_ms")); -/// "xrpl.ledger.tx_count" -inline constexpr auto txCount = join(xrplLedger, makeStr("tx_count")); -/// "xrpl.ledger.tx_failed" -inline constexpr auto txFailed = join(xrplLedger, makeStr("tx_failed")); -/// "xrpl.ledger.validations" -inline constexpr auto validations = join(xrplLedger, makeStr("validations")); +/// Domain-owned bare attrs. 
+inline constexpr auto txCount = makeStr("tx_count"); +inline constexpr auto txFailed = makeStr("tx_failed"); +inline constexpr auto validations = makeStr("validations"); } // namespace attr } // namespace xrpl::telemetry::ledger_span diff --git a/src/xrpld/app/main/Application.cpp b/src/xrpld/app/main/Application.cpp index 81531a0697..8d4c620fae 100644 --- a/src/xrpld/app/main/Application.cpp +++ b/src/xrpld/app/main/Application.cpp @@ -150,6 +150,7 @@ private: beast::Journal m_journal; beast::io_latency_probe m_probe; std::atomic lastSample_; + std::atomic firstSample_; public: io_latency_sampler( @@ -157,7 +158,7 @@ private: beast::Journal journal, std::chrono::milliseconds interval, boost::asio::io_context& ios) - : m_event(std::move(ev)), m_journal(journal), m_probe(interval, ios) + : m_event(std::move(ev)), m_journal(journal), m_probe(interval, ios), firstSample_(true) { } @@ -176,7 +177,10 @@ private: lastSample_ = lastSample; - if (lastSample >= 10ms) + // Always emit the first sample so the metric is registered in + // downstream stores (Prometheus via StatsD). After that, only + // report latency >= 10 ms to avoid flooding with sub-ms values. 
+ if (firstSample_.exchange(false) || lastSample >= 10ms) m_event.notify(lastSample); if (lastSample >= 500ms) { diff --git a/src/xrpld/app/main/GrpcSpanNames.h b/src/xrpld/app/main/GrpcSpanNames.h index 869d5628aa..76166e86cb 100644 --- a/src/xrpld/app/main/GrpcSpanNames.h +++ b/src/xrpld/app/main/GrpcSpanNames.h @@ -11,7 +11,7 @@ * +-------------------------------------------------------+ * | grpc.request | * | CallData::process(coro) | - * | attrs: method, role, status | + * | attrs: method, grpc_role, grpc_status | * +-------------------------------------------------------+ * * Unlike the HTTP/WS RPC path, gRPC has a flat single-span structure @@ -38,14 +38,12 @@ inline constexpr auto request = makeStr("request"); // ===== Attribute keys ====================================================== namespace attr { -inline constexpr auto xrplGrpc = join(seg::xrpl, makeStr("grpc")); - -/// "xrpl.grpc.method" -inline constexpr auto method = join(xrplGrpc, makeStr("method")); -/// "xrpl.grpc.role" -inline constexpr auto role = join(xrplGrpc, makeStr("role")); -/// "xrpl.grpc.status" -inline constexpr auto status = join(xrplGrpc, makeStr("status")); +/// "method" — gRPC method name (e.g. GetLedger). +inline constexpr auto method = makeStr("method"); +/// "grpc_role" — Domain-qualified: collides with rpc_role. +inline constexpr auto grpcRole = makeStr("grpc_role"); +/// "grpc_status" — Domain-qualified: avoids OTel reserved span status. 
+inline constexpr auto grpcStatus = makeStr("grpc_status"); } // namespace attr // ===== Attribute values ==================================================== diff --git a/src/xrpld/app/misc/NetworkOPs.cpp b/src/xrpld/app/misc/NetworkOPs.cpp index 9e93b8fe51..967cfcbf83 100644 --- a/src/xrpld/app/misc/NetworkOPs.cpp +++ b/src/xrpld/app/misc/NetworkOPs.cpp @@ -1326,7 +1326,7 @@ NetworkOPsImp::processTransaction( { using namespace telemetry; auto span = std::make_shared(txProcessSpan(transaction->getID())); - span->setAttribute(tx_span::attr::hash, to_string(transaction->getID()).c_str()); + span->setAttribute(tx_span::attr::txHash, to_string(transaction->getID()).c_str()); span->setAttribute(tx_span::attr::local, bLocal); auto ev = m_job_queue.makeLoadEvent(jtTXN_PROC, "ProcessTXN"); diff --git a/src/xrpld/app/misc/TxSpanNames.h b/src/xrpld/app/misc/TxSpanNames.h index 091213959e..964246fcd6 100644 --- a/src/xrpld/app/misc/TxSpanNames.h +++ b/src/xrpld/app/misc/TxSpanNames.h @@ -41,25 +41,20 @@ inline constexpr auto process = join(prefix::tx, op::process); // ===== Attribute keys ====================================================== namespace attr { -inline constexpr auto xrplTx = join(seg::xrpl, seg::tx); +/// Canonical shared constants (defined in SpanNames.h). 
+using ::xrpl::telemetry::attr::peerId; +using ::xrpl::telemetry::attr::txHash; -/// "xrpl.tx.hash" -inline constexpr auto hash = join(xrplTx, makeStr("hash")); -/// "xrpl.tx.local" -inline constexpr auto local = join(xrplTx, makeStr("local")); -/// "xrpl.tx.path" -inline constexpr auto path = join(xrplTx, makeStr("path")); -/// "xrpl.tx.suppressed" -inline constexpr auto suppressed = join(xrplTx, makeStr("suppressed")); -/// "xrpl.tx.status" -inline constexpr auto status = join(xrplTx, makeStr("status")); - -inline constexpr auto xrplPeer = join(seg::xrpl, seg::peer); - -/// "xrpl.peer.id" -inline constexpr auto peerId = join(xrplPeer, makeStr("id")); -/// "xrpl.peer.version" -inline constexpr auto peerVersion = join(xrplPeer, makeStr("version")); +/// "local" — whether tx originated locally. +inline constexpr auto local = makeStr("local"); +/// "path" — sync or async processing path. +inline constexpr auto path = makeStr("path"); +/// "suppressed" — whether tx was suppressed as duplicate. +inline constexpr auto suppressed = makeStr("suppressed"); +/// "tx_status" — domain-qualified (collides with rpc_status, txq_status). +inline constexpr auto txStatus = makeStr("tx_status"); +/// "peer_version" — version of peer that sent the tx. +inline constexpr auto peerVersion = makeStr("peer_version"); } // namespace attr // ===== Attribute values ==================================================== diff --git a/src/xrpld/app/misc/detail/TxQSpanNames.h b/src/xrpld/app/misc/detail/TxQSpanNames.h index 4f18e82ae7..4268a8f5b4 100644 --- a/src/xrpld/app/misc/detail/TxQSpanNames.h +++ b/src/xrpld/app/misc/detail/TxQSpanNames.h @@ -71,30 +71,28 @@ inline constexpr auto cleanup = makeStr("cleanup"); // ===== Attribute keys ====================================================== namespace attr { -inline constexpr auto xrplTxq = join(seg::xrpl, makeStr("txq")); +/// Canonical shared constants (defined in SpanNames.h). 
+using ::xrpl::telemetry::attr::ledgerSeq; +using ::xrpl::telemetry::attr::txHash; -/// "xrpl.txq.tx_hash" -inline constexpr auto txHash = join(xrplTxq, makeStr("tx_hash")); -/// "xrpl.txq.status" -inline constexpr auto status = join(xrplTxq, makeStr("status")); -/// "xrpl.txq.fee_level_paid" -inline constexpr auto feeLevelPaid = join(xrplTxq, makeStr("fee_level_paid")); -/// "xrpl.txq.required_fee_level" -inline constexpr auto requiredFeeLevel = join(xrplTxq, makeStr("required_fee_level")); -/// "xrpl.txq.queue_size" -inline constexpr auto queueSize = join(xrplTxq, makeStr("queue_size")); -/// "xrpl.txq.ledger_changed" -inline constexpr auto ledgerChanged = join(xrplTxq, makeStr("ledger_changed")); -/// "xrpl.txq.ledger_seq" -inline constexpr auto ledgerSeq = join(xrplTxq, makeStr("ledger_seq")); -/// "xrpl.txq.expired_count" -inline constexpr auto expiredCount = join(xrplTxq, makeStr("expired_count")); -/// "xrpl.txq.ter_code" -inline constexpr auto terCode = join(xrplTxq, makeStr("ter_code")); -/// "xrpl.txq.retries_remaining" -inline constexpr auto retriesRemaining = join(xrplTxq, makeStr("retries_remaining")); -/// "xrpl.txq.num_cleared" -inline constexpr auto numCleared = join(xrplTxq, makeStr("num_cleared")); +/// "txq_status" — domain-qualified (collides with tx_status, rpc_status). +inline constexpr auto txqStatus = makeStr("txq_status"); +/// "fee_level_paid" — fee level paid by queued tx. +inline constexpr auto feeLevelPaid = makeStr("fee_level_paid"); +/// "required_fee_level" — minimum fee level for inclusion. +inline constexpr auto requiredFeeLevel = makeStr("required_fee_level"); +/// "queue_size" — current TxQ depth. +inline constexpr auto queueSize = makeStr("queue_size"); +/// "ledger_changed" — whether ledger changed since last attempt. +inline constexpr auto ledgerChanged = makeStr("ledger_changed"); +/// "expired_count" — number of expired entries cleared. 
+inline constexpr auto expiredCount = makeStr("expired_count"); +/// "ter_code" — transaction engine result code. +inline constexpr auto terCode = makeStr("ter_code"); +/// "retries_remaining" — retries left before discard. +inline constexpr auto retriesRemaining = makeStr("retries_remaining"); +/// "num_cleared" — entries cleared in batch. +inline constexpr auto numCleared = makeStr("num_cleared"); } // namespace attr // ===== Attribute values ==================================================== diff --git a/src/xrpld/consensus/Consensus.h b/src/xrpld/consensus/Consensus.h index f6bb1ecb22..3c1df2414d 100644 --- a/src/xrpld/consensus/Consensus.h +++ b/src/xrpld/consensus/Consensus.h @@ -1812,7 +1812,7 @@ Consensus::haveConsensus(std::unique_ptr const& clog { stateStr = "expired"; } - span.setAttribute(cons_span::attr::result, stateStr); + span.setAttribute(cons_span::attr::consensusResult, stateStr); CLOG(clog) << "Consensus has been reached. "; // NOLINTEND(bugprone-unchecked-optional-access) diff --git a/src/xrpld/consensus/ConsensusSpanNames.h b/src/xrpld/consensus/ConsensusSpanNames.h index b3112af1ec..d10a48c86e 100644 --- a/src/xrpld/consensus/ConsensusSpanNames.h +++ b/src/xrpld/consensus/ConsensusSpanNames.h @@ -124,96 +124,51 @@ inline constexpr auto phaseOpen = join(seg::consensus, op::phaseOpen); // ===== Attribute keys ======================================================== namespace attr { -inline constexpr auto xrplConsensus = join(seg::xrpl, seg::consensus); +/// Canonical shared constants (defined in SpanNames.h). 
+using ::xrpl::telemetry::attr::closeResolutionMs; +using ::xrpl::telemetry::attr::closeTime; +using ::xrpl::telemetry::attr::closeTimeCorrect; +using ::xrpl::telemetry::attr::ledgerSeq; -/// "xrpl.consensus.ledger_id" -inline constexpr auto ledgerId = join(xrplConsensus, makeStr("ledger_id")); -/// "xrpl.consensus.ledger.seq" -inline constexpr auto ledgerSeq = join(join(xrplConsensus, makeStr("ledger")), makeStr("seq")); -/// "xrpl.consensus.mode" -inline constexpr auto mode = join(xrplConsensus, makeStr("mode")); -/// "xrpl.consensus.round" -inline constexpr auto round = join(xrplConsensus, makeStr("round")); -/// "xrpl.consensus.proposers" -inline constexpr auto proposers = join(xrplConsensus, makeStr("proposers")); -/// "xrpl.consensus.round_time_ms" -inline constexpr auto roundTimeMs = join(xrplConsensus, makeStr("round_time_ms")); -/// "xrpl.consensus.proposing" -inline constexpr auto proposing = join(xrplConsensus, makeStr("proposing")); -/// "xrpl.consensus.state" -inline constexpr auto state = join(xrplConsensus, makeStr("state")); +/// Kept qualified (rule 5 — bare name ambiguous across domains). 
+inline constexpr auto ledgerId = join(join(seg::xrpl, seg::consensus), makeStr("ledger_id")); +inline constexpr auto mode = join(join(seg::xrpl, seg::consensus), makeStr("mode")); +inline constexpr auto round = join(join(seg::xrpl, seg::consensus), makeStr("round")); +inline constexpr auto roundId = join(join(seg::xrpl, seg::consensus), makeStr("round_id")); -// Close time attributes -/// "xrpl.consensus.close_time" -inline constexpr auto closeTime = join(xrplConsensus, makeStr("close_time")); -/// "xrpl.consensus.close_time_correct" -inline constexpr auto closeTimeCorrect = join(xrplConsensus, makeStr("close_time_correct")); -/// "xrpl.consensus.close_resolution_ms" -inline constexpr auto closeResolutionMs = join(xrplConsensus, makeStr("close_resolution_ms")); -/// "xrpl.consensus.parent_close_time" -inline constexpr auto parentCloseTime = join(xrplConsensus, makeStr("parent_close_time")); -/// "xrpl.consensus.close_time_self" -inline constexpr auto closeTimeSelf = join(xrplConsensus, makeStr("close_time_self")); -/// "xrpl.consensus.close_time_vote_bins" -inline constexpr auto closeTimeVoteBins = join(xrplConsensus, makeStr("close_time_vote_bins")); -/// "xrpl.consensus.resolution_direction" -inline constexpr auto resolutionDirection = join(xrplConsensus, makeStr("resolution_direction")); +/// Domain-owned bare attrs. +inline constexpr auto proposers = makeStr("proposers"); +inline constexpr auto roundTimeMs = makeStr("round_time_ms"); +inline constexpr auto proposing = makeStr("proposing"); +/// "consensus_state" — domain-qualified (collides with other domains' state). 
+inline constexpr auto consensusState = makeStr("consensus_state"); +inline constexpr auto parentCloseTime = makeStr("parent_close_time"); +inline constexpr auto closeTimeSelf = makeStr("close_time_self"); +inline constexpr auto closeTimeVoteBins = makeStr("close_time_vote_bins"); +inline constexpr auto resolutionDirection = makeStr("resolution_direction"); +inline constexpr auto convergePercent = makeStr("converge_percent"); +inline constexpr auto establishCount = makeStr("establish_count"); +inline constexpr auto avalancheThreshold = makeStr("avalanche_threshold"); +inline constexpr auto closeTimeThreshold = makeStr("close_time_threshold"); +inline constexpr auto haveCloseTimeConsensus = makeStr("have_close_time_consensus"); +inline constexpr auto agreeCount = makeStr("agree_count"); +inline constexpr auto disagreeCount = makeStr("disagree_count"); +inline constexpr auto thresholdPercent = makeStr("threshold_percent"); +/// "consensus_result" — domain-qualified (collides with generic result). 
+inline constexpr auto consensusResult = makeStr("consensus_result"); +inline constexpr auto quorum = makeStr("quorum"); +inline constexpr auto traceStrategy = makeStr("trace_strategy"); +inline constexpr auto modeOld = makeStr("mode_old"); +inline constexpr auto modeNew = makeStr("mode_new"); -// Establish/convergence attributes -/// "xrpl.consensus.converge_percent" -inline constexpr auto convergePercent = join(xrplConsensus, makeStr("converge_percent")); -/// "xrpl.consensus.establish_count" -inline constexpr auto establishCount = join(xrplConsensus, makeStr("establish_count")); -// Avalanche threshold attributes -/// "xrpl.consensus.avalanche_threshold" -inline constexpr auto avalancheThreshold = join(xrplConsensus, makeStr("avalanche_threshold")); -/// "xrpl.consensus.close_time_threshold" -inline constexpr auto closeTimeThreshold = join(xrplConsensus, makeStr("close_time_threshold")); -/// "xrpl.consensus.have_close_time_consensus" -inline constexpr auto haveCloseTimeConsensus = - join(xrplConsensus, makeStr("have_close_time_consensus")); - -// Consensus check attributes -/// "xrpl.consensus.agree_count" -inline constexpr auto agreeCount = join(xrplConsensus, makeStr("agree_count")); -/// "xrpl.consensus.disagree_count" -inline constexpr auto disagreeCount = join(xrplConsensus, makeStr("disagree_count")); -/// "xrpl.consensus.threshold_percent" -inline constexpr auto thresholdPercent = join(xrplConsensus, makeStr("threshold_percent")); -/// "xrpl.consensus.result" -inline constexpr auto result = join(xrplConsensus, makeStr("result")); -/// "xrpl.consensus.quorum" -inline constexpr auto quorum = join(xrplConsensus, makeStr("quorum")); - -// Trace strategy attribute -/// "xrpl.consensus.trace_strategy" -inline constexpr auto traceStrategy = join(xrplConsensus, makeStr("trace_strategy")); -/// "xrpl.consensus.round_id" -inline constexpr auto roundId = join(xrplConsensus, makeStr("round_id")); - -// Mode change attributes -/// "xrpl.consensus.mode.old" -inline 
constexpr auto modeOld = join(join(xrplConsensus, makeStr("mode")), makeStr("old")); -/// "xrpl.consensus.mode.new" -inline constexpr auto modeNew = join(join(xrplConsensus, makeStr("mode")), makeStr("new")); - -// Dispute event attributes -/// "xrpl.tx.id" +/// Transaction/dispute attrs used in consensus accept spans. inline constexpr auto txId = join(join(seg::xrpl, seg::tx), makeStr("id")); -/// "xrpl.dispute.our_vote" -inline constexpr auto disputeOurVote = - join(join(seg::xrpl, makeStr("dispute")), makeStr("our_vote")); -/// "xrpl.dispute.yays" -inline constexpr auto disputeYays = join(join(seg::xrpl, makeStr("dispute")), makeStr("yays")); -/// "xrpl.dispute.nays" -inline constexpr auto disputeNays = join(join(seg::xrpl, makeStr("dispute")), makeStr("nays")); - -/// "xrpl.consensus.tx_count" -inline constexpr auto txCount = join(xrplConsensus, makeStr("tx_count")); -/// "xrpl.consensus.disputes_count" -inline constexpr auto disputesCount = join(xrplConsensus, makeStr("disputes_count")); -/// "xrpl.consensus.trusted" -inline constexpr auto trusted = join(xrplConsensus, makeStr("trusted")); +inline constexpr auto disputeOurVote = makeStr("dispute_our_vote"); +inline constexpr auto disputeYays = makeStr("dispute_yays"); +inline constexpr auto disputeNays = makeStr("dispute_nays"); +inline constexpr auto txCount = makeStr("tx_count"); +inline constexpr auto disputesCount = makeStr("disputes_count"); +inline constexpr auto trusted = makeStr("trusted"); } // namespace attr // ===== Event names =========================================================== diff --git a/src/xrpld/overlay/detail/PeerImp.cpp b/src/xrpld/overlay/detail/PeerImp.cpp index 888aa1b8cc..8f74dbe511 100644 --- a/src/xrpld/overlay/detail/PeerImp.cpp +++ b/src/xrpld/overlay/detail/PeerImp.cpp @@ -1447,10 +1447,15 @@ PeerImp::handleTransaction( using namespace telemetry; auto span = std::make_shared(txReceiveSpan(txID, *m)); - span->setAttribute(tx_span::attr::hash, to_string(txID).c_str()); + 
span->setAttribute(tx_span::attr::txHash, to_string(txID).c_str()); span->setAttribute(tx_span::attr::peerId, static_cast(id_)); if (auto const version = getVersion(); !version.empty()) span->setAttribute(tx_span::attr::peerVersion, version.c_str()); + // Set defaults for conditional attributes so they are always present + // on the span. The suppressed path overrides these when the + // transaction has already been seen via HashRouter. + span->setAttribute(tx_span::attr::suppressed, false); + span->setAttribute(tx_span::attr::txStatus, "new"); // Charge strongly for attempting to relay a txn with tfInnerBatchTxn // LCOV_EXCL_START @@ -1488,7 +1493,7 @@ PeerImp::handleTransaction( // we have seen this transaction recently if (any(flags & HashRouterFlags::BAD)) { - span->setAttribute(tx_span::attr::status, tx_span::val::knownBad); + span->setAttribute(tx_span::attr::txStatus, tx_span::val::knownBad); fee_.update(Resource::feeUselessData, "known bad"); JLOG(p_journal_.debug()) << "Ignoring known bad tx " << txID; } @@ -1870,7 +1875,7 @@ PeerImp::onMessage(std::shared_ptr const& m) { using namespace telemetry; auto span = SpanGuard::span(TraceCategory::Peer, seg::peer, peer_span::op::proposalReceive); - span.setAttribute(peer_span::attr::id, static_cast(id_)); + span.setAttribute(peer_span::attr::peerId, static_cast(id_)); protocol::TMProposeSet const& set = *m; @@ -2479,7 +2484,7 @@ PeerImp::onMessage(std::shared_ptr const& m) using namespace telemetry; auto valSpan = SpanGuard::span(TraceCategory::Peer, seg::peer, peer_span::op::validationReceive); - valSpan.setAttribute(peer_span::attr::id, static_cast(id_)); + valSpan.setAttribute(peer_span::attr::peerId, static_cast(id_)); if (m->validation().size() < 50) { @@ -2503,8 +2508,7 @@ PeerImp::onMessage(std::shared_ptr const& m) false); val->setSeen(closeTime); } - valSpan.setAttribute( - peer_span::attr::validationLedgerHash, to_string(val->getLedgerHash()).c_str()); + valSpan.setAttribute(peer_span::attr::ledgerHash, 
to_string(val->getLedgerHash()).c_str()); valSpan.setAttribute(peer_span::attr::validationFull, val->isFull()); if (!isCurrent( diff --git a/src/xrpld/overlay/detail/PeerSpanNames.h b/src/xrpld/overlay/detail/PeerSpanNames.h index 9697ea3fa4..fd2081f778 100644 --- a/src/xrpld/overlay/detail/PeerSpanNames.h +++ b/src/xrpld/overlay/detail/PeerSpanNames.h @@ -25,22 +25,14 @@ inline constexpr auto validationReceive = makeStr("validation.receive"); // ===== Attribute keys ======================================================== namespace attr { -inline constexpr auto xrplPeer = join(seg::xrpl, seg::peer); +/// Canonical shared constants (defined in SpanNames.h). +using ::xrpl::telemetry::attr::ledgerHash; +using ::xrpl::telemetry::attr::peerId; -/// "xrpl.peer.id" -inline constexpr auto id = join(xrplPeer, makeStr("id")); -/// "xrpl.peer.proposal.trusted" -inline constexpr auto proposalTrusted = - join(join(xrplPeer, makeStr("proposal")), makeStr("trusted")); - -/// "xrpl.peer.validation.ledger_hash" -inline constexpr auto validationLedgerHash = - join(join(xrplPeer, makeStr("validation")), makeStr("ledger_hash")); -/// "xrpl.peer.validation.full" -inline constexpr auto validationFull = join(join(xrplPeer, makeStr("validation")), makeStr("full")); -/// "xrpl.peer.validation.trusted" -inline constexpr auto validationTrusted = - join(join(xrplPeer, makeStr("validation")), makeStr("trusted")); +/// Domain-owned bare attrs. 
+inline constexpr auto proposalTrusted = makeStr("proposal_trusted"); +inline constexpr auto validationFull = makeStr("validation_full"); +inline constexpr auto validationTrusted = makeStr("validation_trusted"); } // namespace attr } // namespace xrpl::telemetry::peer_span diff --git a/src/xrpld/rpc/detail/PathFindSpanNames.h b/src/xrpld/rpc/detail/PathFindSpanNames.h index 40c9509cca..af61340b6f 100644 --- a/src/xrpld/rpc/detail/PathFindSpanNames.h +++ b/src/xrpld/rpc/detail/PathFindSpanNames.h @@ -63,24 +63,24 @@ inline constexpr auto rank = makeStr("rank"); // ===== Attribute keys ====================================================== namespace attr { -inline constexpr auto xrplPathfind = join(seg::xrpl, makeStr("pathfind")); - -/// "xrpl.pathfind.source_account" -inline constexpr auto sourceAccount = join(xrplPathfind, makeStr("source_account")); -/// "xrpl.pathfind.dest_account" -inline constexpr auto destAccount = join(xrplPathfind, makeStr("dest_account")); -/// "xrpl.pathfind.fast" -inline constexpr auto fast = join(xrplPathfind, makeStr("fast")); -/// "xrpl.pathfind.search_level" -inline constexpr auto searchLevel = join(xrplPathfind, makeStr("search_level")); -/// "xrpl.pathfind.num_complete_paths" -inline constexpr auto numCompletePaths = join(xrplPathfind, makeStr("num_complete_paths")); -/// "xrpl.pathfind.num_paths" -inline constexpr auto numPaths = join(xrplPathfind, makeStr("num_paths")); -/// "xrpl.pathfind.num_requests" -inline constexpr auto numRequests = join(xrplPathfind, makeStr("num_requests")); -/// "xrpl.pathfind.ledger_index" -inline constexpr auto ledgerIndex = join(xrplPathfind, makeStr("ledger_index")); +/// "source_account" — originating account for path search. +inline constexpr auto sourceAccount = makeStr("source_account"); +/// "dest_account" — destination account. +inline constexpr auto destAccount = makeStr("dest_account"); +/// "fast" — whether fast pathfinding mode enabled. 
+inline constexpr auto fast = makeStr("fast"); +/// "search_level" — depth of graph exploration. +inline constexpr auto searchLevel = makeStr("search_level"); +/// "num_complete_paths" — complete paths found. +inline constexpr auto numCompletePaths = makeStr("num_complete_paths"); +/// "num_paths" — total paths returned. +inline constexpr auto numPaths = makeStr("num_paths"); +/// "num_requests" — active path requests. +inline constexpr auto numRequests = makeStr("num_requests"); +/// "xrpl.pathfind.ledger_index" — kept qualified (rule 5): pathfind target +/// ledger is distinct from xrpl.ledger.seq. +inline constexpr auto ledgerIndex = + join(join(seg::xrpl, makeStr("pathfind")), makeStr("ledger_index")); } // namespace attr } // namespace xrpl::telemetry::pathfind_span diff --git a/src/xrpld/rpc/detail/RPCHandler.cpp b/src/xrpld/rpc/detail/RPCHandler.cpp index b7968ab070..210e0e25fe 100644 --- a/src/xrpld/rpc/detail/RPCHandler.cpp +++ b/src/xrpld/rpc/detail/RPCHandler.cpp @@ -168,11 +168,9 @@ callMethod(JsonContext& context, Method method, std::string const& name, Object& span.setAttribute(rpc_span::attr::command, name.c_str()); span.setAttribute(rpc_span::attr::version, static_cast(context.apiVersion)); span.setAttribute( - rpc_span::attr::role, + rpc_span::attr::rpcRole, context.role == Role::ADMIN ? 
std::string_view(rpc_span::val::admin) : std::string_view(rpc_span::val::user)); - span.setAttribute(attr::nodeAmendmentBlocked, context.app.getOPs().isAmendmentBlocked()); - span.setAttribute(attr::nodeServerState, context.app.getOPs().strOperatingMode()); static std::atomic requestId{0}; auto& perfLog = context.app.getPerfLog(); @@ -189,7 +187,7 @@ callMethod(JsonContext& context, Method method, std::string const& name, Object& JLOG(context.j.debug()) << "RPC call " << name << " completed in " << ((end - start).count() / 1000000000.0) << "seconds"; perfLog.rpcFinish(name, curId); - span.setAttribute(rpc_span::attr::status, rpc_span::val::success); + span.setAttribute(rpc_span::attr::rpcStatus, rpc_span::val::success); return ret; } catch (std::exception& e) @@ -197,7 +195,7 @@ callMethod(JsonContext& context, Method method, std::string const& name, Object& perfLog.rpcError(name, curId); JLOG(context.j.info()) << "Caught throw: " << e.what(); span.recordException(e); - span.setAttribute(rpc_span::attr::status, rpc_span::val::error); + span.setAttribute(rpc_span::attr::rpcStatus, rpc_span::val::error); if (context.loadType == Resource::feeReferenceRPC) context.loadType = Resource::feeExceptionRPC; diff --git a/src/xrpld/rpc/detail/RpcSpanNames.h b/src/xrpld/rpc/detail/RpcSpanNames.h index 76f1c2be75..2be5aa195c 100644 --- a/src/xrpld/rpc/detail/RpcSpanNames.h +++ b/src/xrpld/rpc/detail/RpcSpanNames.h @@ -14,8 +14,99 @@ * auto span = SpanGuard::span( * TraceCategory::Rpc, rpc_span::prefix::command, "submit"); * span.setAttribute(rpc_span::attr::command, "submit"); - * span.setAttribute(rpc_span::attr::status, rpc_span::val::success); + * span.setAttribute(rpc_span::attr::rpcStatus, rpc_span::val::success); * @endcode + * + * Span hierarchy (automatic nesting via OTel thread-local context): + * + * HTTP JSON-RPC path (single request): + * + * +-------------------------------------------------------+ + * | rpc.http_request | + * | 
ServerHandler::processSession(Session) | + * | | + * | +--------------------------------------------------+ | + * | | rpc.process | | + * | | ServerHandler::processRequest() | | + * | | | | + * | | +---------------------------------------------+ | | + * | | | rpc.command.{name} | | | + * | | | RPC::callMethod() | | | + * | | | attrs: command, version, rpc_role, rpc_status | | | + * | | +---------------------------------------------+ | | + * | +--------------------------------------------------+ | + * +-------------------------------------------------------+ + * + * HTTP batch path (multiple commands per request): + * + * +-------------------------------------------------------+ + * | rpc.http_request | + * | | + * | +--------------------------------------------------+ | + * | | rpc.process | | + * | | | | + * | | +------------------+ +------------------+ | | + * | | | rpc.command.{a} | | rpc.command.{b} | ... | | + * | | +------------------+ +------------------+ | | + * | +--------------------------------------------------+ | + * +-------------------------------------------------------+ + * + * WebSocket path: + * + * +-------------------------------------------------------+ + * | rpc.ws_message | + * | ServerHandler::processSession(WSSession) | + * | | + * | +--------------------------------------------------+ | + * | | rpc.command.{name} | | + * | | RPC::callMethod() | | + * | | attrs: command, version, rpc_role, rpc_status | | + * | +--------------------------------------------------+ | + * +-------------------------------------------------------+ + * + * WebSocket error paths: + * + * +-------------------------------------------------------+ + * | rpc.ws_message (error: invalid_json) | + * | ServerHandler::onWSMessage() — parse failure | + * +-------------------------------------------------------+ + * + * +-------------------------------------------------------+ + * | rpc.ws_upgrade | + * | ServerHandler::onHandoff() — upgrade try/catch | + * 
+-------------------------------------------------------+ + * + * Command dispatch error path: + * + * +-------------------------------------------------------+ + * | rpc.command.{name} (error: too_busy/unknown/etc) | + * | RPC::doCommand() — fillHandler() rejection | + * +-------------------------------------------------------+ + * + * gRPC path (see GrpcSpanNames.h for constants): + * + * +-------------------------------------------------------+ + * | grpc.request | + * | CallData::process(coro) | + * | attrs: method, grpc_status | + * +-------------------------------------------------------+ + * + * Covered paths: + * - HTTP JSON-RPC (single and batch requests) + * - WebSocket RPC commands + * - WebSocket message parse errors (invalid JSON, oversized) + * - WebSocket upgrade failures (protocol handshake errors) + * - Admin CLI (connects via HTTP internally) + * - Command dispatch rejections (unknown cmd, too busy, no perm) + * - gRPC endpoints (GetLedger, GetLedgerData, GetLedgerDiff, + * GetLedgerEntry) + * - Command execution: timing, success/failure, exceptions + * - Per-command attributes: name, API version, rpc_role, rpc_status + * + * Known gaps (not yet instrumented): + * - Early validation errors in processRequest() before rpc.process + * span (malformed JSON, auth failures, oversized requests) + * - Subscription push notifications (server-initiated, not RPC) */ #include @@ -43,18 +134,16 @@ inline constexpr auto process = makeStr("process"); // ===== Attribute keys ====================================================== namespace attr { -inline constexpr auto xrplRpc = join(seg::xrpl, seg::rpc); - -/// "xrpl.rpc.command" -inline constexpr auto command = join(xrplRpc, makeStr("command")); -/// "xrpl.rpc.version" -inline constexpr auto version = join(xrplRpc, makeStr("version")); -/// "xrpl.rpc.role" -inline constexpr auto role = join(xrplRpc, makeStr("role")); -/// "xrpl.rpc.status" -inline constexpr auto status = join(xrplRpc, makeStr("status")); -/// 
"xrpl.rpc.payload_size" -inline constexpr auto payloadSize = join(xrplRpc, makeStr("payload_size")); +/// "command" — RPC method name. +inline constexpr auto command = makeStr("command"); +/// "version" — api_version per request. +inline constexpr auto version = makeStr("version"); +/// "rpc_role" — admin|user. Domain-qualified: collides with grpc_role. +inline constexpr auto rpcRole = makeStr("rpc_role"); +/// "rpc_status" — success|error. Domain-qualified: avoids OTel reserved span status. +inline constexpr auto rpcStatus = makeStr("rpc_status"); +/// "request_payload_size" — bytes of inbound request payload. +inline constexpr auto requestPayloadSize = makeStr("request_payload_size"); } // namespace attr // ===== Attribute values ==================================================== diff --git a/src/xrpld/rpc/detail/ServerHandler.cpp b/src/xrpld/rpc/detail/ServerHandler.cpp index 85454e4a29..a181ec56cd 100644 --- a/src/xrpld/rpc/detail/ServerHandler.cpp +++ b/src/xrpld/rpc/detail/ServerHandler.cpp @@ -513,7 +513,7 @@ ServerHandler::processSession( JLOG(m_journal.error()) << "Exception while processing WS: " << ex.what() << "\n" << "Input JSON: " << Json::Compact{Json::Value{jv}}; span.recordException(ex); - span.setAttribute(rpc_span::attr::status, rpc_span::val::error); + span.setAttribute(rpc_span::attr::rpcStatus, rpc_span::val::error); // LCOV_EXCL_STOP } @@ -904,7 +904,7 @@ ServerHandler::processRequest( << "Internal error : " << ex.what() << " when processing request: " << Json::Compact{Json::Value{params}}; span.recordException(ex); - span.setAttribute(rpc_span::attr::status, rpc_span::val::error); + span.setAttribute(rpc_span::attr::rpcStatus, rpc_span::val::error); // LCOV_EXCL_STOP }