fix(telemetry): align consensus_mode spanmetrics label with emitted attribute

The spanmetrics connector dimension was `xrpl.consensus.mode`, but the code
emits the span attribute under the bare key `consensus_mode` (matching every
other dimension after the Phase 6 rename). The mismatch left the
`xrpl_consensus_mode` Prometheus label empty, so the Consensus Health
"Consensus Mode Over Time" panel and the `$consensus_mode` template variable
(which filters every panel) matched no live series.

- otel-collector-config.yaml: dimension `xrpl.consensus.mode` -> `consensus_mode`
- consensus-health.json: 11 label refs `xrpl_consensus_mode` -> `consensus_mode`
  (the `$consensus_mode` Grafana variable name is unchanged)
- telemetry-runbook.md: refresh the stale spanmetrics label table to the bare
  names actually emitted (command/rpc_status/consensus_mode/local/
  proposal_trusted/validation_trusted), fix dotted->bare attribute names in
  span tables and TraceQL examples (tx_hash, ledger_seq, consensus_round_id,
  consensus_ledger_id, consensus_round, tx_id event attr), correct the
  consensus_round_id query to int (not quoted string), and fix the
  load_type value query ("exception_rpc" -> "exceptioned RPC").

Verified against the live stack: Tempo span tags confirm bare attribute keys
(consensus_mode, ledger_seq, tx_hash, ...); the populated xrpl_consensus_mode
series in Prometheus is stale retained data from an older build.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Pratik Mankawde
2026-06-04 15:29:45 +01:00
parent 194f5b8af8
commit 4174aef07b
3 changed files with 105 additions and 105 deletions

View File

@@ -29,14 +29,14 @@
"datasource": {
"type": "prometheus"
},
"expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", xrpl_consensus_mode=~\"$consensus_mode\", span_name=\"consensus.accept\"}[5m])))",
"expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.accept\"}[5m])))",
"legendFormat": "P95 Round Duration [{{exported_instance}}]"
},
{
"datasource": {
"type": "prometheus"
},
"expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", xrpl_consensus_mode=~\"$consensus_mode\", span_name=\"consensus.accept\"}[5m])))",
"expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.accept\"}[5m])))",
"legendFormat": "P50 Round Duration [{{exported_instance}}]"
}
],
@@ -75,7 +75,7 @@
"datasource": {
"type": "prometheus"
},
"expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_consensus_mode=~\"$consensus_mode\", span_name=\"consensus.proposal.send\"}[5m]))",
"expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.proposal.send\"}[5m]))",
"legendFormat": "Proposals / Sec [{{exported_instance}}]"
}
],
@@ -114,7 +114,7 @@
"datasource": {
"type": "prometheus"
},
"expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", xrpl_consensus_mode=~\"$consensus_mode\", span_name=\"consensus.ledger_close\"}[5m])))",
"expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.ledger_close\"}[5m])))",
"legendFormat": "P95 Close Duration [{{exported_instance}}]"
}
],
@@ -153,7 +153,7 @@
"datasource": {
"type": "prometheus"
},
"expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_consensus_mode=~\"$consensus_mode\", span_name=\"consensus.validation.send\"}[5m]))",
"expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.validation.send\"}[5m]))",
"legendFormat": "Validations / Sec [{{exported_instance}}]"
}
],
@@ -179,14 +179,14 @@
"datasource": {
"type": "prometheus"
},
"expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", xrpl_consensus_mode=~\"$consensus_mode\", span_name=\"consensus.accept.apply\"}[5m])))",
"expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.accept.apply\"}[5m])))",
"legendFormat": "P95 Apply Duration [{{exported_instance}}]"
},
{
"datasource": {
"type": "prometheus"
},
"expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", xrpl_consensus_mode=~\"$consensus_mode\", span_name=\"consensus.accept.apply\"}[5m])))",
"expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.accept.apply\"}[5m])))",
"legendFormat": "P50 Apply Duration [{{exported_instance}}]"
}
],
@@ -219,7 +219,7 @@
"datasource": {
"type": "prometheus"
},
"expr": "sum by (close_time_correct, exported_instance) (rate(traces_span_metrics_calls_total{span_name=\"consensus.accept.apply\", xrpl_consensus_mode=~\"$consensus_mode\", exported_instance=~\"$node\"}[$__rate_interval]))",
"expr": "sum by (close_time_correct, exported_instance) (rate(traces_span_metrics_calls_total{span_name=\"consensus.accept.apply\", consensus_mode=~\"$consensus_mode\", exported_instance=~\"$node\"}[$__rate_interval]))",
"legendFormat": "Close Time Correct={{close_time_correct}} [{{exported_instance}}]"
}
],
@@ -258,8 +258,8 @@
"datasource": {
"type": "prometheus"
},
"expr": "sum by (xrpl_consensus_mode, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_consensus_mode=~\"$consensus_mode\", span_name=\"consensus.ledger_close\"}[5m]))",
"legendFormat": "{{xrpl_consensus_mode}} [{{exported_instance}}]"
"expr": "sum by (consensus_mode, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.ledger_close\"}[5m]))",
"legendFormat": "{{consensus_mode}} [{{exported_instance}}]"
}
],
"fieldConfig": {
@@ -773,7 +773,7 @@
"label": "Consensus Mode",
"description": "Filter by consensus mode (Proposing, Observing, Wrong Ledger, Switched Ledger)",
"type": "query",
"query": "label_values(traces_span_metrics_calls_total{span_name=\"consensus.ledger_close\"}, xrpl_consensus_mode)",
"query": "label_values(traces_span_metrics_calls_total{span_name=\"consensus.ledger_close\"}, consensus_mode)",
"datasource": {
"type": "prometheus",
"uid": "prometheus"

View File

@@ -51,7 +51,7 @@ connectors:
dimensions:
- name: command
- name: rpc_status
- name: xrpl.consensus.mode
- name: consensus_mode
- name: close_time_correct
- name: local
- name: suppressed