feat(telemetry): add dashboard panels for new T3 metrics

Visualise the metrics added in this series:
- consensus-health: "Ledger History Mismatch Rate by Reason"
  (xrpld_ledger_history_mismatch_total by reason — fork diagnostics)
- fee-market: "Queue Abandonment Rate (Expired)" and "Queue Admission
  Rejections (Dropped)" (xrpld_txq_expired_total / xrpld_txq_dropped_total)
- peer-network: "Reduce-Relay Peer Selection" and "Reduce-Relay Missing-Tx
  Frequency" (xrpld_reduce_relay_metrics)
- system-node-health: "Ledger Acquire Duration (Inbound Fetch)" and
  "Ledger Acquire Rate by Outcome" (ledger.acquire span)

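otel-collector-config.yaml: add outcome and acquire_reason spanmetrics
dimensions. outcome populates the "Ledger Acquire Rate by Outcome"
breakdown; acquire_reason is exported for ad-hoc queries and is not yet
used by any panel in this commit.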

All panels follow the existing template: $node filter, exported_instance in
legends, Title Case, axis labels.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Pratik Mankawde
2026-06-04 16:16:55 +01:00
parent 6205199dc7
commit 7d8e908879
5 changed files with 297 additions and 0 deletions

View File

@@ -965,6 +965,45 @@
},
"overrides": []
}
},
{
"title": "Ledger History Mismatch Rate by Reason",
"description": "Rate of built-vs-validated ledger mismatches broken down by reason (prior_ledger, close_time, consensus_txset, same_txset_diff_result, different_txset, unknown). Answers WHY the node forked \u2014 Byzantine close-time disagreement vs sync drift vs tx-processing difference.",
"type": "timeseries",
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 96
},
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"targets": [
{
"datasource": {
"type": "prometheus"
},
"expr": "sum by (reason, exported_instance) (rate(xrpld_ledger_history_mismatch_total{exported_instance=~\"$node\"}[5m]))",
"legendFormat": "{{reason}} [{{exported_instance}}]"
}
],
"fieldConfig": {
"defaults": {
"unit": "ops",
"custom": {
"axisLabel": "Mismatches / Sec",
"spanNulls": true,
"insertNulls": false,
"showPoints": "auto",
"pointSize": 3
}
},
"overrides": []
}
}
],
"schemaVersion": 39,

View File

@@ -150,6 +150,98 @@
},
"overrides": []
}
},
{
"title": "Reduce-Relay Peer Selection",
"description": "Transaction reduce-relay efficiency: peers selected as relay sources vs suppressed, plus peers with the feature disabled. A high suppressed:selected ratio proves reduce-relay is saving bandwidth; a high not_enabled count means stale peers force full relay.",
"type": "timeseries",
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 16
},
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"targets": [
{
"datasource": {
"type": "prometheus"
},
"expr": "xrpld_reduce_relay_metrics{metric=\"selected_peers\",exported_instance=~\"$node\"}",
"legendFormat": "Selected [{{exported_instance}}]"
},
{
"datasource": {
"type": "prometheus"
},
"expr": "xrpld_reduce_relay_metrics{metric=\"suppressed_peers\",exported_instance=~\"$node\"}",
"legendFormat": "Suppressed [{{exported_instance}}]"
},
{
"datasource": {
"type": "prometheus"
},
"expr": "xrpld_reduce_relay_metrics{metric=\"not_enabled_peers\",exported_instance=~\"$node\"}",
"legendFormat": "Not Enabled [{{exported_instance}}]"
}
],
"fieldConfig": {
"defaults": {
"unit": "none",
"custom": {
"axisLabel": "Peer Count",
"spanNulls": true,
"insertNulls": false,
"showPoints": "auto",
"pointSize": 3
}
},
"overrides": []
}
},
{
"title": "Reduce-Relay Missing-Tx Frequency",
"description": "Frequency of on-demand transaction fetches triggered when a peer is missing a relayed transaction. A rising value means the suppression is too aggressive and the on-demand fetch path is growing.",
"type": "timeseries",
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 16
},
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"targets": [
{
"datasource": {
"type": "prometheus"
},
"expr": "xrpld_reduce_relay_metrics{metric=\"missing_tx_freq\",exported_instance=~\"$node\"}",
"legendFormat": "Missing Tx Freq [{{exported_instance}}]"
}
],
"fieldConfig": {
"defaults": {
"unit": "none",
"custom": {
"axisLabel": "Frequency",
"spanNulls": true,
"insertNulls": false,
"showPoints": "auto",
"pointSize": 3
}
},
"overrides": []
}
}
],
"schemaVersion": 39,

View File

@@ -2051,6 +2051,91 @@
},
"overrides": []
}
},
{
"title": "Ledger Acquire Duration (Inbound Fetch)",
"description": "p95/p50 duration of the ledger.acquire span (InboundLedger): how long it takes to fetch a missing ledger from peers. A spike signals the node is falling behind or recovering from a fork. Populated under back-fill / sync activity.",
"type": "timeseries",
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 126
},
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"targets": [
{
"datasource": {
"type": "prometheus"
},
"expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"ledger.acquire\"}[5m])))",
"legendFormat": "P95 Acquire [{{exported_instance}}]"
},
{
"datasource": {
"type": "prometheus"
},
"expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"ledger.acquire\"}[5m])))",
"legendFormat": "P50 Acquire [{{exported_instance}}]"
}
],
"fieldConfig": {
"defaults": {
"unit": "ms",
"custom": {
"axisLabel": "Duration (ms)",
"spanNulls": true,
"insertNulls": false,
"showPoints": "auto",
"pointSize": 3
}
},
"overrides": []
}
},
{
"title": "Ledger Acquire Rate by Outcome",
"description": "Rate of completed ledger.acquire spans broken down by outcome (complete / failed). A rising failed rate indicates the node cannot fetch needed ledgers from its peers. Requires the outcome spanmetrics dimension.",
"type": "timeseries",
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 126
},
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"targets": [
{
"datasource": {
"type": "prometheus"
},
"expr": "sum by (outcome, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"ledger.acquire\"}[5m]))",
"legendFormat": "{{outcome}} [{{exported_instance}}]"
}
],
"fieldConfig": {
"defaults": {
"unit": "ops",
"custom": {
"axisLabel": "Acquisitions / Sec",
"spanNulls": true,
"insertNulls": false,
"showPoints": "auto",
"pointSize": 3
}
},
"overrides": []
}
}
],
"schemaVersion": 39,

View File

@@ -375,6 +375,84 @@
},
"overrides": []
}
},
{
"title": "Queue Abandonment Rate (Expired)",
"description": "Rate of transactions expired out of the queue (LastLedgerSequence passed). Rising expiry means submitters under-bid the escalating fee and gave up \u2014 a demand-frustration signal.",
"type": "timeseries",
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 32
},
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"targets": [
{
"datasource": {
"type": "prometheus"
},
"expr": "sum by (exported_instance) (rate(xrpld_txq_expired_total{exported_instance=~\"$node\"}[5m]))",
"legendFormat": "Expired / Sec [{{exported_instance}}]"
}
],
"fieldConfig": {
"defaults": {
"unit": "ops",
"custom": {
"axisLabel": "Expired / Sec",
"spanNulls": true,
"insertNulls": false,
"showPoints": "auto",
"pointSize": 3
}
},
"overrides": []
}
},
{
"title": "Queue Admission Rejections (Dropped)",
"description": "Rate of transactions refused admission to the queue, by reason. queue_full means the queue is at capacity \u2014 admission-control backpressure distinct from expiry and from job-queue overflow.",
"type": "timeseries",
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 32
},
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"targets": [
{
"datasource": {
"type": "prometheus"
},
"expr": "sum by (reason, exported_instance) (rate(xrpld_txq_dropped_total{exported_instance=~\"$node\"}[5m]))",
"legendFormat": "{{reason}} [{{exported_instance}}]"
}
],
"fieldConfig": {
"defaults": {
"unit": "ops",
"custom": {
"axisLabel": "Dropped / Sec",
"spanNulls": true,
"insertNulls": false,
"showPoints": "auto",
"pointSize": 3
}
},
"overrides": []
}
}
],
"schemaVersion": 39,

View File

@@ -106,6 +106,9 @@ connectors:
- name: method
- name: grpc_role
- name: grpc_status
# ledger.acquire dimensions: outcome and acquire_reason (both bounded-cardinality).
- name: outcome
- name: acquire_reason
exporters:
debug: