mirror of
https://github.com/XRPLF/rippled.git
synced 2026-06-04 17:27:00 +00:00
feat(telemetry): add dashboard panels for new T3 metrics
Visualise the metrics added in this series: - consensus-health: "Ledger History Mismatch Rate by Reason" (xrpld_ledger_history_mismatch_total by reason — fork diagnostics) - fee-market: "Queue Abandonment Rate (Expired)" and "Queue Admission Rejections (Dropped)" (xrpld_txq_expired_total / dropped_total) - peer-network: "Reduce-Relay Peer Selection" and "Reduce-Relay Missing-Tx Frequency" (xrpld_reduce_relay_metrics) - system-node-health: "Ledger Acquire Duration" and "Ledger Acquire Rate by Outcome" (ledger.acquire span) otel-collector-config.yaml: add outcome and acquire_reason spanmetrics dimensions so the ledger.acquire outcome breakdown populates. All panels follow the existing template: $node filter, exported_instance in legends, Title Case, axis labels. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -965,6 +965,45 @@
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Ledger History Mismatch Rate by Reason",
|
||||
"description": "Rate of built-vs-validated ledger mismatches broken down by reason (prior_ledger, close_time, consensus_txset, same_txset_diff_result, different_txset, unknown). Answers WHY the node forked \u2014 Byzantine close-time disagreement vs sync drift vs tx-processing difference.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 96
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "sum by (reason, exported_instance) (rate(xrpld_ledger_history_mismatch_total{exported_instance=~\"$node\"}[5m]))",
|
||||
"legendFormat": "{{reason}} [{{exported_instance}}]"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"custom": {
|
||||
"axisLabel": "Mismatches / Sec",
|
||||
"spanNulls": true,
|
||||
"insertNulls": false,
|
||||
"showPoints": "auto",
|
||||
"pointSize": 3
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
|
||||
@@ -150,6 +150,98 @@
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Reduce-Relay Peer Selection",
|
||||
"description": "Transaction reduce-relay efficiency: peers selected as relay sources vs suppressed, plus peers with the feature disabled. A high suppressed:selected ratio proves reduce-relay is saving bandwidth; a high not_enabled count means stale peers force full relay.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 16
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "xrpld_reduce_relay_metrics{metric=\"selected_peers\",exported_instance=~\"$node\"}",
|
||||
"legendFormat": "Selected [{{exported_instance}}]"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "xrpld_reduce_relay_metrics{metric=\"suppressed_peers\",exported_instance=~\"$node\"}",
|
||||
"legendFormat": "Suppressed [{{exported_instance}}]"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "xrpld_reduce_relay_metrics{metric=\"not_enabled_peers\",exported_instance=~\"$node\"}",
|
||||
"legendFormat": "Not Enabled [{{exported_instance}}]"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "none",
|
||||
"custom": {
|
||||
"axisLabel": "Peer Count",
|
||||
"spanNulls": true,
|
||||
"insertNulls": false,
|
||||
"showPoints": "auto",
|
||||
"pointSize": 3
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Reduce-Relay Missing-Tx Frequency",
|
||||
"description": "Frequency of on-demand transaction fetches triggered when a peer is missing a relayed transaction. A rising value means the suppression is too aggressive and the on-demand fetch path is growing.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 16
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "xrpld_reduce_relay_metrics{metric=\"missing_tx_freq\",exported_instance=~\"$node\"}",
|
||||
"legendFormat": "Missing Tx Freq [{{exported_instance}}]"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "none",
|
||||
"custom": {
|
||||
"axisLabel": "Frequency",
|
||||
"spanNulls": true,
|
||||
"insertNulls": false,
|
||||
"showPoints": "auto",
|
||||
"pointSize": 3
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
|
||||
@@ -2051,6 +2051,91 @@
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Ledger Acquire Duration (Inbound Fetch)",
|
||||
"description": "p95/p50 duration of the ledger.acquire span (InboundLedger): how long it takes to fetch a missing ledger from peers. A spike signals the node is falling behind or recovering from a fork. Populated under back-fill / sync activity.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 126
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"ledger.acquire\"}[5m])))",
|
||||
"legendFormat": "P95 Acquire [{{exported_instance}}]"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"ledger.acquire\"}[5m])))",
|
||||
"legendFormat": "P50 Acquire [{{exported_instance}}]"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ms",
|
||||
"custom": {
|
||||
"axisLabel": "Duration (ms)",
|
||||
"spanNulls": true,
|
||||
"insertNulls": false,
|
||||
"showPoints": "auto",
|
||||
"pointSize": 3
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Ledger Acquire Rate by Outcome",
|
||||
"description": "Rate of completed ledger.acquire spans broken down by outcome (complete / failed). A rising failed rate indicates the node cannot fetch needed ledgers from its peers. Requires the outcome spanmetrics dimension.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 126
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "sum by (outcome, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"ledger.acquire\"}[5m]))",
|
||||
"legendFormat": "{{outcome}} [{{exported_instance}}]"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"custom": {
|
||||
"axisLabel": "Acquisitions / Sec",
|
||||
"spanNulls": true,
|
||||
"insertNulls": false,
|
||||
"showPoints": "auto",
|
||||
"pointSize": 3
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
|
||||
@@ -375,6 +375,84 @@
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Queue Abandonment Rate (Expired)",
|
||||
"description": "Rate of transactions expired out of the queue (LastLedgerSequence passed). Rising expiry means submitters under-bid the escalating fee and gave up \u2014 a demand-frustration signal.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 32
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "sum by (exported_instance) (rate(xrpld_txq_expired_total{exported_instance=~\"$node\"}[5m]))",
|
||||
"legendFormat": "Expired / Sec [{{exported_instance}}]"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"custom": {
|
||||
"axisLabel": "Expired / Sec",
|
||||
"spanNulls": true,
|
||||
"insertNulls": false,
|
||||
"showPoints": "auto",
|
||||
"pointSize": 3
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Queue Admission Rejections (Dropped)",
|
||||
"description": "Rate of transactions refused admission to the queue, by reason. queue_full means the queue is at capacity \u2014 admission-control backpressure distinct from expiry and from job-queue overflow.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 32
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "sum by (reason, exported_instance) (rate(xrpld_txq_dropped_total{exported_instance=~\"$node\"}[5m]))",
|
||||
"legendFormat": "{{reason}} [{{exported_instance}}]"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"custom": {
|
||||
"axisLabel": "Dropped / Sec",
|
||||
"spanNulls": true,
|
||||
"insertNulls": false,
|
||||
"showPoints": "auto",
|
||||
"pointSize": 3
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
|
||||
@@ -106,6 +106,9 @@ connectors:
|
||||
- name: method
|
||||
- name: grpc_role
|
||||
- name: grpc_status
|
||||
# ledger.acquire dimensions (bounded: outcome, acquire reason).
|
||||
- name: outcome
|
||||
- name: acquire_reason
|
||||
|
||||
exporters:
|
||||
debug:
|
||||
|
||||
Reference in New Issue
Block a user