Add Grafana dashboard template variables for node and metric filtering

- Add resource_metrics_key_attributes to spanmetrics connector so span
  metrics are keyed by service.instance.id, which the dashboards read as
  the service_instance_id Prometheus label for per-node filtering
- Add 'node' dropdown (service_instance_id) to all 3 dashboards
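- Add 'command' dropdown (xrpl_rpc_command) to RPC Performance and filter
  its panel queries on $command
- Add 'tx_origin' dropdown (xrpl_tx_local) to Transaction Overview
  (variable only; no panel query references $tx_origin yet)
- Add 'consensus_mode' dropdown (xrpl_consensus_mode) to Consensus Health
  (variable only; no panel query references $consensus_mode yet)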
- Update all panel PromQL queries to include $node filter

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Pratik Mankawde
2026-03-06 17:33:56 +00:00
parent 3c550ea6ce
commit 0ba31c88cf
4 changed files with 325 additions and 62 deletions

View File

@@ -1,5 +1,7 @@
{
"annotations": { "list": [] },
"annotations": {
"list": []
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
@@ -9,16 +11,25 @@
{
"title": "Consensus Round Duration",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"targets": [
{
"datasource": { "type": "prometheus" },
"expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"consensus.accept\"}[5m])))",
"datasource": {
"type": "prometheus"
},
"expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{service_instance_id=~\"$node\", span_name=\"consensus.accept\"}[5m])))",
"legendFormat": "p95 round duration"
},
{
"datasource": { "type": "prometheus" },
"expr": "histogram_quantile(0.50, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"consensus.accept\"}[5m])))",
"datasource": {
"type": "prometheus"
},
"expr": "histogram_quantile(0.50, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{service_instance_id=~\"$node\", span_name=\"consensus.accept\"}[5m])))",
"legendFormat": "p50 round duration"
}
],
@@ -32,11 +43,18 @@
{
"title": "Consensus Proposals Sent Rate",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
},
"targets": [
{
"datasource": { "type": "prometheus" },
"expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"consensus.proposal.send\"}[5m]))",
"datasource": {
"type": "prometheus"
},
"expr": "sum(rate(traces_span_metrics_calls_total{service_instance_id=~\"$node\", span_name=\"consensus.proposal.send\"}[5m]))",
"legendFormat": "proposals/sec"
}
],
@@ -50,11 +68,18 @@
{
"title": "Ledger Close Duration",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 8
},
"targets": [
{
"datasource": { "type": "prometheus" },
"expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"consensus.ledger_close\"}[5m])))",
"datasource": {
"type": "prometheus"
},
"expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{service_instance_id=~\"$node\", span_name=\"consensus.ledger_close\"}[5m])))",
"legendFormat": "p95 close duration"
}
],
@@ -68,11 +93,18 @@
{
"title": "Validation Send Rate",
"type": "stat",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 8
},
"targets": [
{
"datasource": { "type": "prometheus" },
"expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"consensus.validation.send\"}[5m]))",
"datasource": {
"type": "prometheus"
},
"expr": "sum(rate(traces_span_metrics_calls_total{service_instance_id=~\"$node\", span_name=\"consensus.validation.send\"}[5m]))",
"legendFormat": "validations/sec"
}
],
@@ -87,16 +119,25 @@
"title": "Ledger Apply Duration (doAccept)",
"description": "Time spent applying the consensus result to build a new ledger. Measured by the consensus.accept.apply span in doAccept().",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 },
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 16
},
"targets": [
{
"datasource": { "type": "prometheus" },
"expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"consensus.accept.apply\"}[5m])))",
"datasource": {
"type": "prometheus"
},
"expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{service_instance_id=~\"$node\", span_name=\"consensus.accept.apply\"}[5m])))",
"legendFormat": "p95 apply duration"
},
{
"datasource": { "type": "prometheus" },
"expr": "histogram_quantile(0.50, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"consensus.accept.apply\"}[5m])))",
"datasource": {
"type": "prometheus"
},
"expr": "histogram_quantile(0.50, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{service_instance_id=~\"$node\", span_name=\"consensus.accept.apply\"}[5m])))",
"legendFormat": "p50 apply duration"
}
],
@@ -111,11 +152,18 @@
"title": "Close Time Agreement",
"description": "Rate of close time agreement vs disagreement across consensus rounds. Based on xrpl.consensus.close_time_correct attribute (true = validators agreed, false = agreed to disagree per avCT_CONSENSUS_PCT).",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 },
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 16
},
"targets": [
{
"datasource": { "type": "prometheus" },
"expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"consensus.accept.apply\"}[5m]))",
"datasource": {
"type": "prometheus"
},
"expr": "sum(rate(traces_span_metrics_calls_total{service_instance_id=~\"$node\", span_name=\"consensus.accept.apply\"}[5m]))",
"legendFormat": "total rounds/sec"
}
],
@@ -129,8 +177,54 @@
],
"schemaVersion": 39,
"tags": ["rippled", "consensus", "telemetry"],
"templating": { "list": [] },
"time": { "from": "now-1h", "to": "now" },
"templating": {
"list": [
{
"name": "node",
"label": "Node",
"description": "Filter by rippled node (service.instance.id \u2014 defaults to node public key)",
"type": "query",
"query": "label_values(traces_span_metrics_calls_total, service_instance_id)",
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"includeAll": true,
"allValue": ".*",
"current": {
"text": "All",
"value": "$__all"
},
"multi": true,
"refresh": 2,
"sort": 1
},
{
"name": "consensus_mode",
"label": "Consensus Mode",
"description": "Filter by consensus mode (proposing, observing, wrongLedger, switchedLedger)",
"type": "query",
"query": "label_values(traces_span_metrics_calls_total{span_name=\"consensus.ledger_close\"}, xrpl_consensus_mode)",
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"includeAll": true,
"allValue": ".*",
"current": {
"text": "All",
"value": "$__all"
},
"multi": true,
"refresh": 2,
"sort": 1
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"title": "rippled Consensus Health",
"uid": "rippled-consensus"
}

View File

@@ -1,5 +1,7 @@
{
"annotations": { "list": [] },
"annotations": {
"list": []
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
@@ -9,11 +11,18 @@
{
"title": "RPC Request Rate by Command",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"targets": [
{
"datasource": { "type": "prometheus" },
"expr": "sum by (xrpl_rpc_command) (rate(traces_span_metrics_calls_total{span_name=~\"rpc.command.*\"}[5m]))",
"datasource": {
"type": "prometheus"
},
"expr": "sum by (xrpl_rpc_command) (rate(traces_span_metrics_calls_total{xrpl_rpc_command=~\"$command\", service_instance_id=~\"$node\", span_name=~\"rpc.command.*\"}[5m]))",
"legendFormat": "{{xrpl_rpc_command}}"
}
],
@@ -27,11 +36,18 @@
{
"title": "RPC Latency p95 by Command",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
},
"targets": [
{
"datasource": { "type": "prometheus" },
"expr": "histogram_quantile(0.95, sum by (le, xrpl_rpc_command) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=~\"rpc.command.*\"}[5m])))",
"datasource": {
"type": "prometheus"
},
"expr": "histogram_quantile(0.95, sum by (le, xrpl_rpc_command) (rate(traces_span_metrics_duration_milliseconds_bucket{xrpl_rpc_command=~\"$command\", service_instance_id=~\"$node\", span_name=~\"rpc.command.*\"}[5m])))",
"legendFormat": "p95 {{xrpl_rpc_command}}"
}
],
@@ -45,11 +61,18 @@
{
"title": "RPC Error Rate",
"type": "bargauge",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 8
},
"targets": [
{
"datasource": { "type": "prometheus" },
"expr": "sum by (xrpl_rpc_command) (rate(traces_span_metrics_calls_total{span_name=~\"rpc.command.*\", status_code=\"STATUS_CODE_ERROR\"}[5m])) / sum by (xrpl_rpc_command) (rate(traces_span_metrics_calls_total{span_name=~\"rpc.command.*\"}[5m])) * 100",
"datasource": {
"type": "prometheus"
},
"expr": "sum by (xrpl_rpc_command) (rate(traces_span_metrics_calls_total{xrpl_rpc_command=~\"$command\", service_instance_id=~\"$node\", span_name=~\"rpc.command.*\", status_code=\"STATUS_CODE_ERROR\"}[5m])) / sum by (xrpl_rpc_command) (rate(traces_span_metrics_calls_total{service_instance_id=~\"$node\", xrpl_rpc_command=~\"$command\", span_name=~\"rpc.command.*\"}[5m])) * 100",
"legendFormat": "{{xrpl_rpc_command}}"
}
],
@@ -58,9 +81,18 @@
"unit": "percent",
"thresholds": {
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 1 },
{ "color": "red", "value": 5 }
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "red",
"value": 5
}
]
}
},
@@ -70,11 +102,18 @@
{
"title": "RPC Latency Heatmap",
"type": "heatmap",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 8
},
"targets": [
{
"datasource": { "type": "prometheus" },
"expr": "sum(increase(traces_span_metrics_duration_milliseconds_bucket{span_name=~\"rpc.command.*\"}[5m])) by (le)",
"datasource": {
"type": "prometheus"
},
"expr": "sum(increase(traces_span_metrics_duration_milliseconds_bucket{xrpl_rpc_command=~\"$command\", service_instance_id=~\"$node\", span_name=~\"rpc.command.*\"}[5m])) by (le)",
"legendFormat": "{{le}}",
"format": "heatmap"
}
@@ -83,8 +122,54 @@
],
"schemaVersion": 39,
"tags": ["rippled", "rpc", "telemetry"],
"templating": { "list": [] },
"time": { "from": "now-1h", "to": "now" },
"templating": {
"list": [
{
"name": "node",
"label": "Node",
"description": "Filter by rippled node (service.instance.id \u2014 defaults to node public key)",
"type": "query",
"query": "label_values(traces_span_metrics_calls_total, service_instance_id)",
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"includeAll": true,
"allValue": ".*",
"current": {
"text": "All",
"value": "$__all"
},
"multi": true,
"refresh": 2,
"sort": 1
},
{
"name": "command",
"label": "RPC Command",
"description": "Filter by RPC command name (e.g., server_info, submit)",
"type": "query",
"query": "label_values(traces_span_metrics_calls_total{span_name=~\"rpc.command.*\"}, xrpl_rpc_command)",
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"includeAll": true,
"allValue": ".*",
"current": {
"text": "All",
"value": "$__all"
},
"multi": true,
"refresh": 2,
"sort": 1
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"title": "rippled RPC Performance",
"uid": "rippled-rpc-perf"
}

View File

@@ -1,5 +1,7 @@
{
"annotations": { "list": [] },
"annotations": {
"list": []
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
@@ -9,16 +11,25 @@
{
"title": "Transaction Processing Rate",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"targets": [
{
"datasource": { "type": "prometheus" },
"expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"tx.process\"}[5m]))",
"datasource": {
"type": "prometheus"
},
"expr": "sum(rate(traces_span_metrics_calls_total{service_instance_id=~\"$node\", span_name=\"tx.process\"}[5m]))",
"legendFormat": "tx.process/sec"
},
{
"datasource": { "type": "prometheus" },
"expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"tx.receive\"}[5m]))",
"datasource": {
"type": "prometheus"
},
"expr": "sum(rate(traces_span_metrics_calls_total{service_instance_id=~\"$node\", span_name=\"tx.receive\"}[5m]))",
"legendFormat": "tx.receive/sec"
}
],
@@ -32,16 +43,25 @@
{
"title": "Transaction Processing Latency",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
},
"targets": [
{
"datasource": { "type": "prometheus" },
"expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"tx.process\"}[5m])))",
"datasource": {
"type": "prometheus"
},
"expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{service_instance_id=~\"$node\", span_name=\"tx.process\"}[5m])))",
"legendFormat": "p95"
},
{
"datasource": { "type": "prometheus" },
"expr": "histogram_quantile(0.50, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"tx.process\"}[5m])))",
"datasource": {
"type": "prometheus"
},
"expr": "histogram_quantile(0.50, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{service_instance_id=~\"$node\", span_name=\"tx.process\"}[5m])))",
"legendFormat": "p50"
}
],
@@ -55,11 +75,18 @@
{
"title": "Transaction Path Distribution",
"type": "piechart",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 8
},
"targets": [
{
"datasource": { "type": "prometheus" },
"expr": "sum by (xrpl_tx_local) (rate(traces_span_metrics_calls_total{span_name=\"tx.process\"}[5m]))",
"datasource": {
"type": "prometheus"
},
"expr": "sum by (xrpl_tx_local) (rate(traces_span_metrics_calls_total{service_instance_id=~\"$node\", span_name=\"tx.process\"}[5m]))",
"legendFormat": "local={{xrpl_tx_local}}"
}
]
@@ -67,11 +94,18 @@
{
"title": "Transaction Receive vs Suppressed",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 8
},
"targets": [
{
"datasource": { "type": "prometheus" },
"expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"tx.receive\"}[5m]))",
"datasource": {
"type": "prometheus"
},
"expr": "sum(rate(traces_span_metrics_calls_total{service_instance_id=~\"$node\", span_name=\"tx.receive\"}[5m]))",
"legendFormat": "total received"
}
],
@@ -85,8 +119,54 @@
],
"schemaVersion": 39,
"tags": ["rippled", "transactions", "telemetry"],
"templating": { "list": [] },
"time": { "from": "now-1h", "to": "now" },
"templating": {
"list": [
{
"name": "node",
"label": "Node",
"description": "Filter by rippled node (service.instance.id \u2014 defaults to node public key)",
"type": "query",
"query": "label_values(traces_span_metrics_calls_total, service_instance_id)",
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"includeAll": true,
"allValue": ".*",
"current": {
"text": "All",
"value": "$__all"
},
"multi": true,
"refresh": 2,
"sort": 1
},
{
"name": "tx_origin",
"label": "TX Origin",
"description": "Filter by transaction origin (true = local submit, false = peer relay)",
"type": "query",
"query": "label_values(traces_span_metrics_calls_total{span_name=\"tx.process\"}, xrpl_tx_local)",
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"includeAll": true,
"allValue": ".*",
"current": {
"text": "All",
"value": "$__all"
},
"multi": true,
"refresh": 2,
"sort": 1
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"title": "rippled Transaction Overview",
"uid": "rippled-transactions"
}

View File

@@ -23,6 +23,10 @@ processors:
connectors:
spanmetrics:
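# Key the generated span metrics by service.instance.id (node public key) so
# each node keeps its own metric streams for per-node Grafana filtering.
# NOTE(review): this setting only selects which resource attributes form the
# resource-metrics key; confirm that the exporter (e.g. Prometheus
# resource_to_telemetry_conversion) or a `dimensions` entry actually emits
# the service_instance_id label that the dashboards filter on.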
resource_metrics_key_attributes:
- service.instance.id
histogram:
explicit:
buckets: [1ms, 5ms, 10ms, 25ms, 50ms, 100ms, 250ms, 500ms, 1s, 5s]