From 0ba31c88cfdca3a8d030e11005f742e17e439d20 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Fri, 6 Mar 2026 17:33:56 +0000 Subject: [PATCH] Add Grafana dashboard template variables for node and metric filtering - Add resource_metrics_key_attributes to spanmetrics connector so service.instance.id becomes a Prometheus label for per-node filtering - Add 'node' dropdown (service_instance_id) to all 3 dashboards - Add 'command' dropdown (xrpl_rpc_command) to RPC Performance - Add 'tx_origin' dropdown (xrpl_tx_local) to Transaction Overview - Add 'consensus_mode' dropdown (xrpl_consensus_mode) to Consensus Health - Update all panel PromQL queries to include $node filter Co-Authored-By: Claude Opus 4.6 --- .../grafana/dashboards/consensus-health.json | 144 +++++++++++++++--- .../grafana/dashboards/rpc-performance.json | 121 ++++++++++++--- .../dashboards/transaction-overview.json | 118 +++++++++++--- docker/telemetry/otel-collector-config.yaml | 4 + 4 files changed, 325 insertions(+), 62 deletions(-) diff --git a/docker/telemetry/grafana/dashboards/consensus-health.json b/docker/telemetry/grafana/dashboards/consensus-health.json index d9fe94248f..00808ddc15 100644 --- a/docker/telemetry/grafana/dashboards/consensus-health.json +++ b/docker/telemetry/grafana/dashboards/consensus-health.json @@ -1,5 +1,7 @@ { - "annotations": { "list": [] }, + "annotations": { + "list": [] + }, "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 1, @@ -9,16 +11,25 @@ { "title": "Consensus Round Duration", "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, "targets": [ { - "datasource": { "type": "prometheus" }, - "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"consensus.accept\"}[5m])))", + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le) 
(rate(traces_span_metrics_duration_milliseconds_bucket{service_instance_id=~\"$node\", span_name=\"consensus.accept\"}[5m])))", "legendFormat": "p95 round duration" }, { - "datasource": { "type": "prometheus" }, - "expr": "histogram_quantile(0.50, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"consensus.accept\"}[5m])))", + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.50, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{service_instance_id=~\"$node\", span_name=\"consensus.accept\"}[5m])))", "legendFormat": "p50 round duration" } ], @@ -32,11 +43,18 @@ { "title": "Consensus Proposals Sent Rate", "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, "targets": [ { - "datasource": { "type": "prometheus" }, - "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"consensus.proposal.send\"}[5m]))", + "datasource": { + "type": "prometheus" + }, + "expr": "sum(rate(traces_span_metrics_calls_total{service_instance_id=~\"$node\", span_name=\"consensus.proposal.send\"}[5m]))", "legendFormat": "proposals/sec" } ], @@ -50,11 +68,18 @@ { "title": "Ledger Close Duration", "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, "targets": [ { - "datasource": { "type": "prometheus" }, - "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"consensus.ledger_close\"}[5m])))", + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{service_instance_id=~\"$node\", xrpl_consensus_mode=~\"$consensus_mode\", span_name=\"consensus.ledger_close\"}[5m])))", "legendFormat": "p95 close duration" } ], @@ -68,11 +93,18 @@ { "title": "Validation Send Rate", "type": "stat", - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, + "gridPos": { + "h": 8, 
+ "w": 12, + "x": 12, + "y": 8 + }, "targets": [ { - "datasource": { "type": "prometheus" }, - "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"consensus.validation.send\"}[5m]))", + "datasource": { + "type": "prometheus" + }, + "expr": "sum(rate(traces_span_metrics_calls_total{service_instance_id=~\"$node\", span_name=\"consensus.validation.send\"}[5m]))", "legendFormat": "validations/sec" } ], @@ -87,16 +119,25 @@ "title": "Ledger Apply Duration (doAccept)", "description": "Time spent applying the consensus result to build a new ledger. Measured by the consensus.accept.apply span in doAccept().", "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, "targets": [ { - "datasource": { "type": "prometheus" }, - "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"consensus.accept.apply\"}[5m])))", + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{service_instance_id=~\"$node\", span_name=\"consensus.accept.apply\"}[5m])))", "legendFormat": "p95 apply duration" }, { - "datasource": { "type": "prometheus" }, - "expr": "histogram_quantile(0.50, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"consensus.accept.apply\"}[5m])))", + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.50, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{service_instance_id=~\"$node\", span_name=\"consensus.accept.apply\"}[5m])))", "legendFormat": "p50 apply duration" } ], @@ -111,11 +152,18 @@ "title": "Close Time Agreement", "description": "Rate of close time agreement vs disagreement across consensus rounds. 
Based on xrpl.consensus.close_time_correct attribute (true = validators agreed, false = agreed to disagree per avCT_CONSENSUS_PCT).", "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, "targets": [ { - "datasource": { "type": "prometheus" }, - "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"consensus.accept.apply\"}[5m]))", + "datasource": { + "type": "prometheus" + }, + "expr": "sum(rate(traces_span_metrics_calls_total{service_instance_id=~\"$node\", span_name=\"consensus.accept.apply\"}[5m]))", "legendFormat": "total rounds/sec" } ], @@ -129,8 +177,54 @@ ], "schemaVersion": 39, "tags": ["rippled", "consensus", "telemetry"], - "templating": { "list": [] }, - "time": { "from": "now-1h", "to": "now" }, + "templating": { + "list": [ + { + "name": "node", + "label": "Node", + "description": "Filter by rippled node (service.instance.id \u2014 defaults to node public key)", + "type": "query", + "query": "label_values(traces_span_metrics_calls_total, service_instance_id)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "includeAll": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "multi": true, + "refresh": 2, + "sort": 1 + }, + { + "name": "consensus_mode", + "label": "Consensus Mode", + "description": "Filter by consensus mode (proposing, observing, wrongLedger, switchedLedger)", + "type": "query", + "query": "label_values(traces_span_metrics_calls_total{span_name=\"consensus.ledger_close\"}, xrpl_consensus_mode)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "includeAll": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "multi": true, + "refresh": 2, + "sort": 1 + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, "title": "rippled Consensus Health", "uid": "rippled-consensus" } diff --git 
a/docker/telemetry/grafana/dashboards/rpc-performance.json b/docker/telemetry/grafana/dashboards/rpc-performance.json index 535a03c870..1808cbfb91 100644 --- a/docker/telemetry/grafana/dashboards/rpc-performance.json +++ b/docker/telemetry/grafana/dashboards/rpc-performance.json @@ -1,5 +1,7 @@ { - "annotations": { "list": [] }, + "annotations": { + "list": [] + }, "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 1, @@ -9,11 +11,18 @@ { "title": "RPC Request Rate by Command", "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, "targets": [ { - "datasource": { "type": "prometheus" }, - "expr": "sum by (xrpl_rpc_command) (rate(traces_span_metrics_calls_total{span_name=~\"rpc.command.*\"}[5m]))", + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (xrpl_rpc_command) (rate(traces_span_metrics_calls_total{xrpl_rpc_command=~\"$command\", service_instance_id=~\"$node\", span_name=~\"rpc.command.*\"}[5m]))", "legendFormat": "{{xrpl_rpc_command}}" } ], @@ -27,11 +36,18 @@ { "title": "RPC Latency p95 by Command", "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, "targets": [ { - "datasource": { "type": "prometheus" }, - "expr": "histogram_quantile(0.95, sum by (le, xrpl_rpc_command) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=~\"rpc.command.*\"}[5m])))", + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, xrpl_rpc_command) (rate(traces_span_metrics_duration_milliseconds_bucket{xrpl_rpc_command=~\"$command\", service_instance_id=~\"$node\", span_name=~\"rpc.command.*\"}[5m])))", "legendFormat": "p95 {{xrpl_rpc_command}}" } ], @@ -45,11 +61,18 @@ { "title": "RPC Error Rate", "type": "bargauge", - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, "targets": [ { - 
"datasource": { "type": "prometheus" }, - "expr": "sum by (xrpl_rpc_command) (rate(traces_span_metrics_calls_total{span_name=~\"rpc.command.*\", status_code=\"STATUS_CODE_ERROR\"}[5m])) / sum by (xrpl_rpc_command) (rate(traces_span_metrics_calls_total{span_name=~\"rpc.command.*\"}[5m])) * 100", + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (xrpl_rpc_command) (rate(traces_span_metrics_calls_total{xrpl_rpc_command=~\"$command\", service_instance_id=~\"$node\", span_name=~\"rpc.command.*\", status_code=\"STATUS_CODE_ERROR\"}[5m])) / sum by (xrpl_rpc_command) (rate(traces_span_metrics_calls_total{service_instance_id=~\"$node\", xrpl_rpc_command=~\"$command\", span_name=~\"rpc.command.*\"}[5m])) * 100", "legendFormat": "{{xrpl_rpc_command}}" } ], @@ -58,9 +81,18 @@ "unit": "percent", "thresholds": { "steps": [ - { "color": "green", "value": null }, - { "color": "yellow", "value": 1 }, - { "color": "red", "value": 5 } + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 5 + } ] } }, @@ -70,11 +102,18 @@ { "title": "RPC Latency Heatmap", "type": "heatmap", - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, "targets": [ { - "datasource": { "type": "prometheus" }, - "expr": "sum(increase(traces_span_metrics_duration_milliseconds_bucket{span_name=~\"rpc.command.*\"}[5m])) by (le)", + "datasource": { + "type": "prometheus" + }, + "expr": "sum(increase(traces_span_metrics_duration_milliseconds_bucket{xrpl_rpc_command=~\"$command\", service_instance_id=~\"$node\", span_name=~\"rpc.command.*\"}[5m])) by (le)", "legendFormat": "{{le}}", "format": "heatmap" } @@ -83,8 +122,54 @@ ], "schemaVersion": 39, "tags": ["rippled", "rpc", "telemetry"], - "templating": { "list": [] }, - "time": { "from": "now-1h", "to": "now" }, + "templating": { + "list": [ + { + "name": "node", + "label": "Node", + "description": "Filter by rippled node 
(service.instance.id \u2014 defaults to node public key)", + "type": "query", + "query": "label_values(traces_span_metrics_calls_total, service_instance_id)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "includeAll": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "multi": true, + "refresh": 2, + "sort": 1 + }, + { + "name": "command", + "label": "RPC Command", + "description": "Filter by RPC command name (e.g., server_info, submit)", + "type": "query", + "query": "label_values(traces_span_metrics_calls_total{span_name=~\"rpc.command.*\"}, xrpl_rpc_command)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "includeAll": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "multi": true, + "refresh": 2, + "sort": 1 + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, "title": "rippled RPC Performance", "uid": "rippled-rpc-perf" } diff --git a/docker/telemetry/grafana/dashboards/transaction-overview.json b/docker/telemetry/grafana/dashboards/transaction-overview.json index d5a60a6fa1..dddf94ffbf 100644 --- a/docker/telemetry/grafana/dashboards/transaction-overview.json +++ b/docker/telemetry/grafana/dashboards/transaction-overview.json @@ -1,5 +1,7 @@ { - "annotations": { "list": [] }, + "annotations": { + "list": [] + }, "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 1, @@ -9,16 +11,25 @@ { "title": "Transaction Processing Rate", "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, "targets": [ { - "datasource": { "type": "prometheus" }, - "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"tx.process\"}[5m]))", + "datasource": { + "type": "prometheus" + }, + "expr": "sum(rate(traces_span_metrics_calls_total{service_instance_id=~\"$node\", xrpl_tx_local=~\"$tx_origin\", span_name=\"tx.process\"}[5m]))", "legendFormat": "tx.process/sec" }, { - "datasource": { "type": 
"prometheus" }, - "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"tx.receive\"}[5m]))", + "datasource": { + "type": "prometheus" + }, + "expr": "sum(rate(traces_span_metrics_calls_total{service_instance_id=~\"$node\", span_name=\"tx.receive\"}[5m]))", "legendFormat": "tx.receive/sec" } ], @@ -32,16 +43,25 @@ { "title": "Transaction Processing Latency", "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, "targets": [ { - "datasource": { "type": "prometheus" }, - "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"tx.process\"}[5m])))", + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{service_instance_id=~\"$node\", xrpl_tx_local=~\"$tx_origin\", span_name=\"tx.process\"}[5m])))", "legendFormat": "p95" }, { - "datasource": { "type": "prometheus" }, - "expr": "histogram_quantile(0.50, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"tx.process\"}[5m])))", + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.50, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{service_instance_id=~\"$node\", xrpl_tx_local=~\"$tx_origin\", span_name=\"tx.process\"}[5m])))", "legendFormat": "p50" } ], @@ -55,11 +75,18 @@ { "title": "Transaction Path Distribution", "type": "piechart", - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, "targets": [ { - "datasource": { "type": "prometheus" }, - "expr": "sum by (xrpl_tx_local) (rate(traces_span_metrics_calls_total{span_name=\"tx.process\"}[5m]))", + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (xrpl_tx_local) (rate(traces_span_metrics_calls_total{service_instance_id=~\"$node\", xrpl_tx_local=~\"$tx_origin\", span_name=\"tx.process\"}[5m]))", "legendFormat": "local={{xrpl_tx_local}}" } ] @@ -67,11 +94,18 @@ { "title": "Transaction 
Receive vs Suppressed", "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, "targets": [ { - "datasource": { "type": "prometheus" }, - "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"tx.receive\"}[5m]))", + "datasource": { + "type": "prometheus" + }, + "expr": "sum(rate(traces_span_metrics_calls_total{service_instance_id=~\"$node\", span_name=\"tx.receive\"}[5m]))", "legendFormat": "total received" } ], @@ -85,8 +119,54 @@ ], "schemaVersion": 39, "tags": ["rippled", "transactions", "telemetry"], - "templating": { "list": [] }, - "time": { "from": "now-1h", "to": "now" }, + "templating": { + "list": [ + { + "name": "node", + "label": "Node", + "description": "Filter by rippled node (service.instance.id \u2014 defaults to node public key)", + "type": "query", + "query": "label_values(traces_span_metrics_calls_total, service_instance_id)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "includeAll": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "multi": true, + "refresh": 2, + "sort": 1 + }, + { + "name": "tx_origin", + "label": "TX Origin", + "description": "Filter by transaction origin (true = local submit, false = peer relay)", + "type": "query", + "query": "label_values(traces_span_metrics_calls_total{span_name=\"tx.process\"}, xrpl_tx_local)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "includeAll": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "multi": true, + "refresh": 2, + "sort": 1 + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, "title": "rippled Transaction Overview", "uid": "rippled-transactions" } diff --git a/docker/telemetry/otel-collector-config.yaml b/docker/telemetry/otel-collector-config.yaml index 72ac4059cc..6c80b5f6d1 100644 --- a/docker/telemetry/otel-collector-config.yaml +++ 
b/docker/telemetry/otel-collector-config.yaml @@ -23,6 +23,10 @@ processors: connectors: spanmetrics: + # Key generated span metrics by service.instance.id (node public key) so each node keeps its own series. + # NOTE(review): the service_instance_id Prometheus label the dashboards filter on may also need a dimensions entry or exporter resource_to_telemetry_conversion - confirm. + resource_metrics_key_attributes: + - service.instance.id histogram: explicit: buckets: [1ms, 5ms, 10ms, 25ms, 50ms, 100ms, 250ms, 500ms, 1s, 5s]