From 0ba31c88cfdca3a8d030e11005f742e17e439d20 Mon Sep 17 00:00:00 2001
From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com>
Date: Fri, 6 Mar 2026 17:33:56 +0000
Subject: [PATCH] Add Grafana dashboard template variables for node and metric
 filtering

- Add resource_metrics_key_attributes to spanmetrics connector so
  service.instance.id becomes a Prometheus label for per-node filtering
- Add 'node' dropdown (service_instance_id) to all 3 dashboards
- Add 'command' dropdown (xrpl_rpc_command) to RPC Performance
- Add 'tx_origin' dropdown (xrpl_tx_local) to Transaction Overview
- Add 'consensus_mode' dropdown (xrpl_consensus_mode) to Consensus Health
- Update all panel PromQL queries to include $node filter

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../grafana/dashboards/consensus-health.json  | 144 +++++++++++++++---
 .../grafana/dashboards/rpc-performance.json   | 121 ++++++++++++---
 .../dashboards/transaction-overview.json      | 118 +++++++++++---
 docker/telemetry/otel-collector-config.yaml   |   4 +
 4 files changed, 325 insertions(+), 62 deletions(-)

diff --git a/docker/telemetry/grafana/dashboards/consensus-health.json b/docker/telemetry/grafana/dashboards/consensus-health.json
index d9fe94248f..00808ddc15 100644
--- a/docker/telemetry/grafana/dashboards/consensus-health.json
+++ b/docker/telemetry/grafana/dashboards/consensus-health.json
@@ -1,5 +1,7 @@
 {
-  "annotations": { "list": [] },
+  "annotations": {
+    "list": []
+  },
   "editable": true,
   "fiscalYearStartMonth": 0,
   "graphTooltip": 1,
@@ -9,16 +11,25 @@
     {
       "title": "Consensus Round Duration",
       "type": "timeseries",
-      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 0
+      },
       "targets": [
         {
-          "datasource": { "type": "prometheus" },
-          "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"consensus.accept\"}[5m])))",
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{service_instance_id=~\"$node\", span_name=\"consensus.accept\"}[5m])))",
           "legendFormat": "p95 round duration"
         },
         {
-          "datasource": { "type": "prometheus" },
-          "expr": "histogram_quantile(0.50, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"consensus.accept\"}[5m])))",
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "histogram_quantile(0.50, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{service_instance_id=~\"$node\", span_name=\"consensus.accept\"}[5m])))",
           "legendFormat": "p50 round duration"
         }
       ],
@@ -32,11 +43,18 @@
     {
       "title": "Consensus Proposals Sent Rate",
       "type": "timeseries",
-      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 0
+      },
       "targets": [
         {
-          "datasource": { "type": "prometheus" },
-          "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"consensus.proposal.send\"}[5m]))",
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "sum(rate(traces_span_metrics_calls_total{service_instance_id=~\"$node\", span_name=\"consensus.proposal.send\"}[5m]))",
           "legendFormat": "proposals/sec"
         }
       ],
@@ -50,11 +68,18 @@
     {
       "title": "Ledger Close Duration",
       "type": "timeseries",
-      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 8
+      },
       "targets": [
         {
-          "datasource": { "type": "prometheus" },
-          "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"consensus.ledger_close\"}[5m])))",
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{service_instance_id=~\"$node\", xrpl_consensus_mode=~\"$consensus_mode\", span_name=\"consensus.ledger_close\"}[5m])))",
           "legendFormat": "p95 close duration"
         }
       ],
@@ -68,11 +93,18 @@
     {
       "title": "Validation Send Rate",
       "type": "stat",
-      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 8
+      },
       "targets": [
         {
-          "datasource": { "type": "prometheus" },
-          "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"consensus.validation.send\"}[5m]))",
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "sum(rate(traces_span_metrics_calls_total{service_instance_id=~\"$node\", span_name=\"consensus.validation.send\"}[5m]))",
           "legendFormat": "validations/sec"
         }
       ],
@@ -87,16 +119,25 @@
       "title": "Ledger Apply Duration (doAccept)",
       "description": "Time spent applying the consensus result to build a new ledger. Measured by the consensus.accept.apply span in doAccept().",
       "type": "timeseries",
-      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 16
+      },
       "targets": [
         {
-          "datasource": { "type": "prometheus" },
-          "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"consensus.accept.apply\"}[5m])))",
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{service_instance_id=~\"$node\", span_name=\"consensus.accept.apply\"}[5m])))",
           "legendFormat": "p95 apply duration"
         },
         {
-          "datasource": { "type": "prometheus" },
-          "expr": "histogram_quantile(0.50, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"consensus.accept.apply\"}[5m])))",
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "histogram_quantile(0.50, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{service_instance_id=~\"$node\", span_name=\"consensus.accept.apply\"}[5m])))",
           "legendFormat": "p50 apply duration"
         }
       ],
@@ -111,11 +152,18 @@
       "title": "Close Time Agreement",
       "description": "Rate of close time agreement vs disagreement across consensus rounds. Based on xrpl.consensus.close_time_correct attribute (true = validators agreed, false = agreed to disagree per avCT_CONSENSUS_PCT).",
       "type": "timeseries",
-      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 16
+      },
       "targets": [
         {
-          "datasource": { "type": "prometheus" },
-          "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"consensus.accept.apply\"}[5m]))",
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "sum(rate(traces_span_metrics_calls_total{service_instance_id=~\"$node\", span_name=\"consensus.accept.apply\"}[5m]))",
           "legendFormat": "total rounds/sec"
         }
       ],
@@ -129,8 +177,54 @@
   ],
   "schemaVersion": 39,
   "tags": ["rippled", "consensus", "telemetry"],
-  "templating": { "list": [] },
-  "time": { "from": "now-1h", "to": "now" },
+  "templating": {
+    "list": [
+      {
+        "name": "node",
+        "label": "Node",
+        "description": "Filter by rippled node (service.instance.id \u2014 defaults to node public key)",
+        "type": "query",
+        "query": "label_values(traces_span_metrics_calls_total, service_instance_id)",
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
+        "includeAll": true,
+        "allValue": ".*",
+        "current": {
+          "text": "All",
+          "value": "$__all"
+        },
+        "multi": true,
+        "refresh": 2,
+        "sort": 1
+      },
+      {
+        "name": "consensus_mode",
+        "label": "Consensus Mode",
+        "description": "Filter by consensus mode (proposing, observing, wrongLedger, switchedLedger)",
+        "type": "query",
+        "query": "label_values(traces_span_metrics_calls_total{span_name=\"consensus.ledger_close\"}, xrpl_consensus_mode)",
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
+        "includeAll": true,
+        "allValue": ".*",
+        "current": {
+          "text": "All",
+          "value": "$__all"
+        },
+        "multi": true,
+        "refresh": 2,
+        "sort": 1
+      }
+    ]
+  },
+  "time": {
+    "from": "now-1h",
+    "to": "now"
+  },
   "title": "rippled Consensus Health",
   "uid": "rippled-consensus"
 }
diff --git a/docker/telemetry/grafana/dashboards/rpc-performance.json b/docker/telemetry/grafana/dashboards/rpc-performance.json
index 535a03c870..1808cbfb91 100644
--- a/docker/telemetry/grafana/dashboards/rpc-performance.json
+++ b/docker/telemetry/grafana/dashboards/rpc-performance.json
@@ -1,5 +1,7 @@
 {
-  "annotations": { "list": [] },
+  "annotations": {
+    "list": []
+  },
   "editable": true,
   "fiscalYearStartMonth": 0,
   "graphTooltip": 1,
@@ -9,11 +11,18 @@
     {
       "title": "RPC Request Rate by Command",
       "type": "timeseries",
-      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 0
+      },
       "targets": [
         {
-          "datasource": { "type": "prometheus" },
-          "expr": "sum by (xrpl_rpc_command) (rate(traces_span_metrics_calls_total{span_name=~\"rpc.command.*\"}[5m]))",
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "sum by (xrpl_rpc_command) (rate(traces_span_metrics_calls_total{xrpl_rpc_command=~\"$command\", service_instance_id=~\"$node\", span_name=~\"rpc.command.*\"}[5m]))",
           "legendFormat": "{{xrpl_rpc_command}}"
         }
       ],
@@ -27,11 +36,18 @@
     {
       "title": "RPC Latency p95 by Command",
       "type": "timeseries",
-      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 0
+      },
       "targets": [
         {
-          "datasource": { "type": "prometheus" },
-          "expr": "histogram_quantile(0.95, sum by (le, xrpl_rpc_command) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=~\"rpc.command.*\"}[5m])))",
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "histogram_quantile(0.95, sum by (le, xrpl_rpc_command) (rate(traces_span_metrics_duration_milliseconds_bucket{xrpl_rpc_command=~\"$command\", service_instance_id=~\"$node\", span_name=~\"rpc.command.*\"}[5m])))",
           "legendFormat": "p95 {{xrpl_rpc_command}}"
         }
       ],
@@ -45,11 +61,18 @@
     {
       "title": "RPC Error Rate",
       "type": "bargauge",
-      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 8
+      },
       "targets": [
         {
-          "datasource": { "type": "prometheus" },
-          "expr": "sum by (xrpl_rpc_command) (rate(traces_span_metrics_calls_total{span_name=~\"rpc.command.*\", status_code=\"STATUS_CODE_ERROR\"}[5m])) / sum by (xrpl_rpc_command) (rate(traces_span_metrics_calls_total{span_name=~\"rpc.command.*\"}[5m])) * 100",
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "sum by (xrpl_rpc_command) (rate(traces_span_metrics_calls_total{xrpl_rpc_command=~\"$command\", service_instance_id=~\"$node\", span_name=~\"rpc.command.*\", status_code=\"STATUS_CODE_ERROR\"}[5m])) / sum by (xrpl_rpc_command) (rate(traces_span_metrics_calls_total{service_instance_id=~\"$node\", xrpl_rpc_command=~\"$command\", span_name=~\"rpc.command.*\"}[5m])) * 100",
           "legendFormat": "{{xrpl_rpc_command}}"
         }
       ],
@@ -58,9 +81,18 @@
           "unit": "percent",
           "thresholds": {
             "steps": [
-              { "color": "green", "value": null },
-              { "color": "yellow", "value": 1 },
-              { "color": "red", "value": 5 }
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 1
+              },
+              {
+                "color": "red",
+                "value": 5
+              }
             ]
           }
         },
@@ -70,11 +102,18 @@
     {
       "title": "RPC Latency Heatmap",
       "type": "heatmap",
-      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 8
+      },
       "targets": [
         {
-          "datasource": { "type": "prometheus" },
-          "expr": "sum(increase(traces_span_metrics_duration_milliseconds_bucket{span_name=~\"rpc.command.*\"}[5m])) by (le)",
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "sum(increase(traces_span_metrics_duration_milliseconds_bucket{xrpl_rpc_command=~\"$command\", service_instance_id=~\"$node\", span_name=~\"rpc.command.*\"}[5m])) by (le)",
           "legendFormat": "{{le}}",
           "format": "heatmap"
         }
@@ -83,8 +122,54 @@
   ],
   "schemaVersion": 39,
   "tags": ["rippled", "rpc", "telemetry"],
-  "templating": { "list": [] },
-  "time": { "from": "now-1h", "to": "now" },
+  "templating": {
+    "list": [
+      {
+        "name": "node",
+        "label": "Node",
+        "description": "Filter by rippled node (service.instance.id \u2014 defaults to node public key)",
+        "type": "query",
+        "query": "label_values(traces_span_metrics_calls_total, service_instance_id)",
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
+        "includeAll": true,
+        "allValue": ".*",
+        "current": {
+          "text": "All",
+          "value": "$__all"
+        },
+        "multi": true,
+        "refresh": 2,
+        "sort": 1
+      },
+      {
+        "name": "command",
+        "label": "RPC Command",
+        "description": "Filter by RPC command name (e.g., server_info, submit)",
+        "type": "query",
+        "query": "label_values(traces_span_metrics_calls_total{span_name=~\"rpc.command.*\"}, xrpl_rpc_command)",
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
+        "includeAll": true,
+        "allValue": ".*",
+        "current": {
+          "text": "All",
+          "value": "$__all"
+        },
+        "multi": true,
+        "refresh": 2,
+        "sort": 1
+      }
+    ]
+  },
+  "time": {
+    "from": "now-1h",
+    "to": "now"
+  },
   "title": "rippled RPC Performance",
   "uid": "rippled-rpc-perf"
 }
diff --git a/docker/telemetry/grafana/dashboards/transaction-overview.json b/docker/telemetry/grafana/dashboards/transaction-overview.json
index d5a60a6fa1..dddf94ffbf 100644
--- a/docker/telemetry/grafana/dashboards/transaction-overview.json
+++ b/docker/telemetry/grafana/dashboards/transaction-overview.json
@@ -1,5 +1,7 @@
 {
-  "annotations": { "list": [] },
+  "annotations": {
+    "list": []
+  },
   "editable": true,
   "fiscalYearStartMonth": 0,
   "graphTooltip": 1,
@@ -9,16 +11,25 @@
     {
       "title": "Transaction Processing Rate",
       "type": "timeseries",
-      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 0
+      },
       "targets": [
         {
-          "datasource": { "type": "prometheus" },
-          "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"tx.process\"}[5m]))",
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "sum(rate(traces_span_metrics_calls_total{service_instance_id=~\"$node\", xrpl_tx_local=~\"$tx_origin\", span_name=\"tx.process\"}[5m]))",
           "legendFormat": "tx.process/sec"
         },
         {
-          "datasource": { "type": "prometheus" },
-          "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"tx.receive\"}[5m]))",
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "sum(rate(traces_span_metrics_calls_total{service_instance_id=~\"$node\", span_name=\"tx.receive\"}[5m]))",
           "legendFormat": "tx.receive/sec"
         }
       ],
@@ -32,16 +43,25 @@
     {
       "title": "Transaction Processing Latency",
       "type": "timeseries",
-      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 0
+      },
       "targets": [
         {
-          "datasource": { "type": "prometheus" },
-          "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"tx.process\"}[5m])))",
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{service_instance_id=~\"$node\", xrpl_tx_local=~\"$tx_origin\", span_name=\"tx.process\"}[5m])))",
           "legendFormat": "p95"
         },
         {
-          "datasource": { "type": "prometheus" },
-          "expr": "histogram_quantile(0.50, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"tx.process\"}[5m])))",
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "histogram_quantile(0.50, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{service_instance_id=~\"$node\", xrpl_tx_local=~\"$tx_origin\", span_name=\"tx.process\"}[5m])))",
           "legendFormat": "p50"
         }
       ],
@@ -55,11 +75,18 @@
     {
       "title": "Transaction Path Distribution",
       "type": "piechart",
-      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 8
+      },
       "targets": [
         {
-          "datasource": { "type": "prometheus" },
-          "expr": "sum by (xrpl_tx_local) (rate(traces_span_metrics_calls_total{span_name=\"tx.process\"}[5m]))",
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "sum by (xrpl_tx_local) (rate(traces_span_metrics_calls_total{service_instance_id=~\"$node\", xrpl_tx_local=~\"$tx_origin\", span_name=\"tx.process\"}[5m]))",
           "legendFormat": "local={{xrpl_tx_local}}"
         }
       ]
@@ -67,11 +94,18 @@
     {
       "title": "Transaction Receive vs Suppressed",
       "type": "timeseries",
-      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 8
+      },
       "targets": [
         {
-          "datasource": { "type": "prometheus" },
-          "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"tx.receive\"}[5m]))",
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "sum(rate(traces_span_metrics_calls_total{service_instance_id=~\"$node\", span_name=\"tx.receive\"}[5m]))",
           "legendFormat": "total received"
         }
       ],
@@ -85,8 +119,54 @@
   ],
   "schemaVersion": 39,
   "tags": ["rippled", "transactions", "telemetry"],
-  "templating": { "list": [] },
-  "time": { "from": "now-1h", "to": "now" },
+  "templating": {
+    "list": [
+      {
+        "name": "node",
+        "label": "Node",
+        "description": "Filter by rippled node (service.instance.id \u2014 defaults to node public key)",
+        "type": "query",
+        "query": "label_values(traces_span_metrics_calls_total, service_instance_id)",
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
+        "includeAll": true,
+        "allValue": ".*",
+        "current": {
+          "text": "All",
+          "value": "$__all"
+        },
+        "multi": true,
+        "refresh": 2,
+        "sort": 1
+      },
+      {
+        "name": "tx_origin",
+        "label": "TX Origin",
+        "description": "Filter by transaction origin (true = local submit, false = peer relay)",
+        "type": "query",
+        "query": "label_values(traces_span_metrics_calls_total{span_name=\"tx.process\"}, xrpl_tx_local)",
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
+        "includeAll": true,
+        "allValue": ".*",
+        "current": {
+          "text": "All",
+          "value": "$__all"
+        },
+        "multi": true,
+        "refresh": 2,
+        "sort": 1
+      }
+    ]
+  },
+  "time": {
+    "from": "now-1h",
+    "to": "now"
+  },
   "title": "rippled Transaction Overview",
   "uid": "rippled-transactions"
 }
diff --git a/docker/telemetry/otel-collector-config.yaml b/docker/telemetry/otel-collector-config.yaml
index 72ac4059cc..6c80b5f6d1 100644
--- a/docker/telemetry/otel-collector-config.yaml
+++ b/docker/telemetry/otel-collector-config.yaml
@@ -23,6 +23,10 @@ processors:
 
 connectors:
   spanmetrics:
+    # Key span-metric resources on service.instance.id (node public key).
+    # NOTE(review): confirm this alone yields the service_instance_id label.
+    resource_metrics_key_attributes:
+      - service.instance.id
     histogram:
       explicit:
         buckets: [1ms, 5ms, 10ms, 25ms, 50ms, 100ms, 250ms, 500ms, 1s, 5s]