Add Grafana dashboard template variables for node and metric filtering

- Add resource_metrics_key_attributes to spanmetrics connector so service.instance.id becomes a Prometheus label for per-node filtering
- Add 'node' dropdown (service_instance_id) to all 3 dashboards
- Add 'command' dropdown (xrpl_rpc_command) to RPC Performance
- Add 'tx_origin' dropdown (xrpl_tx_local) to Transaction Overview
- Add 'consensus_mode' dropdown (xrpl_consensus_mode) to Consensus Health
- Update all panel PromQL queries to include $node filter

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-11 23:32:26 +00:00 · 2026-03-06 17:33:56 +00:00
parent 3c550ea6ce
commit 0ba31c88cf
4 changed files with 325 additions and 62 deletions
--- a/docker/telemetry/grafana/dashboards/consensus-health.json
+++ b/docker/telemetry/grafana/dashboards/consensus-health.json
@@ -1,5 +1,7 @@
 {
-  "annotations": { "list": [] },
+  "annotations": {
+    "list": []
+  },
  "editable": true,
  "fiscalYearStartMonth": 0,
  "graphTooltip": 1,
@@ -9,16 +11,25 @@
    {
      "title": "Consensus Round Duration",
      "type": "timeseries",
-      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 0
+      },
      "targets": [
        {
-          "datasource": { "type": "prometheus" },
-          "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"consensus.accept\"}[5m])))",
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{service_instance_id=~\"$node\", span_name=\"consensus.accept\"}[5m])))",
          "legendFormat": "p95 round duration"
        },
        {
-          "datasource": { "type": "prometheus" },
-          "expr": "histogram_quantile(0.50, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"consensus.accept\"}[5m])))",
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "histogram_quantile(0.50, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{service_instance_id=~\"$node\", span_name=\"consensus.accept\"}[5m])))",
          "legendFormat": "p50 round duration"
        }
      ],
@@ -32,11 +43,18 @@
    {
      "title": "Consensus Proposals Sent Rate",
      "type": "timeseries",
-      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 0
+      },
      "targets": [
        {
-          "datasource": { "type": "prometheus" },
-          "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"consensus.proposal.send\"}[5m]))",
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "sum(rate(traces_span_metrics_calls_total{service_instance_id=~\"$node\", span_name=\"consensus.proposal.send\"}[5m]))",
          "legendFormat": "proposals/sec"
        }
      ],
@@ -50,11 +68,18 @@
    {
      "title": "Ledger Close Duration",
      "type": "timeseries",
-      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 8
+      },
      "targets": [
        {
-          "datasource": { "type": "prometheus" },
-          "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"consensus.ledger_close\"}[5m])))",
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{service_instance_id=~\"$node\", xrpl_consensus_mode=~\"$consensus_mode\", span_name=\"consensus.ledger_close\"}[5m])))",
          "legendFormat": "p95 close duration"
        }
      ],
@@ -68,11 +93,18 @@
    {
      "title": "Validation Send Rate",
      "type": "stat",
-      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 8
+      },
      "targets": [
        {
-          "datasource": { "type": "prometheus" },
-          "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"consensus.validation.send\"}[5m]))",
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "sum(rate(traces_span_metrics_calls_total{service_instance_id=~\"$node\", span_name=\"consensus.validation.send\"}[5m]))",
          "legendFormat": "validations/sec"
        }
      ],
@@ -87,16 +119,25 @@
      "title": "Ledger Apply Duration (doAccept)",
      "description": "Time spent applying the consensus result to build a new ledger. Measured by the consensus.accept.apply span in doAccept().",
      "type": "timeseries",
-      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 16
+      },
      "targets": [
        {
-          "datasource": { "type": "prometheus" },
-          "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"consensus.accept.apply\"}[5m])))",
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{service_instance_id=~\"$node\", span_name=\"consensus.accept.apply\"}[5m])))",
          "legendFormat": "p95 apply duration"
        },
        {
-          "datasource": { "type": "prometheus" },
-          "expr": "histogram_quantile(0.50, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"consensus.accept.apply\"}[5m])))",
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "histogram_quantile(0.50, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{service_instance_id=~\"$node\", span_name=\"consensus.accept.apply\"}[5m])))",
          "legendFormat": "p50 apply duration"
        }
      ],
@@ -111,11 +152,18 @@
      "title": "Close Time Agreement",
      "description": "Rate of close time agreement vs disagreement across consensus rounds. Based on xrpl.consensus.close_time_correct attribute (true = validators agreed, false = agreed to disagree per avCT_CONSENSUS_PCT).",
      "type": "timeseries",
-      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 16
+      },
      "targets": [
        {
-          "datasource": { "type": "prometheus" },
-          "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"consensus.accept.apply\"}[5m]))",
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "sum(rate(traces_span_metrics_calls_total{service_instance_id=~\"$node\", span_name=\"consensus.accept.apply\"}[5m]))",
          "legendFormat": "total rounds/sec"
        }
      ],
@@ -129,8 +177,54 @@
  ],
  "schemaVersion": 39,
  "tags": ["rippled", "consensus", "telemetry"],
-  "templating": { "list": [] },
-  "time": { "from": "now-1h", "to": "now" },
+  "templating": {
+    "list": [
+      {
+        "name": "node",
+        "label": "Node",
+        "description": "Filter by rippled node (service.instance.id \u2014 defaults to node public key)",
+        "type": "query",
+        "query": "label_values(traces_span_metrics_calls_total, service_instance_id)",
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
+        "includeAll": true,
+        "allValue": ".*",
+        "current": {
+          "text": "All",
+          "value": "$__all"
+        },
+        "multi": true,
+        "refresh": 2,
+        "sort": 1
+      },
+      {
+        "name": "consensus_mode",
+        "label": "Consensus Mode",
+        "description": "Filter by consensus mode (proposing, observing, wrongLedger, switchedLedger)",
+        "type": "query",
+        "query": "label_values(traces_span_metrics_calls_total{span_name=\"consensus.ledger_close\"}, xrpl_consensus_mode)",
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
+        "includeAll": true,
+        "allValue": ".*",
+        "current": {
+          "text": "All",
+          "value": "$__all"
+        },
+        "multi": true,
+        "refresh": 2,
+        "sort": 1
+      }
+    ]
+  },
+  "time": {
+    "from": "now-1h",
+    "to": "now"
+  },
  "title": "rippled Consensus Health",
  "uid": "rippled-consensus"
 }
--- a/docker/telemetry/grafana/dashboards/rpc-performance.json
+++ b/docker/telemetry/grafana/dashboards/rpc-performance.json
@@ -1,5 +1,7 @@
 {
-  "annotations": { "list": [] },
+  "annotations": {
+    "list": []
+  },
  "editable": true,
  "fiscalYearStartMonth": 0,
  "graphTooltip": 1,
@@ -9,11 +11,18 @@
    {
      "title": "RPC Request Rate by Command",
      "type": "timeseries",
-      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 0
+      },
      "targets": [
        {
-          "datasource": { "type": "prometheus" },
-          "expr": "sum by (xrpl_rpc_command) (rate(traces_span_metrics_calls_total{span_name=~\"rpc.command.*\"}[5m]))",
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "sum by (xrpl_rpc_command) (rate(traces_span_metrics_calls_total{xrpl_rpc_command=~\"$command\", service_instance_id=~\"$node\", span_name=~\"rpc.command.*\"}[5m]))",
          "legendFormat": "{{xrpl_rpc_command}}"
        }
      ],
@@ -27,11 +36,18 @@
    {
      "title": "RPC Latency p95 by Command",
      "type": "timeseries",
-      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 0
+      },
      "targets": [
        {
-          "datasource": { "type": "prometheus" },
-          "expr": "histogram_quantile(0.95, sum by (le, xrpl_rpc_command) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=~\"rpc.command.*\"}[5m])))",
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "histogram_quantile(0.95, sum by (le, xrpl_rpc_command) (rate(traces_span_metrics_duration_milliseconds_bucket{xrpl_rpc_command=~\"$command\", service_instance_id=~\"$node\", span_name=~\"rpc.command.*\"}[5m])))",
          "legendFormat": "p95 {{xrpl_rpc_command}}"
        }
      ],
@@ -45,11 +61,18 @@
    {
      "title": "RPC Error Rate",
      "type": "bargauge",
-      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 8
+      },
      "targets": [
        {
-          "datasource": { "type": "prometheus" },
-          "expr": "sum by (xrpl_rpc_command) (rate(traces_span_metrics_calls_total{span_name=~\"rpc.command.*\", status_code=\"STATUS_CODE_ERROR\"}[5m])) / sum by (xrpl_rpc_command) (rate(traces_span_metrics_calls_total{span_name=~\"rpc.command.*\"}[5m])) * 100",
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "sum by (xrpl_rpc_command) (rate(traces_span_metrics_calls_total{xrpl_rpc_command=~\"$command\", service_instance_id=~\"$node\", span_name=~\"rpc.command.*\", status_code=\"STATUS_CODE_ERROR\"}[5m])) / sum by (xrpl_rpc_command) (rate(traces_span_metrics_calls_total{service_instance_id=~\"$node\", xrpl_rpc_command=~\"$command\", span_name=~\"rpc.command.*\"}[5m])) * 100",
          "legendFormat": "{{xrpl_rpc_command}}"
        }
      ],
@@ -58,9 +81,18 @@
          "unit": "percent",
          "thresholds": {
            "steps": [
-              { "color": "green", "value": null },
-              { "color": "yellow", "value": 1 },
-              { "color": "red", "value": 5 }
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 1
+              },
+              {
+                "color": "red",
+                "value": 5
+              }
            ]
          }
        },
@@ -70,11 +102,18 @@
    {
      "title": "RPC Latency Heatmap",
      "type": "heatmap",
-      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 8
+      },
      "targets": [
        {
-          "datasource": { "type": "prometheus" },
-          "expr": "sum(increase(traces_span_metrics_duration_milliseconds_bucket{span_name=~\"rpc.command.*\"}[5m])) by (le)",
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "sum(increase(traces_span_metrics_duration_milliseconds_bucket{xrpl_rpc_command=~\"$command\", service_instance_id=~\"$node\", span_name=~\"rpc.command.*\"}[5m])) by (le)",
          "legendFormat": "{{le}}",
          "format": "heatmap"
        }
@@ -83,8 +122,54 @@
  ],
  "schemaVersion": 39,
  "tags": ["rippled", "rpc", "telemetry"],
-  "templating": { "list": [] },
-  "time": { "from": "now-1h", "to": "now" },
+  "templating": {
+    "list": [
+      {
+        "name": "node",
+        "label": "Node",
+        "description": "Filter by rippled node (service.instance.id \u2014 defaults to node public key)",
+        "type": "query",
+        "query": "label_values(traces_span_metrics_calls_total, service_instance_id)",
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
+        "includeAll": true,
+        "allValue": ".*",
+        "current": {
+          "text": "All",
+          "value": "$__all"
+        },
+        "multi": true,
+        "refresh": 2,
+        "sort": 1
+      },
+      {
+        "name": "command",
+        "label": "RPC Command",
+        "description": "Filter by RPC command name (e.g., server_info, submit)",
+        "type": "query",
+        "query": "label_values(traces_span_metrics_calls_total{span_name=~\"rpc.command.*\"}, xrpl_rpc_command)",
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
+        "includeAll": true,
+        "allValue": ".*",
+        "current": {
+          "text": "All",
+          "value": "$__all"
+        },
+        "multi": true,
+        "refresh": 2,
+        "sort": 1
+      }
+    ]
+  },
+  "time": {
+    "from": "now-1h",
+    "to": "now"
+  },
  "title": "rippled RPC Performance",
  "uid": "rippled-rpc-perf"
 }
--- a/docker/telemetry/grafana/dashboards/transaction-overview.json
+++ b/docker/telemetry/grafana/dashboards/transaction-overview.json
@@ -1,5 +1,7 @@
 {
-  "annotations": { "list": [] },
+  "annotations": {
+    "list": []
+  },
  "editable": true,
  "fiscalYearStartMonth": 0,
  "graphTooltip": 1,
@@ -9,16 +11,25 @@
    {
      "title": "Transaction Processing Rate",
      "type": "timeseries",
-      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 0
+      },
      "targets": [
        {
-          "datasource": { "type": "prometheus" },
-          "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"tx.process\"}[5m]))",
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "sum(rate(traces_span_metrics_calls_total{service_instance_id=~\"$node\", span_name=\"tx.process\"}[5m]))",
          "legendFormat": "tx.process/sec"
        },
        {
-          "datasource": { "type": "prometheus" },
-          "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"tx.receive\"}[5m]))",
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "sum(rate(traces_span_metrics_calls_total{service_instance_id=~\"$node\", span_name=\"tx.receive\"}[5m]))",
          "legendFormat": "tx.receive/sec"
        }
      ],
@@ -32,16 +43,25 @@
    {
      "title": "Transaction Processing Latency",
      "type": "timeseries",
-      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 0
+      },
      "targets": [
        {
-          "datasource": { "type": "prometheus" },
-          "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"tx.process\"}[5m])))",
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{service_instance_id=~\"$node\", xrpl_tx_local=~\"$tx_origin\", span_name=\"tx.process\"}[5m])))",
          "legendFormat": "p95"
        },
        {
-          "datasource": { "type": "prometheus" },
-          "expr": "histogram_quantile(0.50, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"tx.process\"}[5m])))",
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "histogram_quantile(0.50, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{service_instance_id=~\"$node\", xrpl_tx_local=~\"$tx_origin\", span_name=\"tx.process\"}[5m])))",
          "legendFormat": "p50"
        }
      ],
@@ -55,11 +75,18 @@
    {
      "title": "Transaction Path Distribution",
      "type": "piechart",
-      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 8
+      },
      "targets": [
        {
-          "datasource": { "type": "prometheus" },
-          "expr": "sum by (xrpl_tx_local) (rate(traces_span_metrics_calls_total{span_name=\"tx.process\"}[5m]))",
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "sum by (xrpl_tx_local) (rate(traces_span_metrics_calls_total{service_instance_id=~\"$node\", xrpl_tx_local=~\"$tx_origin\", span_name=\"tx.process\"}[5m]))",
          "legendFormat": "local={{xrpl_tx_local}}"
        }
      ]
@@ -67,11 +94,18 @@
    {
      "title": "Transaction Receive vs Suppressed",
      "type": "timeseries",
-      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 8
+      },
      "targets": [
        {
-          "datasource": { "type": "prometheus" },
-          "expr": "sum(rate(traces_span_metrics_calls_total{span_name=\"tx.receive\"}[5m]))",
+          "datasource": {
+            "type": "prometheus"
+          },
+          "expr": "sum(rate(traces_span_metrics_calls_total{service_instance_id=~\"$node\", span_name=\"tx.receive\"}[5m]))",
          "legendFormat": "total received"
        }
      ],
@@ -85,8 +119,54 @@
  ],
  "schemaVersion": 39,
  "tags": ["rippled", "transactions", "telemetry"],
-  "templating": { "list": [] },
-  "time": { "from": "now-1h", "to": "now" },
+  "templating": {
+    "list": [
+      {
+        "name": "node",
+        "label": "Node",
+        "description": "Filter by rippled node (service.instance.id \u2014 defaults to node public key)",
+        "type": "query",
+        "query": "label_values(traces_span_metrics_calls_total, service_instance_id)",
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
+        "includeAll": true,
+        "allValue": ".*",
+        "current": {
+          "text": "All",
+          "value": "$__all"
+        },
+        "multi": true,
+        "refresh": 2,
+        "sort": 1
+      },
+      {
+        "name": "tx_origin",
+        "label": "TX Origin",
+        "description": "Filter by transaction origin (true = local submit, false = peer relay)",
+        "type": "query",
+        "query": "label_values(traces_span_metrics_calls_total{span_name=\"tx.process\"}, xrpl_tx_local)",
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
+        "includeAll": true,
+        "allValue": ".*",
+        "current": {
+          "text": "All",
+          "value": "$__all"
+        },
+        "multi": true,
+        "refresh": 2,
+        "sort": 1
+      }
+    ]
+  },
+  "time": {
+    "from": "now-1h",
+    "to": "now"
+  },
  "title": "rippled Transaction Overview",
  "uid": "rippled-transactions"
 }
--- a/docker/telemetry/otel-collector-config.yaml
+++ b/docker/telemetry/otel-collector-config.yaml
@@ -23,6 +23,10 @@ processors:

 connectors:
  spanmetrics:
+    # Expose service.instance.id (node public key) as a Prometheus label so
+    # Grafana dashboards can filter metrics by individual node.
+    resource_metrics_key_attributes:
+      - service.instance.id
    histogram:
      explicit:
        buckets: [1ms, 5ms, 10ms, 25ms, 50ms, 100ms, 250ms, 500ms, 1s, 5s]