From dfd052a87e81101f0dddaa7046870cb01288e8cb Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Tue, 10 Mar 2026 18:52:45 +0000 Subject: [PATCH] Phase 9: Add node template variable and instance filters to Grafana dashboards Add $node template variable (exported_instance) to rippled-fee-market, rippled-job-queue, and rippled-rpc-perf dashboards enabling multi-node filtering. Add $job_type variable to job-queue and $method variable to rpc-perf dashboards. Inject exported_instance=~"$node" filter into all PromQL queries across these dashboards including rate(), histogram_quantile(), topk(), and sum() expressions. Also add the instance filter to Phase 9 panels (NodeStore, Cache, CountedObjects) in system-node-health dashboard. Co-Authored-By: Claude Opus 4.6 --- .../dashboards/rippled-fee-market.json | 53 ++++++++++----- .../grafana/dashboards/rippled-job-queue.json | 67 +++++++++++++++---- .../grafana/dashboards/rippled-rpc-perf.json | 63 ++++++++++++++--- .../dashboards/system-node-health.json | 24 +++---- 4 files changed, 155 insertions(+), 52 deletions(-) diff --git a/docker/telemetry/grafana/dashboards/rippled-fee-market.json b/docker/telemetry/grafana/dashboards/rippled-fee-market.json index 7ff6dd65c3..1095643486 100644 --- a/docker/telemetry/grafana/dashboards/rippled-fee-market.json +++ b/docker/telemetry/grafana/dashboards/rippled-fee-market.json @@ -30,14 +30,14 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_txq_metrics{metric=\"txq_count\"}", + "expr": "rippled_txq_metrics{exported_instance=~\"$node\", metric=\"txq_count\"}", "legendFormat": "Queue Depth" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_txq_metrics{metric=\"txq_max_size\"}", + "expr": "rippled_txq_metrics{exported_instance=~\"$node\", metric=\"txq_max_size\"}", "legendFormat": "Max Capacity" } ], @@ -77,14 +77,14 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_txq_metrics{metric=\"txq_in_ledger\"}", + "expr": "rippled_txq_metrics{exported_instance=~\"$node\", metric=\"txq_in_ledger\"}", "legendFormat": "In Ledger" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_txq_metrics{metric=\"txq_per_ledger\"}", + "expr": "rippled_txq_metrics{exported_instance=~\"$node\", metric=\"txq_per_ledger\"}", "legendFormat": "Expected Per Ledger" } ], @@ -124,28 +124,28 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_txq_metrics{metric=\"txq_reference_fee_level\"}", + "expr": "rippled_txq_metrics{exported_instance=~\"$node\", metric=\"txq_reference_fee_level\"}", "legendFormat": "Reference Fee Level" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_txq_metrics{metric=\"txq_min_processing_fee_level\"}", + "expr": "rippled_txq_metrics{exported_instance=~\"$node\", metric=\"txq_min_processing_fee_level\"}", "legendFormat": "Min Processing Fee Level" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_txq_metrics{metric=\"txq_med_fee_level\"}", + "expr": "rippled_txq_metrics{exported_instance=~\"$node\", metric=\"txq_med_fee_level\"}", "legendFormat": "Median Fee Level" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_txq_metrics{metric=\"txq_open_ledger_fee_level\"}", + "expr": "rippled_txq_metrics{exported_instance=~\"$node\", metric=\"txq_open_ledger_fee_level\"}", "legendFormat": "Open Ledger Fee Level" } ], @@ -189,28 +189,28 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_load_factor_metrics{metric=\"load_factor\"}", + "expr": "rippled_load_factor_metrics{exported_instance=~\"$node\", metric=\"load_factor\"}", "legendFormat": "Combined Load Factor" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_load_factor_metrics{metric=\"load_factor_server\"}", + "expr": "rippled_load_factor_metrics{exported_instance=~\"$node\", metric=\"load_factor_server\"}", "legendFormat": "Server" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_load_factor_metrics{metric=\"load_factor_fee_escalation\"}", + "expr": "rippled_load_factor_metrics{exported_instance=~\"$node\", metric=\"load_factor_fee_escalation\"}", "legendFormat": "Fee Escalation" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_load_factor_metrics{metric=\"load_factor_fee_queue\"}", + "expr": "rippled_load_factor_metrics{exported_instance=~\"$node\", metric=\"load_factor_fee_queue\"}", "legendFormat": "Fee Queue" } ], @@ -266,21 +266,21 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_load_factor_metrics{metric=\"load_factor_local\"}", + "expr": "rippled_load_factor_metrics{exported_instance=~\"$node\", metric=\"load_factor_local\"}", "legendFormat": "Local" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_load_factor_metrics{metric=\"load_factor_net\"}", + "expr": "rippled_load_factor_metrics{exported_instance=~\"$node\", metric=\"load_factor_net\"}", "legendFormat": "Network" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_load_factor_metrics{metric=\"load_factor_cluster\"}", + "expr": "rippled_load_factor_metrics{exported_instance=~\"$node\", metric=\"load_factor_cluster\"}", "legendFormat": "Cluster" } ], @@ -303,7 +303,28 @@ "schemaVersion": 39, "tags": ["rippled", "otel", "fee-market"], "templating": { - "list": [] + "list": [ + { + "name": "node", + "label": "Node", + "description": "Filter by rippled node (service.instance.id)", + "type": "query", + "query": "label_values(exported_instance)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "includeAll": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "multi": true, + "refresh": 2, + "sort": 1 + } + ] }, "time": { "from": "now-1h", diff --git a/docker/telemetry/grafana/dashboards/rippled-job-queue.json b/docker/telemetry/grafana/dashboards/rippled-job-queue.json index 1f3a30ca75..f3ae5becf6 100644 --- a/docker/telemetry/grafana/dashboards/rippled-job-queue.json +++ b/docker/telemetry/grafana/dashboards/rippled-job-queue.json @@ -30,21 +30,21 @@ "datasource": { "type": "prometheus" }, - "expr": "sum(rate(rippled_job_queued_total[5m]))", + "expr": "sum(rate(rippled_job_queued_total{exported_instance=~\"$node\"}[5m]))", "legendFormat": "Queued/s" }, { "datasource": { "type": "prometheus" }, - "expr": "sum(rate(rippled_job_started_total[5m]))", + "expr": "sum(rate(rippled_job_started_total{exported_instance=~\"$node\"}[5m]))", "legendFormat": "Started/s" }, { "datasource": { "type": "prometheus" }, - "expr": "sum(rate(rippled_job_finished_total[5m]))", + "expr": "sum(rate(rippled_job_finished_total{exported_instance=~\"$node\"}[5m]))", "legendFormat": "Finished/s" } ], @@ -89,7 +89,7 @@ "datasource": { "type": "prometheus" }, - "expr": "topk(10, rate(rippled_job_queued_total[5m]))", + "expr": "topk(10, rate(rippled_job_queued_total{exported_instance=~\"$node\"}[5m]))", "legendFormat": "{{job_type}}" } ], @@ -134,7 +134,7 @@ "datasource": { "type": "prometheus" }, - "expr": "topk(10, rate(rippled_job_finished_total[5m]))", + "expr": "topk(10, rate(rippled_job_finished_total{exported_instance=~\"$node\"}[5m]))", "legendFormat": "{{job_type}}" } ], @@ -174,21 +174,21 @@ "datasource": { "type": "prometheus" }, - "expr": "histogram_quantile(0.50, sum by (le) (rate(rippled_job_queued_duration_us_bucket[5m])))", + "expr": "histogram_quantile(0.50, sum by (le) (rate(rippled_job_queued_duration_us_bucket{exported_instance=~\"$node\"}[5m])))", "legendFormat": "p50" }, { "datasource": { "type": "prometheus" }, - "expr": "histogram_quantile(0.95, sum by (le) (rate(rippled_job_queued_duration_us_bucket[5m])))", + "expr": "histogram_quantile(0.95, sum by (le) (rate(rippled_job_queued_duration_us_bucket{exported_instance=~\"$node\"}[5m])))", "legendFormat": "p95" }, { "datasource": { "type": "prometheus" }, - "expr": "histogram_quantile(0.99, sum by (le) (rate(rippled_job_queued_duration_us_bucket[5m])))", + "expr": "histogram_quantile(0.99, sum by (le) (rate(rippled_job_queued_duration_us_bucket{exported_instance=~\"$node\"}[5m])))", "legendFormat": "p99" } ], @@ -228,21 +228,21 @@ "datasource": { "type": "prometheus" }, - "expr": "histogram_quantile(0.50, sum by (le) (rate(rippled_job_running_duration_us_bucket[5m])))", + "expr": "histogram_quantile(0.50, sum by (le) (rate(rippled_job_running_duration_us_bucket{exported_instance=~\"$node\"}[5m])))", "legendFormat": "p50" }, { "datasource": { "type": "prometheus" }, - "expr": "histogram_quantile(0.95, sum by (le) (rate(rippled_job_running_duration_us_bucket[5m])))", + "expr": "histogram_quantile(0.95, sum by (le) (rate(rippled_job_running_duration_us_bucket{exported_instance=~\"$node\"}[5m])))", "legendFormat": "p95" }, { "datasource": { "type": "prometheus" }, - "expr": "histogram_quantile(0.99, sum by (le) (rate(rippled_job_running_duration_us_bucket[5m])))", + "expr": "histogram_quantile(0.99, sum by (le) (rate(rippled_job_running_duration_us_bucket{exported_instance=~\"$node\"}[5m])))", "legendFormat": "p99" } ], @@ -287,7 +287,7 @@ "datasource": { "type": "prometheus" }, - "expr": "topk(10, histogram_quantile(0.95, sum by (le, job_type) (rate(rippled_job_running_duration_us_bucket[5m]))))", + "expr": "topk(10, histogram_quantile(0.95, sum by (le, job_type) (rate(rippled_job_running_duration_us_bucket{exported_instance=~\"$node\"}[5m]))))", "legendFormat": "{{job_type}}" } ], @@ -310,7 +310,48 @@ "schemaVersion": 39, "tags": ["rippled", "otel", "job-queue"], "templating": { - "list": [] + "list": [ + { + "name": "node", + "label": "Node", + "description": "Filter by rippled node (service.instance.id)", + "type": "query", + "query": "label_values(exported_instance)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "includeAll": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "multi": true, + "refresh": 2, + "sort": 1 + }, + { + "name": "job_type", + "label": "Job Type", + "description": "Filter by job type", + "type": "query", + "query": "label_values(rippled_job_queued_total, job_type)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "includeAll": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "multi": true, + "refresh": 2, + "sort": 1 + } + ] }, "time": { "from": "now-1h", diff --git a/docker/telemetry/grafana/dashboards/rippled-rpc-perf.json b/docker/telemetry/grafana/dashboards/rippled-rpc-perf.json index d26eae7a6f..5714c6fd06 100644 --- a/docker/telemetry/grafana/dashboards/rippled-rpc-perf.json +++ b/docker/telemetry/grafana/dashboards/rippled-rpc-perf.json @@ -30,21 +30,21 @@ "datasource": { "type": "prometheus" }, - "expr": "sum(rate(rippled_rpc_method_started_total[5m]))", + "expr": "sum(rate(rippled_rpc_method_started_total{exported_instance=~\"$node\"}[5m]))", "legendFormat": "Started/s" }, { "datasource": { "type": "prometheus" }, - "expr": "sum(rate(rippled_rpc_method_finished_total[5m]))", + "expr": "sum(rate(rippled_rpc_method_finished_total{exported_instance=~\"$node\"}[5m]))", "legendFormat": "Finished/s" }, { "datasource": { "type": "prometheus" }, - "expr": "sum(rate(rippled_rpc_method_errored_total[5m]))", + "expr": "sum(rate(rippled_rpc_method_errored_total{exported_instance=~\"$node\"}[5m]))", "legendFormat": "Errored/s" } ], @@ -89,7 +89,7 @@ "datasource": { "type": "prometheus" }, - "expr": "topk(10, rate(rippled_rpc_method_started_total[5m]))", + "expr": "topk(10, rate(rippled_rpc_method_started_total{exported_instance=~\"$node\"}[5m]))", "legendFormat": "{{method}}" } ], @@ -134,7 +134,7 @@ "datasource": { "type": "prometheus" }, - "expr": "topk(10, rate(rippled_rpc_method_errored_total[5m]))", + "expr": "topk(10, rate(rippled_rpc_method_errored_total{exported_instance=~\"$node\"}[5m]))", "legendFormat": "{{method}}" } ], @@ -174,21 +174,21 @@ "datasource": { "type": "prometheus" }, - "expr": "histogram_quantile(0.50, sum by (le) (rate(rippled_rpc_method_duration_us_bucket[5m])))", + "expr": "histogram_quantile(0.50, sum by (le) (rate(rippled_rpc_method_duration_us_bucket{exported_instance=~\"$node\"}[5m])))", "legendFormat": "p50" }, { "datasource": { "type": "prometheus" }, - "expr": "histogram_quantile(0.95, sum by (le) (rate(rippled_rpc_method_duration_us_bucket[5m])))", + "expr": "histogram_quantile(0.95, sum by (le) (rate(rippled_rpc_method_duration_us_bucket{exported_instance=~\"$node\"}[5m])))", "legendFormat": "p95" }, { "datasource": { "type": "prometheus" }, - "expr": "histogram_quantile(0.99, sum by (le) (rate(rippled_rpc_method_duration_us_bucket[5m])))", + "expr": "histogram_quantile(0.99, sum by (le) (rate(rippled_rpc_method_duration_us_bucket{exported_instance=~\"$node\"}[5m])))", "legendFormat": "p99" } ], @@ -233,7 +233,7 @@ "datasource": { "type": "prometheus" }, - "expr": "topk(10, histogram_quantile(0.95, sum by (le, method) (rate(rippled_rpc_method_duration_us_bucket[5m]))))", + "expr": "topk(10, histogram_quantile(0.95, sum by (le, method) (rate(rippled_rpc_method_duration_us_bucket{exported_instance=~\"$node\"}[5m]))))", "legendFormat": "{{method}}" } ], @@ -278,7 +278,7 @@ "datasource": { "type": "prometheus" }, - "expr": "topk(10, rate(rippled_rpc_method_errored_total[5m]) / (rate(rippled_rpc_method_started_total[5m]) > 0))", + "expr": "topk(10, rate(rippled_rpc_method_errored_total{exported_instance=~\"$node\"}[5m]) / (rate(rippled_rpc_method_started_total{exported_instance=~\"$node\"}[5m]) > 0))", "legendFormat": "{{method}}" } ], @@ -319,7 +319,48 @@ "schemaVersion": 39, "tags": ["rippled", "otel", "rpc"], "templating": { - "list": [] + "list": [ + { + "name": "node", + "label": "Node", + "description": "Filter by rippled node (service.instance.id)", + "type": "query", + "query": "label_values(exported_instance)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "includeAll": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "multi": true, + "refresh": 2, + "sort": 1 + }, + { + "name": "method", + "label": "RPC Method", + "description": "Filter by RPC method", + "type": "query", + "query": "label_values(rippled_rpc_method_started_total, method)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "includeAll": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "multi": true, + "refresh": 2, + "sort": 1 + } + ] }, "time": { "from": "now-1h", diff --git a/docker/telemetry/grafana/dashboards/system-node-health.json b/docker/telemetry/grafana/dashboards/system-node-health.json index 0b46b45a6d..8cdb4e20bf 100644 --- a/docker/telemetry/grafana/dashboards/system-node-health.json +++ b/docker/telemetry/grafana/dashboards/system-node-health.json @@ -433,21 +433,21 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_nodestore_state{metric=\"node_reads_total\"}", + "expr": "rippled_nodestore_state{exported_instance=~\"$node\", metric=\"node_reads_total\"}", "legendFormat": "Reads Total" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_nodestore_state{metric=\"node_reads_hit\"}", + "expr": "rippled_nodestore_state{exported_instance=~\"$node\", metric=\"node_reads_hit\"}", "legendFormat": "Reads Hit (cache)" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_nodestore_state{metric=\"node_writes\"}", + "expr": "rippled_nodestore_state{exported_instance=~\"$node\", metric=\"node_writes\"}", "legendFormat": "Writes Total" } ], @@ -487,14 +487,14 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_nodestore_state{metric=\"write_load\"}", + "expr": "rippled_nodestore_state{exported_instance=~\"$node\", metric=\"write_load\"}", "legendFormat": "Write Load" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_nodestore_state{metric=\"read_queue\"}", + "expr": "rippled_nodestore_state{exported_instance=~\"$node\", metric=\"read_queue\"}", "legendFormat": "Read Queue" } ], @@ -562,21 +562,21 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_cache_metrics{metric=\"SLE_hit_rate\"}", + "expr": "rippled_cache_metrics{exported_instance=~\"$node\", metric=\"SLE_hit_rate\"}", "legendFormat": "SLE Hit Rate" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_cache_metrics{metric=\"ledger_hit_rate\"}", + "expr": "rippled_cache_metrics{exported_instance=~\"$node\", metric=\"ledger_hit_rate\"}", "legendFormat": "Ledger Hit Rate" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_cache_metrics{metric=\"AL_hit_rate\"}", + "expr": "rippled_cache_metrics{exported_instance=~\"$node\", metric=\"AL_hit_rate\"}", "legendFormat": "AcceptedLedger Hit Rate" } ], @@ -618,21 +618,21 @@ "datasource": { "type": "prometheus" }, - "expr": "rippled_cache_metrics{metric=\"treenode_cache_size\"}", + "expr": "rippled_cache_metrics{exported_instance=~\"$node\", metric=\"treenode_cache_size\"}", "legendFormat": "TreeNode Cache" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_cache_metrics{metric=\"treenode_track_size\"}", + "expr": "rippled_cache_metrics{exported_instance=~\"$node\", metric=\"treenode_track_size\"}", "legendFormat": "TreeNode Track" }, { "datasource": { "type": "prometheus" }, - "expr": "rippled_cache_metrics{metric=\"fullbelow_size\"}", + "expr": "rippled_cache_metrics{exported_instance=~\"$node\", metric=\"fullbelow_size\"}", "legendFormat": "FullBelow" } ], @@ -689,7 +689,7 @@ "datasource": { "type": "prometheus" }, - "expr": "topk(15, rippled_object_count)", + "expr": "topk(15, rippled_object_count{exported_instance=~\"$node\"})", "legendFormat": "{{type}}" } ],