From 93caaba5cacaec3a2218aaf40fca626737075e34 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Thu, 14 May 2026 12:33:18 +0100 Subject: [PATCH] =?UTF-8?q?fix(telemetry):=20recover=20Phase=206=20dashboa?= =?UTF-8?q?rd=20panels=20lost=20during=20statsd=E2=86=92system=20rename?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Panels 8-15 from statsd-node-health.json and panels 8-9 from statsd-network-traffic.json were lost when Phase 7 renamed these files to system-*. The merge (5cd71ed107) took Phase 7's smaller version without the extra panels added by commit b933e8ae00 on Phase 6. Recovered panels (system-node-health.json): - Key Jobs Execution Time (11 job types) - Key Jobs Dequeue Wait Time (11 job types) - FullBelowCache Size - FullBelowCache Hit Rate - Ledger Publish Gap (published - validated age delta) - State Duration Rate (Full vs Tracking) - All Jobs Execution Time Detail (35 job types) - All Jobs Dequeue Wait Detail (35 job types) Recovered panels (system-network-traffic.json): - Duplicate Traffic (Wasted Bandwidth) - All Traffic Categories Detail (topk 15 by byte rate) All recovered panels updated to include exported_instance=~"$node" filter per project dashboard guidelines.
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../dashboards/system-network-traffic.json | 113 ++++ .../dashboards/system-node-health.json | 495 +++++++++++++++++- 2 files changed, 606 insertions(+), 2 deletions(-) diff --git a/docker/telemetry/grafana/dashboards/system-network-traffic.json b/docker/telemetry/grafana/dashboards/system-network-traffic.json index 82faa28476..7cc2dfd1ea 100644 --- a/docker/telemetry/grafana/dashboards/system-network-traffic.json +++ b/docker/telemetry/grafana/dashboards/system-network-traffic.json @@ -655,6 +655,119 @@ } ] } + }, + { + "title": "Duplicate Traffic (Wasted Bandwidth)", + "description": "Rate of duplicate overlay traffic across transaction, proposal, and validation categories. Duplicate messages are messages the node has already seen and discards. High duplicate rates indicate inefficient message routing or network topology issues causing redundant relays.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 32 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rate(rippled_transactions_duplicate_Bytes_In{exported_instance=~\"$node\"}[5m])", + "legendFormat": "TX Duplicate In" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rate(rippled_transactions_duplicate_Bytes_Out{exported_instance=~\"$node\"}[5m])", + "legendFormat": "TX Duplicate Out" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rate(rippled_proposals_duplicate_Bytes_In{exported_instance=~\"$node\"}[5m])", + "legendFormat": "Proposals Duplicate In" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rate(rippled_proposals_duplicate_Bytes_Out{exported_instance=~\"$node\"}[5m])", + "legendFormat": "Proposals Duplicate Out" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rate(rippled_validations_duplicate_Bytes_In{exported_instance=~\"$node\"}[5m])", + 
"legendFormat": "Validations Duplicate In" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rate(rippled_validations_duplicate_Bytes_Out{exported_instance=~\"$node\"}[5m])", + "legendFormat": "Validations Duplicate Out" + } + ], + "fieldConfig": { + "defaults": { + "unit": "Bps", + "custom": { + "axisLabel": "Throughput", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "All Traffic Categories (Detail)", + "description": "Top 15 traffic categories by inbound byte rate, excluding the total aggregate. Provides a detailed timeseries view of which overlay message types are consuming the most bandwidth over time. Complements the bar gauge snapshot view in the Overlay Traffic panel.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "topk(15, rate({__name__=~\"rippled_.*_Bytes_In\", __name__!~\"rippled_total_.*\", exported_instance=~\"$node\"}[5m]))", + "legendFormat": "{{__name__}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "Bps", + "custom": { + "axisLabel": "Throughput", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + } ], "schemaVersion": 39, diff --git a/docker/telemetry/grafana/dashboards/system-node-health.json b/docker/telemetry/grafana/dashboards/system-node-health.json index 456c62b2e1..96775edb7e 100644 --- a/docker/telemetry/grafana/dashboards/system-node-health.json +++ b/docker/telemetry/grafana/dashboards/system-node-health.json @@ -160,7 +160,7 @@ ], "fieldConfig": { "defaults": { - "unit": "µs", + "unit": "\u00b5s", "custom": { "axisLabel": "Duration", "spanNulls": true, @@ -287,7 +287,7 @@ }, { "title": "Job Queue Depth", - "description": "Current number of jobs waiting in the job queue.
Sourced from the job_count gauge (JobQueue.cpp:26). A sustained high value indicates the node cannot process work fast enough — common during ledger replay or heavy RPC load.", + "description": "Current number of jobs waiting in the job queue. Sourced from the job_count gauge (JobQueue.cpp:26). A sustained high value indicates the node cannot process work fast enough \u2014 common during ledger replay or heavy RPC load.", "type": "timeseries", "gridPos": { "h": 8, @@ -399,6 +399,497 @@ }, "overrides": [] } + }, + { + "title": "--- Extended Metrics (Recovered from Phase 6) ---", + "type": "row", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 32 + }, + "collapsed": false, + "panels": [] + }, + { + "title": "Key Jobs Execution Time", + "description": "Execution time for critical job types at the selected quantile. Sourced from per-job-type events in JobTypeData (JobTypeData.h:48). Shows how long key consensus, transaction, and maintenance jobs take to execute. Spikes indicate processing bottlenecks.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 33 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_acceptLedger{quantile=\"$quantile\", exported_instance=~\"$node\"}", + "legendFormat": "Accept Ledger [{{quantile}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_advanceLedger{quantile=\"$quantile\", exported_instance=~\"$node\"}", + "legendFormat": "Advance Ledger [{{quantile}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_transaction{quantile=\"$quantile\", exported_instance=~\"$node\"}", + "legendFormat": "Transaction [{{quantile}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_writeObjects{quantile=\"$quantile\", exported_instance=~\"$node\"}", + "legendFormat": "Write Objects [{{quantile}}]" + }, + { + "datasource": { + "type": 
"prometheus" + }, + "expr": "rippled_heartbeat{quantile=\"$quantile\", exported_instance=~\"$node\"}", + "legendFormat": "Heartbeat [{{quantile}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_sweep{quantile=\"$quantile\", exported_instance=~\"$node\"}", + "legendFormat": "Sweep [{{quantile}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_trustedValidation{quantile=\"$quantile\", exported_instance=~\"$node\"}", + "legendFormat": "Trusted Validation [{{quantile}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_trustedProposal{quantile=\"$quantile\", exported_instance=~\"$node\"}", + "legendFormat": "Trusted Proposal [{{quantile}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_publishNewLedger{quantile=\"$quantile\", exported_instance=~\"$node\"}", + "legendFormat": "Publish New Ledger [{{quantile}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_clientRPC{quantile=\"$quantile\", exported_instance=~\"$node\"}", + "legendFormat": "Client RPC [{{quantile}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_ledgerData{quantile=\"$quantile\", exported_instance=~\"$node\"}", + "legendFormat": "Ledger Data [{{quantile}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Key Jobs Dequeue Wait Time", + "description": "Time spent waiting in the job queue before execution for critical job types. Sourced from per-job-type dequeue events (JobTypeData.h:47). 
High dequeue times indicate the job queue is backlogged and jobs are waiting too long to be scheduled.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 33 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_acceptLedger_q{quantile=\"$quantile\", exported_instance=~\"$node\"}", + "legendFormat": "Accept Ledger [{{quantile}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_advanceLedger_q{quantile=\"$quantile\", exported_instance=~\"$node\"}", + "legendFormat": "Advance Ledger [{{quantile}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_transaction_q{quantile=\"$quantile\", exported_instance=~\"$node\"}", + "legendFormat": "Transaction [{{quantile}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_writeObjects_q{quantile=\"$quantile\", exported_instance=~\"$node\"}", + "legendFormat": "Write Objects [{{quantile}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_heartbeat_q{quantile=\"$quantile\", exported_instance=~\"$node\"}", + "legendFormat": "Heartbeat [{{quantile}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_sweep_q{quantile=\"$quantile\", exported_instance=~\"$node\"}", + "legendFormat": "Sweep [{{quantile}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_trustedValidation_q{quantile=\"$quantile\", exported_instance=~\"$node\"}", + "legendFormat": "Trusted Validation [{{quantile}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_trustedProposal_q{quantile=\"$quantile\", exported_instance=~\"$node\"}", + "legendFormat": "Trusted Proposal [{{quantile}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_publishNewLedger_q{quantile=\"$quantile\", exported_instance=~\"$node\"}", + "legendFormat": "Publish 
New Ledger [{{quantile}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_clientRPC_q{quantile=\"$quantile\", exported_instance=~\"$node\"}", + "legendFormat": "Client RPC [{{quantile}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_ledgerData_q{quantile=\"$quantile\", exported_instance=~\"$node\"}", + "legendFormat": "Ledger Data [{{quantile}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Wait Time (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "FullBelowCache Size", + "description": "Number of entries in the FullBelowCache. Sourced from the TaggedCache size gauge (TaggedCache.h:183) for the Node family full below cache (NodeFamily.cpp:29). This cache tracks which SHAMap nodes have all children present locally, avoiding redundant fetches during ledger acquisition.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 41 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_Node_family_full_below_cache_size{exported_instance=~\"$node\"}", + "legendFormat": "FullBelowCache Size" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "axisLabel": "Entries", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "FullBelowCache Hit Rate", + "description": "Hit rate percentage for the FullBelowCache. Sourced from the TaggedCache hit_rate gauge (TaggedCache.h:184). A high hit rate means the node is efficiently reusing cached knowledge about complete SHAMap subtrees. 
Low hit rates during steady state warrant investigation.", + "type": "gauge", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 41 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_Node_family_full_below_cache_hit_rate{exported_instance=~\"$node\"}", + "legendFormat": "Hit Rate" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "thresholds": { + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 25 + }, + { + "color": "green", + "value": 50 + } + ] + } + }, + "overrides": [] + } + }, + { + "title": "Ledger Publish Gap", + "description": "Difference between published and validated ledger ages. Computed as Published_Ledger_Age minus Validated_Ledger_Age. A value near zero means the publish pipeline keeps up with validation. A growing gap indicates the publish pipeline is falling behind, potentially causing stale data for subscribers.", + "type": "stat", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 49 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_LedgerMaster_Published_Ledger_Age{exported_instance=~\"$node\"} - rippled_LedgerMaster_Validated_Ledger_Age{exported_instance=~\"$node\"}", + "legendFormat": "Publish Gap" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 5 + }, + { + "color": "red", + "value": 10 + } + ] + } + }, + "overrides": [] + } + }, + { + "title": "State Duration Rate (Full vs Tracking)", + "description": "Rate of change of time spent in Full and Tracking operating modes, normalized to seconds. Sourced from State_Accounting duration gauges (NetworkOPs.cpp:774-778). 
In steady state the Full duration rate should be close to 1.0 (gaining one second of Full-mode time per wall-clock second). A drop below 1.0 means the node is spending time in other modes.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 49 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rate(rippled_State_Accounting_Full_duration{exported_instance=~\"$node\"}[5m]) / 1000000", + "legendFormat": "Full Mode Rate" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rate(rippled_State_Accounting_Tracking_duration{exported_instance=~\"$node\"}[5m]) / 1000000", + "legendFormat": "Tracking Mode Rate" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "axisLabel": "Rate (s/s)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "All Jobs Execution Time (Detail)", + "description": "Execution time for ALL non-special job types at the selected quantile. Shows the complete picture of job execution performance. 
Use the Key Jobs panel for a focused view of the most critical jobs.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 57 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "{__name__=~\"rippled_(makeFetchPack|publishAcqLedger|untrustedValidation|manifest|localTransaction|ledgerReplayRequest|ledgerRequest|untrustedProposal|ledgerReplayTask|ledgerData|clientCommand|clientSubscribe|clientFeeChange|clientConsensus|clientAccountHistory|clientRPC|clientWebsocket|RPC|updatePaths|transaction|batch|advanceLedger|publishNewLedger|fetchTxnData|writeAhead|trustedValidation|writeObjects|acceptLedger|trustedProposal|sweep|clusterReport|heartbeat|administration|handleHaveTransactions|doTransactions)\", quantile=\"$quantile\", exported_instance=~\"$node\"}", + "legendFormat": "{{__name__}} [{{quantile}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "All Jobs Dequeue Wait (Detail)", + "description": "Dequeue wait time for ALL non-special job types at the selected quantile. Shows the complete picture of job queue waiting times. 
High wait times across many job types indicate systemic job queue congestion.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 65 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "{__name__=~\"rippled_(makeFetchPack_q|publishAcqLedger_q|untrustedValidation_q|manifest_q|localTransaction_q|ledgerReplayRequest_q|ledgerRequest_q|untrustedProposal_q|ledgerReplayTask_q|ledgerData_q|clientCommand_q|clientSubscribe_q|clientFeeChange_q|clientConsensus_q|clientAccountHistory_q|clientRPC_q|clientWebsocket_q|RPC_q|updatePaths_q|transaction_q|batch_q|advanceLedger_q|publishNewLedger_q|fetchTxnData_q|writeAhead_q|trustedValidation_q|writeObjects_q|acceptLedger_q|trustedProposal_q|sweep_q|clusterReport_q|heartbeat_q|administration_q|handleHaveTransactions_q|doTransactions_q)\", quantile=\"$quantile\", exported_instance=~\"$node\"}", + "legendFormat": "{{__name__}} [{{quantile}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Wait Time (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } } ], "schemaVersion": 39,