diff --git a/docker/telemetry/grafana/dashboards/consensus-health.json b/docker/telemetry/grafana/dashboards/consensus-health.json index 318998718f..1b75ce86d0 100644 --- a/docker/telemetry/grafana/dashboards/consensus-health.json +++ b/docker/telemetry/grafana/dashboards/consensus-health.json @@ -742,6 +742,229 @@ }, "overrides": [] } + }, + { + "title": "Consensus Round Duration (Full Round)", + "description": "p95/p50 duration of the full consensus round. The consensus.round span (RCLConsensus.cpp startRound) wraps an entire round end-to-end. Filterable by consensus mode. This is the single most important consensus-health signal; rising round time precedes ledger-age alarms.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 72 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.round\"}[5m])))", + "legendFormat": "P95 Round [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.round\"}[5m])))", + "legendFormat": "P50 Round [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Consensus Phase Duration (Open vs Establish)", + "description": "p95 duration of the open phase (transaction collection) vs the establish phase (proposal convergence). 
The consensus.phase.open and consensus.establish spans decompose round latency, so an operator can tell whether slowness is in collecting transactions or reaching agreement.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 72 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.phase.open\"}[5m])))", + "legendFormat": "P95 Open Phase [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.establish\"}[5m])))", + "legendFormat": "P95 Establish Phase [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Position Update Duration", + "description": "p95/p50 duration of the consensus.update_positions span, which tallies disputes and updates this node's position each round. 
Long durations indicate heavy dispute resolution or slow convergence on close time.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 80 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.update_positions\"}[5m])))", + "legendFormat": "P95 Update [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.5, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.update_positions\"}[5m])))", + "legendFormat": "P50 Update [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Consensus Stall Rate", + "description": "Rate of consensus.check spans, broken down by the consensus_stalled flag (true vs. false). A non-zero stalled rate surfaces stall conditions before they manifest as validated-ledger-age alarms. 
Requires the consensus_stalled spanmetrics dimension.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 80 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.check\", consensus_stalled=\"true\"}[5m]))", + "legendFormat": "Stalled [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.check\", consensus_stalled=\"false\"}[5m]))", + "legendFormat": "Not Stalled [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Checks / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Consensus Mode-Change Rate by Target Mode", + "description": "Rate of consensus.mode_change spans broken down by the mode the node switched INTO (mode_new). Frequent switches into Wrong Ledger or Switched Ledger indicate an unstable node at fork risk. 
Requires the mode_new spanmetrics dimension.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 88 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (mode_new, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.mode_change\"}[5m]))", + "legendFormat": "{{mode_new}} [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Mode Changes / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } } ], "schemaVersion": 39, diff --git a/docker/telemetry/grafana/dashboards/transaction-overview.json b/docker/telemetry/grafana/dashboards/transaction-overview.json index 8b11816959..8f08c6587c 100644 --- a/docker/telemetry/grafana/dashboards/transaction-overview.json +++ b/docker/telemetry/grafana/dashboards/transaction-overview.json @@ -506,6 +506,169 @@ }, "overrides": [] } + }, + { + "title": "TxQ Enqueue Rate by Transaction Type", + "description": "Rate of txq.enqueue spans broken down by transaction type (tx_type). Shows what share of inbound demand is Payment vs OfferCreate vs other transactors, and how the mix shifts as the queue fills. 
A spam burst of one type is a leading indicator of fee escalation.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 48 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (tx_type, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"txq.enqueue\"}[5m]))", + "legendFormat": "{{tx_type}} [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Enqueues / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Queue Bypass Ratio (Direct Apply vs Enqueue)", + "description": "Fraction of transactions that applied directly to the open ledger (txq.apply_direct) out of all transactions that were either applied directly or queued (txq.apply_direct + txq.enqueue). No point is plotted for intervals with no traffic. A falling bypass fraction is the cleanest single signal the network has entered sustained fee escalation.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 48 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"txq.apply_direct\"}[5m])) / (sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=~\"txq[.](apply_direct|enqueue)\"}[5m])) > 0)", + "legendFormat": "Direct-Apply Fraction [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "custom": { + "axisLabel": "Bypass Fraction", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } 
+ }, + { + "title": "Queue Accept (Drain) Duration per Ledger", + "description": "p95/p50 duration of the txq.accept span, which drains queued transactions into a newly closed ledger. Rising drain time signals queue pressure at ledger close.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 56 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"txq.accept\"}[5m])))", + "legendFormat": "P95 Drain [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"txq.accept\"}[5m])))", + "legendFormat": "P50 Drain [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Queue Cleanup Rate (Expired Entries)", + "description": "Rate of txq.cleanup spans, which remove expired transactions from the queue each ledger. 
A rising rate means submitters under-bid the escalating fee and abandoned their transactions \u2014 a demand-frustration signal distinct from acceptance throughput.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 56 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"txq.cleanup\"}[5m]))", + "legendFormat": "Cleanups / Sec [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Cleanups / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } } ], "schemaVersion": 39, diff --git a/docker/telemetry/otel-collector-config.yaml b/docker/telemetry/otel-collector-config.yaml index 60b01388db..36112253b8 100644 --- a/docker/telemetry/otel-collector-config.yaml +++ b/docker/telemetry/otel-collector-config.yaml @@ -63,6 +63,11 @@ connectors: - name: consensus_state - name: load_type - name: is_batch + # Consensus lifecycle dimensions (low cardinality, bounded value sets). + - name: mode_new + - name: consensus_stalled + - name: consensus_phase + - name: consensus_result exporters: debug: