diff --git a/docker/telemetry/grafana/dashboards/consensus-health.json b/docker/telemetry/grafana/dashboards/consensus-health.json index 97a972c02f..587c8ba59d 100644 --- a/docker/telemetry/grafana/dashboards/consensus-health.json +++ b/docker/telemetry/grafana/dashboards/consensus-health.json @@ -656,117 +656,168 @@ ] }, { - "title": "Ledger Total Processing Time (Round Open -> Next Round Start)", - "description": "p95/p50 duration of the consensus.round span (full local round: open + establish + accept request) sourced from spanmetrics histograms — values are stable across refreshes because Prometheus rate() over a fixed time window is deterministic, unlike TraceQL search which pages through traces. Accepted vs Rejected apply rates derived from consensus.accept.apply spanmetrics partitioned by consensus_state (finished | moved_on | expired). Note: histogram bucket ceiling is currently 5s (otel-collector-config.yaml spanmetrics histogram.explicit.buckets) — durations longer than 5s land in the +Inf bucket and inflate p95.", - "type": "timeseries", + "title": "Consensus Outcome Distribution", + "description": "Distribution of consensus.accept outcomes: yes (normal), moved_on (without full agreement), expired (timeout). Non-yes outcomes indicate network stress.", + "type": "piechart", "gridPos": { "h": 8, - "w": 24, + "w": 8, "x": 0, "y": 64 }, - "fieldConfig": { - "defaults": { - "unit": "ms", - "custom": { - "drawStyle": "line", - "lineInterpolation": "linear", - "pointSize": 4, - "showPoints": "auto", - "axisLabel": "Duration (ms)", - "spanNulls": true - } - }, - "overrides": [ - { - "matcher": { - "id": "byFrameRefID", - "options": "B" - }, - "properties": [ - { - "id": "displayName", - "value": "Accepted apply p95" - }, - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "green" - } - } - ] - }, - { - "matcher": { - "id": "byFrameRefID", - "options": "C" - }, - "properties": [ - { - "id": "displayName", - "value": "Rejected apply p95" - }, - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "red" - } - } - ] - } - ] - }, "options": { - "tooltip": { - "mode": "multi", - "sort": "desc" - }, "legend": { "displayMode": "table", - "placement": "bottom", - "calcs": ["mean", "max", "count"] + "placement": "right", + "values": ["value", "percent"] + }, + "tooltip": { + "mode": "multi" } }, "targets": [ { "datasource": { - "type": "prometheus", - "uid": "prometheus" + "type": "prometheus" }, - "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.round\"}[5m])))", - "legendFormat": "Round Total p95 [{{exported_instance}}]", - "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.round\"}[5m])))", - "legendFormat": "Round Total p50 [{{exported_instance}}]", - "refId": "D" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.accept.apply\", consensus_state=\"finished\"}[5m])))", - "legendFormat": "Accepted apply p95 [{{exported_instance}}]", - "refId": "B" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.accept.apply\", consensus_state=~\"moved_on|expired\"}[5m])))", - "legendFormat": "Rejected apply p95 [{{exported_instance}}]", - "refId": "C" + "expr": "sum by (consensus_state) (increase(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.accept\", consensus_state!=\"\"}[5m]))", + "legendFormat": "{{consensus_state}}" } - ] + ], + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [] + } + }, + { + "title": "Consensus Failures Over Time", + "description": "Rate of non-normal consensus outcomes (moved_on + expired). Spikes indicate consensus instability.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 16, + "x": 8, + "y": 64 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.accept\", consensus_state=\"moved_on\"}[5m]))", + "legendFormat": "moved_on [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.accept\", consensus_state=\"expired\"}[5m]))", + "legendFormat": "expired [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Failures / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Consensus Outcome Distribution", + "description": "Distribution of consensus.accept outcomes: yes (normal), moved_on (without full agreement), expired (timeout). Non-yes outcomes indicate network stress.", + "type": "piechart", + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 72 + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "values": ["value", "percent"] + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (consensus_state) (increase(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.accept\", consensus_state!=\"\"}[5m]))", + "legendFormat": "{{consensus_state}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [] + } + }, + { + "title": "Consensus Failures Over Time", + "description": "Rate of non-normal consensus outcomes (moved_on + expired). Spikes indicate consensus instability.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 16, + "x": 8, + "y": 72 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.accept\", consensus_state=\"moved_on\"}[5m]))", + "legendFormat": "moved_on [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.accept\", consensus_state=\"expired\"}[5m]))", + "legendFormat": "expired [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Failures / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } } ], "schemaVersion": 39, @@ -885,6 +936,5 @@ "to": "now" }, "title": "Consensus Health", - "uid": "xrpld-consensus", - "refresh": "5s" + "uid": "xrpld-consensus" } diff --git a/docker/telemetry/grafana/dashboards/rpc-performance.json b/docker/telemetry/grafana/dashboards/rpc-performance.json index 4e2af92992..85a9117dc8 100644 --- a/docker/telemetry/grafana/dashboards/rpc-performance.json +++ b/docker/telemetry/grafana/dashboards/rpc-performance.json @@ -319,6 +319,96 @@ }, "overrides": [] } + }, + { + "title": "RPC Resource Cost by Command", + "description": "RPC commands grouped by load_type (resource cost category). High-cost categories like exception_rpc or malformed_rpc indicate problematic clients.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 32 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": ["mean", "lastNotNull"] + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (load_type) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=~\"rpc.command.*\", load_type!=\"\"}[5m]))", + "legendFormat": "{{load_type}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Requests / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Batch vs Single RPC Requests", + "description": "Rate of batch RPC requests vs single requests. High batch rate may indicate bulk automation clients.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"rpc.process\", is_batch=\"true\"}[5m]))", + "legendFormat": "Batch [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"rpc.process\", is_batch=\"false\"}[5m]))", + "legendFormat": "Single [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Requests / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } } ], "schemaVersion": 39, diff --git a/docker/telemetry/grafana/dashboards/transaction-overview.json b/docker/telemetry/grafana/dashboards/transaction-overview.json index 1da27cdc7a..83cf153b20 100644 --- a/docker/telemetry/grafana/dashboards/transaction-overview.json +++ b/docker/telemetry/grafana/dashboards/transaction-overview.json @@ -327,6 +327,174 @@ }, "overrides": [] } + }, + { + "title": "Transaction Rate by Type", + "description": "Transaction processing rate broken down by tx_type (Payment, OfferCreate, AMMDeposit, etc.). Requires tx_type dimension in spanmetrics.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 32 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": ["mean", "lastNotNull"] + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (tx_type) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"tx.process\", tx_type!=\"\"}[5m]))", + "legendFormat": "{{tx_type}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "TX / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Transaction Results by Type", + "description": "Transaction result codes (ter_result) broken down by tx_type. Shows which transaction types fail most often.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": ["mean", "lastNotNull"] + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (tx_type, ter_result) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"tx.process\", ter_result!=\"\", ter_result!=\"tesSUCCESS\"}[5m]))", + "legendFormat": "{{tx_type}}: {{ter_result}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Failed TX / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "TxQ Accept Status", + "description": "TxQ accept outcomes: applied (included in ledger), failed (removed), retried (kept for next round).", + "type": "piechart", + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 40 + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "values": ["value", "percent"] + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (txq_status) (increase(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"txq.accept_tx\", txq_status!=\"\"}[5m]))", + "legendFormat": "{{txq_status}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [] + } + }, + { + "title": "Transactor Duration by Type (p95)", + "description": "Per-transactor execution time (tx.transactor span). Shows which transaction types are most expensive to execute.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 16, + "x": 8, + "y": 40 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": ["mean", "max"] + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, tx_type) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"tx.transactor\", tx_type!=\"\"}[5m])))", + "legendFormat": "p95 {{tx_type}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } } ], "schemaVersion": 39, diff --git a/docker/telemetry/otel-collector-config.yaml b/docker/telemetry/otel-collector-config.yaml index 4e32467cc1..656c7d0c8e 100644 --- a/docker/telemetry/otel-collector-config.yaml +++ b/docker/telemetry/otel-collector-config.yaml @@ -92,6 +92,11 @@ connectors: - name: suppressed - name: proposal_trusted - name: validation_trusted + - name: tx_type + - name: ter_result + - name: txq_status + - name: load_type + - name: is_batch exporters: debug: