diff --git a/docker/telemetry/grafana/dashboards/consensus-health.json b/docker/telemetry/grafana/dashboards/consensus-health.json index 2d4597533b..fecc7250cc 100644 --- a/docker/telemetry/grafana/dashboards/consensus-health.json +++ b/docker/telemetry/grafana/dashboards/consensus-health.json @@ -654,6 +654,88 @@ "refId": "A" } ] + }, + { + "title": "Consensus Outcome Distribution", + "description": "Distribution of consensus.accept outcomes: yes (normal), moved_on (without full agreement), expired (timeout). Non-yes outcomes indicate network stress.", + "type": "piechart", + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 64 + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "values": ["value", "percent"] + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (consensus_state) (increase(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.accept\", consensus_state!=\"\"}[5m]))", + "legendFormat": "{{consensus_state}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [] + } + }, + { + "title": "Consensus Failures Over Time", + "description": "Rate of non-normal consensus outcomes (moved_on + expired). Spikes indicate consensus instability.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 16, + "x": 8, + "y": 64 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.accept\", consensus_state=\"moved_on\"}[5m]))", + "legendFormat": "moved_on [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.accept\", consensus_state=\"expired\"}[5m]))", + "legendFormat": "expired [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Failures / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } } ], "schemaVersion": 39, diff --git a/docker/telemetry/grafana/dashboards/rpc-performance.json b/docker/telemetry/grafana/dashboards/rpc-performance.json index a90983b2e8..a234c7ec9b 100644 --- a/docker/telemetry/grafana/dashboards/rpc-performance.json +++ b/docker/telemetry/grafana/dashboards/rpc-performance.json @@ -319,6 +319,96 @@ }, "overrides": [] } + }, + { + "title": "RPC Resource Cost by Command", + "description": "RPC commands grouped by load_type (resource cost category). High-cost categories like exception_rpc or malformed_rpc indicate problematic clients.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 32 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": ["mean", "lastNotNull"] + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (load_type) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=~\"rpc.command.*\", load_type!=\"\"}[5m]))", + "legendFormat": "{{load_type}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Requests / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Batch vs Single RPC Requests", + "description": "Rate of batch RPC requests vs single requests. High batch rate may indicate bulk automation clients.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"rpc.process\", is_batch=\"true\"}[5m]))", + "legendFormat": "Batch [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"rpc.process\", is_batch=\"false\"}[5m]))", + "legendFormat": "Single [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Requests / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } } ], "schemaVersion": 39, @@ -328,7 +418,7 @@ { "name": "node", "label": "Node", - "description": "Filter by rippled node (service.instance.id — e.g. Node-1)", + "description": "Filter by rippled node (service.instance.id \u2014 e.g. Node-1)", "type": "query", "query": "label_values(traces_span_metrics_calls_total, exported_instance)", "datasource": { diff --git a/docker/telemetry/grafana/dashboards/transaction-overview.json b/docker/telemetry/grafana/dashboards/transaction-overview.json index 0de4adf8a3..5c40c8b3fd 100644 --- a/docker/telemetry/grafana/dashboards/transaction-overview.json +++ b/docker/telemetry/grafana/dashboards/transaction-overview.json @@ -327,6 +327,174 @@ }, "overrides": [] } + }, + { + "title": "Transaction Rate by Type", + "description": "Transaction processing rate broken down by tx_type (Payment, OfferCreate, AMMDeposit, etc.). Requires tx_type dimension in spanmetrics.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 32 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": ["mean", "lastNotNull"] + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (tx_type) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"tx.process\", tx_type!=\"\"}[5m]))", + "legendFormat": "{{tx_type}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "TX / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Transaction Results by Type", + "description": "Transaction result codes (ter_result) broken down by tx_type. Shows which transaction types fail most often.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": ["mean", "lastNotNull"] + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (tx_type, ter_result) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"tx.process\", ter_result!=\"\", ter_result!=\"tesSUCCESS\"}[5m]))", + "legendFormat": "{{tx_type}}: {{ter_result}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Failed TX / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "TxQ Accept Status", + "description": "TxQ accept outcomes: applied (included in ledger), failed (removed), retried (kept for next round).", + "type": "piechart", + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 40 + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "values": ["value", "percent"] + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (txq_status) (increase(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"txq.accept_tx\", txq_status!=\"\"}[5m]))", + "legendFormat": "{{txq_status}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [] + } + }, + { + "title": "Transactor Duration by Type (p95)", + "description": "Per-transactor execution time (tx.transactor span). Shows which transaction types are most expensive to execute.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 16, + "x": 8, + "y": 40 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": ["mean", "max"] + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, tx_type) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"tx.transactor\", tx_type!=\"\"}[5m])))", + "legendFormat": "p95 {{tx_type}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } } ], "schemaVersion": 39, @@ -336,7 +504,7 @@ { "name": "node", "label": "Node", - "description": "Filter by rippled node (service.instance.id — e.g. Node-1)", + "description": "Filter by rippled node (service.instance.id \u2014 e.g. Node-1)", "type": "query", "query": "label_values(traces_span_metrics_calls_total, exported_instance)", "datasource": { diff --git a/docker/telemetry/otel-collector-config.yaml b/docker/telemetry/otel-collector-config.yaml index fb87638b14..08d83457b4 100644 --- a/docker/telemetry/otel-collector-config.yaml +++ b/docker/telemetry/otel-collector-config.yaml @@ -39,6 +39,12 @@ connectors: - name: suppressed - name: proposal_trusted - name: validation_trusted + - name: tx_type + - name: ter_result + - name: txq_status + - name: consensus_state + - name: load_type + - name: is_batch exporters: debug: