mirror of
https://github.com/XRPLF/rippled.git
synced 2026-06-04 01:06:48 +00:00
Merge branch 'pratik/otel-phase8-log-correlation' into pratik/otel-phase9-metric-gap-fill
Resolve consensus dashboard conflict and remove duplicate consensus_state dimension in collector config. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -656,117 +656,168 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Ledger Total Processing Time (Round Open -> Next Round Start)",
|
||||
"description": "p95/p50 duration of the consensus.round span (full local round: open + establish + accept request) sourced from spanmetrics histograms — values are stable across refreshes because Prometheus rate() over a fixed time window is deterministic, unlike TraceQL search which pages through traces. Accepted vs Rejected apply rates derived from consensus.accept.apply spanmetrics partitioned by consensus_state (finished | moved_on | expired). Note: histogram bucket ceiling is currently 5s (otel-collector-config.yaml spanmetrics histogram.explicit.buckets) — durations longer than 5s land in the +Inf bucket and inflate p95.",
|
||||
"type": "timeseries",
|
||||
"title": "Consensus Outcome Distribution",
|
||||
"description": "Distribution of consensus.accept outcomes: yes (normal), moved_on (without full agreement), expired (timeout). Non-yes outcomes indicate network stress.",
|
||||
"type": "piechart",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 24,
|
||||
"w": 8,
|
||||
"x": 0,
|
||||
"y": 64
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ms",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineInterpolation": "linear",
|
||||
"pointSize": 4,
|
||||
"showPoints": "auto",
|
||||
"axisLabel": "Duration (ms)",
|
||||
"spanNulls": true
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byFrameRefID",
|
||||
"options": "B"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "displayName",
|
||||
"value": "Accepted apply p95"
|
||||
},
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"mode": "fixed",
|
||||
"fixedColor": "green"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byFrameRefID",
|
||||
"options": "C"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "displayName",
|
||||
"value": "Rejected apply p95"
|
||||
},
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"mode": "fixed",
|
||||
"fixedColor": "red"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "bottom",
|
||||
"calcs": ["mean", "max", "count"]
|
||||
"placement": "right",
|
||||
"values": ["value", "percent"]
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.round\"}[5m])))",
|
||||
"legendFormat": "Round Total p95 [{{exported_instance}}]",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.round\"}[5m])))",
|
||||
"legendFormat": "Round Total p50 [{{exported_instance}}]",
|
||||
"refId": "D"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.accept.apply\", consensus_state=\"finished\"}[5m])))",
|
||||
"legendFormat": "Accepted apply p95 [{{exported_instance}}]",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.accept.apply\", consensus_state=~\"moved_on|expired\"}[5m])))",
|
||||
"legendFormat": "Rejected apply p95 [{{exported_instance}}]",
|
||||
"refId": "C"
|
||||
"expr": "sum by (consensus_state) (increase(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.accept\", consensus_state!=\"\"}[5m]))",
|
||||
"legendFormat": "{{consensus_state}}"
|
||||
}
|
||||
]
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Consensus Failures Over Time",
|
||||
"description": "Rate of non-normal consensus outcomes (moved_on + expired). Spikes indicate consensus instability.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 16,
|
||||
"x": 8,
|
||||
"y": 64
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.accept\", consensus_state=\"moved_on\"}[5m]))",
|
||||
"legendFormat": "moved_on [{{exported_instance}}]"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.accept\", consensus_state=\"expired\"}[5m]))",
|
||||
"legendFormat": "expired [{{exported_instance}}]"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"custom": {
|
||||
"axisLabel": "Failures / Sec",
|
||||
"spanNulls": true,
|
||||
"insertNulls": false,
|
||||
"showPoints": "auto",
|
||||
"pointSize": 3
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Consensus Outcome Distribution",
|
||||
"description": "Distribution of consensus.accept outcomes: yes (normal), moved_on (without full agreement), expired (timeout). Non-yes outcomes indicate network stress.",
|
||||
"type": "piechart",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 8,
|
||||
"x": 0,
|
||||
"y": 72
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"values": ["value", "percent"]
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "sum by (consensus_state) (increase(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.accept\", consensus_state!=\"\"}[5m]))",
|
||||
"legendFormat": "{{consensus_state}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Consensus Failures Over Time",
|
||||
"description": "Rate of non-normal consensus outcomes (moved_on + expired). Spikes indicate consensus instability.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 16,
|
||||
"x": 8,
|
||||
"y": 72
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.accept\", consensus_state=\"moved_on\"}[5m]))",
|
||||
"legendFormat": "moved_on [{{exported_instance}}]"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.accept\", consensus_state=\"expired\"}[5m]))",
|
||||
"legendFormat": "expired [{{exported_instance}}]"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"custom": {
|
||||
"axisLabel": "Failures / Sec",
|
||||
"spanNulls": true,
|
||||
"insertNulls": false,
|
||||
"showPoints": "auto",
|
||||
"pointSize": 3
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
@@ -885,6 +936,5 @@
|
||||
"to": "now"
|
||||
},
|
||||
"title": "Consensus Health",
|
||||
"uid": "xrpld-consensus",
|
||||
"refresh": "5s"
|
||||
"uid": "xrpld-consensus"
|
||||
}
|
||||
|
||||
@@ -319,6 +319,96 @@
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "RPC Resource Cost by Command",
|
||||
"description": "RPC commands grouped by load_type (resource cost category). High-cost categories like exception_rpc or malformed_rpc indicate problematic clients.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 32
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": ["mean", "lastNotNull"]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "sum by (load_type) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=~\"rpc.command.*\", load_type!=\"\"}[5m]))",
|
||||
"legendFormat": "{{load_type}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"custom": {
|
||||
"axisLabel": "Requests / Sec",
|
||||
"spanNulls": true,
|
||||
"insertNulls": false,
|
||||
"showPoints": "auto",
|
||||
"pointSize": 3
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Batch vs Single RPC Requests",
|
||||
"description": "Rate of batch RPC requests vs single requests. High batch rate may indicate bulk automation clients.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 32
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"rpc.process\", is_batch=\"true\"}[5m]))",
|
||||
"legendFormat": "Batch [{{exported_instance}}]"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"rpc.process\", is_batch=\"false\"}[5m]))",
|
||||
"legendFormat": "Single [{{exported_instance}}]"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"custom": {
|
||||
"axisLabel": "Requests / Sec",
|
||||
"spanNulls": true,
|
||||
"insertNulls": false,
|
||||
"showPoints": "auto",
|
||||
"pointSize": 3
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
|
||||
@@ -327,6 +327,174 @@
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Transaction Rate by Type",
|
||||
"description": "Transaction processing rate broken down by tx_type (Payment, OfferCreate, AMMDeposit, etc.). Requires tx_type dimension in spanmetrics.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 32
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": ["mean", "lastNotNull"]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "sum by (tx_type) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"tx.process\", tx_type!=\"\"}[5m]))",
|
||||
"legendFormat": "{{tx_type}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"custom": {
|
||||
"axisLabel": "TX / Sec",
|
||||
"spanNulls": true,
|
||||
"insertNulls": false,
|
||||
"showPoints": "auto",
|
||||
"pointSize": 3
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Transaction Results by Type",
|
||||
"description": "Transaction result codes (ter_result) broken down by tx_type. Shows which transaction types fail most often.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 32
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": ["mean", "lastNotNull"]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "sum by (tx_type, ter_result) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"tx.process\", ter_result!=\"\", ter_result!=\"tesSUCCESS\"}[5m]))",
|
||||
"legendFormat": "{{tx_type}}: {{ter_result}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"custom": {
|
||||
"axisLabel": "Failed TX / Sec",
|
||||
"spanNulls": true,
|
||||
"insertNulls": false,
|
||||
"showPoints": "auto",
|
||||
"pointSize": 3
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "TxQ Accept Status",
|
||||
"description": "TxQ accept outcomes: applied (included in ledger), failed (removed), retried (kept for next round).",
|
||||
"type": "piechart",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 8,
|
||||
"x": 0,
|
||||
"y": 40
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"values": ["value", "percent"]
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "sum by (txq_status) (increase(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"txq.accept_tx\", txq_status!=\"\"}[5m]))",
|
||||
"legendFormat": "{{txq_status}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Transactor Duration by Type (p95)",
|
||||
"description": "Per-transactor execution time (tx.transactor span). Shows which transaction types are most expensive to execute.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 16,
|
||||
"x": 8,
|
||||
"y": 40
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": ["mean", "max"]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "histogram_quantile(0.95, sum by (le, tx_type) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"tx.transactor\", tx_type!=\"\"}[5m])))",
|
||||
"legendFormat": "p95 {{tx_type}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ms",
|
||||
"custom": {
|
||||
"axisLabel": "Duration (ms)",
|
||||
"spanNulls": true,
|
||||
"insertNulls": false,
|
||||
"showPoints": "auto",
|
||||
"pointSize": 3
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
|
||||
@@ -92,6 +92,11 @@ connectors:
|
||||
- name: suppressed
|
||||
- name: proposal_trusted
|
||||
- name: validation_trusted
|
||||
- name: tx_type
|
||||
- name: ter_result
|
||||
- name: txq_status
|
||||
- name: load_type
|
||||
- name: is_batch
|
||||
|
||||
exporters:
|
||||
debug:
|
||||
|
||||
Reference in New Issue
Block a user