Merge branch 'pratik/otel-phase9-metric-gap-fill' into pratik/otel-phase10-workload-validation

This commit is contained in:
Pratik Mankawde
2026-06-03 16:53:21 +01:00
4 changed files with 411 additions and 98 deletions

View File

@@ -656,117 +656,168 @@
]
},
{
"title": "Ledger Total Processing Time (Round Open -> Next Round Start)",
"description": "p95/p50 duration of the consensus.round span (full local round: open + establish + accept request) sourced from spanmetrics histograms — values are stable across refreshes because Prometheus rate() over a fixed time window is deterministic, unlike TraceQL search which pages through traces. Accepted vs Rejected apply rates derived from consensus.accept.apply spanmetrics partitioned by consensus_state (finished | moved_on | expired). Note: histogram bucket ceiling is currently 5s (otel-collector-config.yaml spanmetrics histogram.explicit.buckets) — durations longer than 5s land in the +Inf bucket and inflate p95.",
"type": "timeseries",
"title": "Consensus Outcome Distribution",
"description": "Distribution of consensus.accept outcomes: yes (normal), moved_on (without full agreement), expired (timeout). Non-yes outcomes indicate network stress.",
"type": "piechart",
"gridPos": {
"h": 8,
"w": 24,
"w": 8,
"x": 0,
"y": 64
},
"fieldConfig": {
"defaults": {
"unit": "ms",
"custom": {
"drawStyle": "line",
"lineInterpolation": "linear",
"pointSize": 4,
"showPoints": "auto",
"axisLabel": "Duration (ms)",
"spanNulls": true
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID",
"options": "B"
},
"properties": [
{
"id": "displayName",
"value": "Accepted apply p95"
},
{
"id": "color",
"value": {
"mode": "fixed",
"fixedColor": "green"
}
}
]
},
{
"matcher": {
"id": "byFrameRefID",
"options": "C"
},
"properties": [
{
"id": "displayName",
"value": "Rejected apply p95"
},
{
"id": "color",
"value": {
"mode": "fixed",
"fixedColor": "red"
}
}
]
}
]
},
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"calcs": ["mean", "max", "count"]
"placement": "right",
"values": ["value", "percent"]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
"type": "prometheus"
},
"expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.round\"}[5m])))",
"legendFormat": "Round Total p95 [{{exported_instance}}]",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.round\"}[5m])))",
"legendFormat": "Round Total p50 [{{exported_instance}}]",
"refId": "D"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.accept.apply\", consensus_state=\"finished\"}[5m])))",
"legendFormat": "Accepted apply p95 [{{exported_instance}}]",
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.accept.apply\", consensus_state=~\"moved_on|expired\"}[5m])))",
"legendFormat": "Rejected apply p95 [{{exported_instance}}]",
"refId": "C"
"expr": "sum by (consensus_state) (increase(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.accept\", consensus_state!=\"\"}[5m]))",
"legendFormat": "{{consensus_state}}"
}
]
],
"fieldConfig": {
"defaults": {
"unit": "short"
},
"overrides": []
}
},
{
"title": "Consensus Failures Over Time",
"description": "Rate of non-normal consensus outcomes (moved_on + expired). Spikes indicate consensus instability.",
"type": "timeseries",
"gridPos": {
"h": 8,
"w": 16,
"x": 8,
"y": 64
},
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"targets": [
{
"datasource": {
"type": "prometheus"
},
"expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.accept\", consensus_state=\"moved_on\"}[5m]))",
"legendFormat": "moved_on [{{exported_instance}}]"
},
{
"datasource": {
"type": "prometheus"
},
"expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.accept\", consensus_state=\"expired\"}[5m]))",
"legendFormat": "expired [{{exported_instance}}]"
}
],
"fieldConfig": {
"defaults": {
"unit": "ops",
"custom": {
"axisLabel": "Failures / Sec",
"spanNulls": true,
"insertNulls": false,
"showPoints": "auto",
"pointSize": 3
}
},
"overrides": []
}
},
{
"title": "Consensus Outcome Distribution",
"description": "Distribution of consensus.accept outcomes: yes (normal), moved_on (without full agreement), expired (timeout). Non-yes outcomes indicate network stress.",
"type": "piechart",
"gridPos": {
"h": 8,
"w": 8,
"x": 0,
"y": 72
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right",
"values": ["value", "percent"]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus"
},
"expr": "sum by (consensus_state) (increase(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.accept\", consensus_state!=\"\"}[5m]))",
"legendFormat": "{{consensus_state}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "short"
},
"overrides": []
}
},
{
"title": "Consensus Failures Over Time",
"description": "Rate of non-normal consensus outcomes (moved_on + expired). Spikes indicate consensus instability.",
"type": "timeseries",
"gridPos": {
"h": 8,
"w": 16,
"x": 8,
"y": 72
},
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"targets": [
{
"datasource": {
"type": "prometheus"
},
"expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.accept\", consensus_state=\"moved_on\"}[5m]))",
"legendFormat": "moved_on [{{exported_instance}}]"
},
{
"datasource": {
"type": "prometheus"
},
"expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.accept\", consensus_state=\"expired\"}[5m]))",
"legendFormat": "expired [{{exported_instance}}]"
}
],
"fieldConfig": {
"defaults": {
"unit": "ops",
"custom": {
"axisLabel": "Failures / Sec",
"spanNulls": true,
"insertNulls": false,
"showPoints": "auto",
"pointSize": 3
}
},
"overrides": []
}
}
],
"schemaVersion": 39,
@@ -885,6 +936,5 @@
"to": "now"
},
"title": "Consensus Health",
"uid": "xrpld-consensus",
"refresh": "5s"
"uid": "xrpld-consensus"
}

View File

@@ -319,6 +319,96 @@
},
"overrides": []
}
},
{
"title": "RPC Resource Cost by Command",
"description": "RPC commands grouped by load_type (resource cost category). High-cost categories like exception_rpc or malformed_rpc indicate problematic clients.",
"type": "timeseries",
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 32
},
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "right",
"calcs": ["mean", "lastNotNull"]
}
},
"targets": [
{
"datasource": {
"type": "prometheus"
},
"expr": "sum by (load_type) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=~\"rpc.command.*\", load_type!=\"\"}[5m]))",
"legendFormat": "{{load_type}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "ops",
"custom": {
"axisLabel": "Requests / Sec",
"spanNulls": true,
"insertNulls": false,
"showPoints": "auto",
"pointSize": 3
}
},
"overrides": []
}
},
{
"title": "Batch vs Single RPC Requests",
"description": "Rate of batch RPC requests vs single requests. High batch rate may indicate bulk automation clients.",
"type": "timeseries",
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 32
},
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"targets": [
{
"datasource": {
"type": "prometheus"
},
"expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"rpc.process\", is_batch=\"true\"}[5m]))",
"legendFormat": "Batch [{{exported_instance}}]"
},
{
"datasource": {
"type": "prometheus"
},
"expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"rpc.process\", is_batch=\"false\"}[5m]))",
"legendFormat": "Single [{{exported_instance}}]"
}
],
"fieldConfig": {
"defaults": {
"unit": "ops",
"custom": {
"axisLabel": "Requests / Sec",
"spanNulls": true,
"insertNulls": false,
"showPoints": "auto",
"pointSize": 3
}
},
"overrides": []
}
}
],
"schemaVersion": 39,

View File

@@ -327,6 +327,174 @@
},
"overrides": []
}
},
{
"title": "Transaction Rate by Type",
"description": "Transaction processing rate broken down by tx_type (Payment, OfferCreate, AMMDeposit, etc.). Requires tx_type dimension in spanmetrics.",
"type": "timeseries",
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 32
},
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "right",
"calcs": ["mean", "lastNotNull"]
}
},
"targets": [
{
"datasource": {
"type": "prometheus"
},
"expr": "sum by (tx_type) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"tx.process\", tx_type!=\"\"}[5m]))",
"legendFormat": "{{tx_type}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "ops",
"custom": {
"axisLabel": "TX / Sec",
"spanNulls": true,
"insertNulls": false,
"showPoints": "auto",
"pointSize": 3
}
},
"overrides": []
}
},
{
"title": "Transaction Results by Type",
"description": "Transaction result codes (ter_result) broken down by tx_type. Shows which transaction types fail most often.",
"type": "timeseries",
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 32
},
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "right",
"calcs": ["mean", "lastNotNull"]
}
},
"targets": [
{
"datasource": {
"type": "prometheus"
},
"expr": "sum by (tx_type, ter_result) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"tx.process\", ter_result!=\"\", ter_result!=\"tesSUCCESS\"}[5m]))",
"legendFormat": "{{tx_type}}: {{ter_result}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "ops",
"custom": {
"axisLabel": "Failed TX / Sec",
"spanNulls": true,
"insertNulls": false,
"showPoints": "auto",
"pointSize": 3
}
},
"overrides": []
}
},
{
"title": "TxQ Accept Status",
"description": "TxQ accept outcomes: applied (included in ledger), failed (removed), retried (kept for next round).",
"type": "piechart",
"gridPos": {
"h": 8,
"w": 8,
"x": 0,
"y": 40
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right",
"values": ["value", "percent"]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus"
},
"expr": "sum by (txq_status) (increase(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"txq.accept_tx\", txq_status!=\"\"}[5m]))",
"legendFormat": "{{txq_status}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "short"
},
"overrides": []
}
},
{
"title": "Transactor Duration by Type (p95)",
"description": "Per-transactor execution time (tx.transactor span). Shows which transaction types are most expensive to execute.",
"type": "timeseries",
"gridPos": {
"h": 8,
"w": 16,
"x": 8,
"y": 40
},
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "right",
"calcs": ["mean", "max"]
}
},
"targets": [
{
"datasource": {
"type": "prometheus"
},
"expr": "histogram_quantile(0.95, sum by (le, tx_type) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"tx.transactor\", tx_type!=\"\"}[5m])))",
"legendFormat": "p95 {{tx_type}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "ms",
"custom": {
"axisLabel": "Duration (ms)",
"spanNulls": true,
"insertNulls": false,
"showPoints": "auto",
"pointSize": 3
}
},
"overrides": []
}
}
],
"schemaVersion": 39,

View File

@@ -92,6 +92,11 @@ connectors:
- name: suppressed
- name: proposal_trusted
- name: validation_trusted
- name: tx_type
- name: ter_result
- name: txq_status
- name: load_type
- name: is_batch
exporters:
debug: