mirror of
https://github.com/XRPLF/rippled.git
synced 2026-06-04 17:27:00 +00:00
Merge branch 'pratik/otel-phase7-native-metrics' into pratik/otel-phase8-log-correlation
This commit is contained in:
@@ -29,14 +29,14 @@
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", xrpl_consensus_mode=~\"$consensus_mode\", span_name=\"consensus.accept\"}[5m])))",
|
||||
"expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.accept\"}[5m])))",
|
||||
"legendFormat": "P95 Round Duration [{{exported_instance}}]"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", xrpl_consensus_mode=~\"$consensus_mode\", span_name=\"consensus.accept\"}[5m])))",
|
||||
"expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.accept\"}[5m])))",
|
||||
"legendFormat": "P50 Round Duration [{{exported_instance}}]"
|
||||
}
|
||||
],
|
||||
@@ -75,7 +75,7 @@
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_consensus_mode=~\"$consensus_mode\", span_name=\"consensus.proposal.send\"}[5m]))",
|
||||
"expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.proposal.send\"}[5m]))",
|
||||
"legendFormat": "Proposals / Sec [{{exported_instance}}]"
|
||||
}
|
||||
],
|
||||
@@ -114,7 +114,7 @@
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", xrpl_consensus_mode=~\"$consensus_mode\", span_name=\"consensus.ledger_close\"}[5m])))",
|
||||
"expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.ledger_close\"}[5m])))",
|
||||
"legendFormat": "P95 Close Duration [{{exported_instance}}]"
|
||||
}
|
||||
],
|
||||
@@ -153,7 +153,7 @@
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_consensus_mode=~\"$consensus_mode\", span_name=\"consensus.validation.send\"}[5m]))",
|
||||
"expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.validation.send\"}[5m]))",
|
||||
"legendFormat": "Validations / Sec [{{exported_instance}}]"
|
||||
}
|
||||
],
|
||||
@@ -179,14 +179,14 @@
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", xrpl_consensus_mode=~\"$consensus_mode\", span_name=\"consensus.accept.apply\"}[5m])))",
|
||||
"expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.accept.apply\"}[5m])))",
|
||||
"legendFormat": "P95 Apply Duration [{{exported_instance}}]"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", xrpl_consensus_mode=~\"$consensus_mode\", span_name=\"consensus.accept.apply\"}[5m])))",
|
||||
"expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.accept.apply\"}[5m])))",
|
||||
"legendFormat": "P50 Apply Duration [{{exported_instance}}]"
|
||||
}
|
||||
],
|
||||
@@ -219,7 +219,7 @@
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "sum by (close_time_correct, exported_instance) (rate(traces_span_metrics_calls_total{span_name=\"consensus.accept.apply\", xrpl_consensus_mode=~\"$consensus_mode\", exported_instance=~\"$node\"}[$__rate_interval]))",
|
||||
"expr": "sum by (close_time_correct, exported_instance) (rate(traces_span_metrics_calls_total{span_name=\"consensus.accept.apply\", consensus_mode=~\"$consensus_mode\", exported_instance=~\"$node\"}[$__rate_interval]))",
|
||||
"legendFormat": "Close Time Correct={{close_time_correct}} [{{exported_instance}}]"
|
||||
}
|
||||
],
|
||||
@@ -258,8 +258,8 @@
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "sum by (xrpl_consensus_mode, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_consensus_mode=~\"$consensus_mode\", span_name=\"consensus.ledger_close\"}[5m]))",
|
||||
"legendFormat": "{{xrpl_consensus_mode}} [{{exported_instance}}]"
|
||||
"expr": "sum by (consensus_mode, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.ledger_close\"}[5m]))",
|
||||
"legendFormat": "{{consensus_mode}} [{{exported_instance}}]"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
@@ -742,6 +742,229 @@
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Consensus Round Duration (Full Round)",
|
||||
"description": "p95/p50 duration of the full consensus round. The consensus.round span (RCLConsensus.cpp startRound) wraps an entire round end-to-end. Filterable by consensus mode. This is the single most important consensus-health signal; rising round time precedes ledger-age alarms.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 72
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.round\"}[5m])))",
|
||||
"legendFormat": "P95 Round [{{exported_instance}}]"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", consensus_mode=~\"$consensus_mode\", span_name=\"consensus.round\"}[5m])))",
|
||||
"legendFormat": "P50 Round [{{exported_instance}}]"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ms",
|
||||
"custom": {
|
||||
"axisLabel": "Duration (ms)",
|
||||
"spanNulls": true,
|
||||
"insertNulls": false,
|
||||
"showPoints": "auto",
|
||||
"pointSize": 3
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Consensus Phase Duration (Open vs Establish)",
|
||||
"description": "p95 duration of the open phase (transaction collection) vs the establish phase (proposal convergence). The consensus.phase.open and consensus.establish spans decompose round latency, so an operator can tell whether slowness is in collecting transactions or reaching agreement.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 72
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.phase.open\"}[5m])))",
|
||||
"legendFormat": "P95 Open Phase [{{exported_instance}}]"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.establish\"}[5m])))",
|
||||
"legendFormat": "P95 Establish Phase [{{exported_instance}}]"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ms",
|
||||
"custom": {
|
||||
"axisLabel": "Duration (ms)",
|
||||
"spanNulls": true,
|
||||
"insertNulls": false,
|
||||
"showPoints": "auto",
|
||||
"pointSize": 3
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Position Update Duration",
|
||||
"description": "p95/p50 duration of the consensus.update_positions span, which tallies disputes and updates this node's position each round. Long durations indicate heavy dispute resolution or slow convergence on close time.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 80
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.update_positions\"}[5m])))",
|
||||
"legendFormat": "P95 Update [{{exported_instance}}]"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "histogram_quantile(0.5, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.update_positions\"}[5m])))",
|
||||
"legendFormat": "P50 Update [{{exported_instance}}]"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ms",
|
||||
"custom": {
|
||||
"axisLabel": "Duration (ms)",
|
||||
"spanNulls": true,
|
||||
"insertNulls": false,
|
||||
"showPoints": "auto",
|
||||
"pointSize": 3
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Consensus Stall Rate",
|
||||
"description": "Rate of consensus.check spans reporting consensus_stalled=true, broken down by stall flag. A non-zero stalled rate surfaces stall conditions before they manifest as validated-ledger-age alarms. Requires the consensus_stalled spanmetrics dimension.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 80
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.check\", consensus_stalled=\"true\"}[5m]))",
|
||||
"legendFormat": "Stalled [{{exported_instance}}]"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.check\", consensus_stalled=\"false\"}[5m]))",
|
||||
"legendFormat": "Not Stalled [{{exported_instance}}]"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"custom": {
|
||||
"axisLabel": "Checks / Sec",
|
||||
"spanNulls": true,
|
||||
"insertNulls": false,
|
||||
"showPoints": "auto",
|
||||
"pointSize": 3
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Consensus Mode-Change Rate by Target Mode",
|
||||
"description": "Rate of consensus.mode_change spans broken down by the mode the node switched INTO (mode_new). Frequent switches into Wrong Ledger or Switched Ledger indicate an unstable node at fork risk. Requires the mode_new spanmetrics dimension.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 88
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "sum by (mode_new, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.mode_change\"}[5m]))",
|
||||
"legendFormat": "{{mode_new}} [{{exported_instance}}]"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"custom": {
|
||||
"axisLabel": "Mode Changes / Sec",
|
||||
"spanNulls": true,
|
||||
"insertNulls": false,
|
||||
"showPoints": "auto",
|
||||
"pointSize": 3
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
@@ -773,7 +996,7 @@
|
||||
"label": "Consensus Mode",
|
||||
"description": "Filter by consensus mode (Proposing, Observing, Wrong Ledger, Switched Ledger)",
|
||||
"type": "query",
|
||||
"query": "label_values(traces_span_metrics_calls_total{span_name=\"consensus.ledger_close\"}, xrpl_consensus_mode)",
|
||||
"query": "label_values(traces_span_metrics_calls_total{span_name=\"consensus.ledger_close\"}, consensus_mode)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
|
||||
@@ -380,6 +380,215 @@
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "gRPC Request Rate by Method (Spans)",
|
||||
"description": "Per-method gRPC call rate derived from the grpc.{Method} spans (GRPCServer.cpp). Covers the gRPC API used by reporting/Clio. Populated only when the node serves gRPC traffic.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 32
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "sum by (method, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", method=~\"$grpc_method\", span_name=~\"grpc\\\\..*\"}[5m]))",
|
||||
"legendFormat": "{{method}} [{{exported_instance}}]"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"custom": {
|
||||
"axisLabel": "Calls / Sec",
|
||||
"spanNulls": true,
|
||||
"insertNulls": false,
|
||||
"showPoints": "auto",
|
||||
"pointSize": 3
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "gRPC Latency P95 by Method (Spans)",
|
||||
"description": "p95 latency per gRPC method from grpc.{Method} span durations. Identifies slow gRPC read paths.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 32
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "histogram_quantile(0.95, sum by (le, method, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", method=~\"$grpc_method\", span_name=~\"grpc\\\\..*\"}[5m])))",
|
||||
"legendFormat": "{{method}} [{{exported_instance}}]"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ms",
|
||||
"custom": {
|
||||
"axisLabel": "Duration (ms)",
|
||||
"spanNulls": true,
|
||||
"insertNulls": false,
|
||||
"showPoints": "auto",
|
||||
"pointSize": 3
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "gRPC Error Rate by Status (Spans)",
|
||||
"description": "Rate of gRPC spans broken down by grpc_status (success/error/resource_exhausted/failed_precondition). A rising error or resource_exhausted rate indicates gRPC clients hitting limits.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 40
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "sum by (grpc_status, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=~\"grpc\\\\..*\", grpc_status!=\"\"}[5m]))",
|
||||
"legendFormat": "{{grpc_status}} [{{exported_instance}}]"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"custom": {
|
||||
"axisLabel": "Calls / Sec",
|
||||
"spanNulls": true,
|
||||
"insertNulls": false,
|
||||
"showPoints": "auto",
|
||||
"pointSize": 3
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Pathfinding Compute Duration (Spans)",
|
||||
"description": "p95/p50 of the pathfind.compute span, the per-request path computation. Complements the StatsD pathfind_fast/full timers with span-level visibility. Populated under pathfinding (book/path) RPC load.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 40
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"pathfind.compute\"}[5m])))",
|
||||
"legendFormat": "P95 Compute [{{exported_instance}}]"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"pathfind.compute\"}[5m])))",
|
||||
"legendFormat": "P50 Compute [{{exported_instance}}]"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ms",
|
||||
"custom": {
|
||||
"axisLabel": "Duration (ms)",
|
||||
"spanNulls": true,
|
||||
"insertNulls": false,
|
||||
"showPoints": "auto",
|
||||
"pointSize": 3
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Pathfinding Request & Discovery Rate (Spans)",
|
||||
"description": "Rate of pathfind.request (client path requests) and pathfind.discover (path-discovery passes) spans. Shows pathfinding demand and the discovery cost driver for subscription-heavy nodes.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 48
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"pathfind.request\"}[5m]))",
|
||||
"legendFormat": "Requests / Sec [{{exported_instance}}]"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"pathfind.discover\"}[5m]))",
|
||||
"legendFormat": "Discoveries / Sec [{{exported_instance}}]"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"custom": {
|
||||
"axisLabel": "Operations / Sec",
|
||||
"spanNulls": true,
|
||||
"insertNulls": false,
|
||||
"showPoints": "auto",
|
||||
"pointSize": 3
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
@@ -405,6 +614,26 @@
|
||||
"multi": true,
|
||||
"refresh": 2,
|
||||
"sort": 1
|
||||
},
|
||||
{
|
||||
"name": "grpc_method",
|
||||
"label": "gRPC Method",
|
||||
"description": "Filter by gRPC method (GetLedger, GetLedgerData, GetLedgerDiff, GetLedgerEntry)",
|
||||
"type": "query",
|
||||
"query": "label_values(traces_span_metrics_calls_total{span_name=~\"grpc\\\\..*\"}, method)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"includeAll": true,
|
||||
"allValue": ".*",
|
||||
"current": {
|
||||
"text": "All",
|
||||
"value": "$__all"
|
||||
},
|
||||
"multi": true,
|
||||
"refresh": 2,
|
||||
"sort": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
@@ -506,6 +506,169 @@
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "TxQ Enqueue Rate by Transaction Type",
|
||||
"description": "Rate of txq.enqueue spans broken down by transaction type (tx_type). Shows what share of inbound demand is Payment vs OfferCreate vs other transactors, and how the mix shifts as the queue fills. A spam burst of one type is a leading indicator of fee escalation.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 48
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "sum by (tx_type, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"txq.enqueue\"}[5m]))",
|
||||
"legendFormat": "{{tx_type}} [{{exported_instance}}]"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"custom": {
|
||||
"axisLabel": "Enqueues / Sec",
|
||||
"spanNulls": true,
|
||||
"insertNulls": false,
|
||||
"showPoints": "auto",
|
||||
"pointSize": 3
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Queue Bypass Ratio (Direct Apply vs Enqueue)",
|
||||
"description": "Ratio of transactions that applied directly to the open ledger (txq.apply_direct) versus those that had to be queued (txq.enqueue). A falling bypass ratio is the cleanest single signal the network has entered sustained fee escalation.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 48
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"txq.apply_direct\"}[5m])) / clamp_min(sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"txq.apply_direct\"}[5m])) + sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"txq.enqueue\"}[5m])), 1)",
|
||||
"legendFormat": "Direct-Apply Fraction [{{exported_instance}}]"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percentunit",
|
||||
"custom": {
|
||||
"axisLabel": "Bypass Fraction",
|
||||
"spanNulls": true,
|
||||
"insertNulls": false,
|
||||
"showPoints": "auto",
|
||||
"pointSize": 3
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Queue Accept (Drain) Duration per Ledger",
|
||||
"description": "p95/p50 duration of the txq.accept span, which drains queued transactions into a newly closed ledger. Rising drain time signals queue pressure at ledger close.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 56
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"txq.accept\"}[5m])))",
|
||||
"legendFormat": "P95 Drain [{{exported_instance}}]"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"txq.accept\"}[5m])))",
|
||||
"legendFormat": "P50 Drain [{{exported_instance}}]"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ms",
|
||||
"custom": {
|
||||
"axisLabel": "Duration (ms)",
|
||||
"spanNulls": true,
|
||||
"insertNulls": false,
|
||||
"showPoints": "auto",
|
||||
"pointSize": 3
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Queue Cleanup Rate (Expired Entries)",
|
||||
"description": "Rate of txq.cleanup spans, which remove expired transactions from the queue each ledger. A rising rate means submitters under-bid the escalating fee and abandoned their transactions \u2014 a demand-frustration signal distinct from acceptance throughput.",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 56
|
||||
},
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"txq.cleanup\"}[5m]))",
|
||||
"legendFormat": "Cleanups / Sec [{{exported_instance}}]"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"custom": {
|
||||
"axisLabel": "Cleanups / Sec",
|
||||
"spanNulls": true,
|
||||
"insertNulls": false,
|
||||
"showPoints": "auto",
|
||||
"pointSize": 3
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
|
||||
@@ -81,7 +81,7 @@ connectors:
|
||||
dimensions:
|
||||
- name: command
|
||||
- name: rpc_status
|
||||
- name: xrpl.consensus.mode
|
||||
- name: consensus_mode
|
||||
- name: close_time_correct
|
||||
- name: local
|
||||
- name: suppressed
|
||||
@@ -93,6 +93,15 @@ connectors:
|
||||
- name: consensus_state
|
||||
- name: load_type
|
||||
- name: is_batch
|
||||
# Consensus lifecycle dimensions (low cardinality, bounded value sets).
|
||||
- name: mode_new
|
||||
- name: consensus_stalled
|
||||
- name: consensus_phase
|
||||
- name: consensus_result
|
||||
# gRPC surface dimensions (bounded: method names, role, status).
|
||||
- name: method
|
||||
- name: grpc_role
|
||||
- name: grpc_status
|
||||
|
||||
exporters:
|
||||
debug:
|
||||
|
||||
@@ -75,47 +75,47 @@ All spans instrumented in xrpld, grouped by subsystem:
|
||||
|
||||
### Transaction Spans (Phase 3)
|
||||
|
||||
| Span Name | Source File | Attributes | Description |
|
||||
| ------------ | --------------- | -------------------------------------------------------------------------------------- | ------------------------------------- |
|
||||
| `tx.process` | NetworkOPs.cpp | `xrpl.tx.hash`, `local`, `path`, `tx_type`, `fee`, `sequence`, `ter_result`, `applied` | Transaction submission and processing |
|
||||
| `tx.receive` | PeerImp.cpp | `xrpl.peer.id`, `xrpl.tx.hash`, `tx_type`, `peer_version`, `suppressed`, `tx_status` | Transaction received from peer relay |
|
||||
| `tx.apply` | BuildLedger.cpp | `xrpl.ledger.seq`, `tx_count`, `tx_failed` | Transaction set applied per ledger |
|
||||
| Span Name | Source File | Attributes | Description |
|
||||
| ------------ | --------------- | --------------------------------------------------------------------------------- | ------------------------------------- |
|
||||
| `tx.process` | NetworkOPs.cpp | `tx_hash`, `local`, `path`, `tx_type`, `fee`, `sequence`, `ter_result`, `applied` | Transaction submission and processing |
|
||||
| `tx.receive` | PeerImp.cpp | `peer_id`, `tx_hash`, `tx_type`, `peer_version`, `suppressed`, `tx_status` | Transaction received from peer relay |
|
||||
| `tx.apply` | BuildLedger.cpp | `ledger_seq`, `tx_count`, `tx_failed` | Transaction set applied per ledger |
|
||||
|
||||
### Transaction Queue Spans (Phase 3)
|
||||
|
||||
| Span Name | Source File | Attributes | Description |
|
||||
| ------------------ | ----------- | ------------------------------------------------------------- | -------------------------------------------------- |
|
||||
| `txq.enqueue` | TxQ.cpp | `xrpl.tx.hash`, `tx_type` | Transaction enqueue decision (child of tx.process) |
|
||||
| `txq.apply_direct` | TxQ.cpp | -- | Direct apply attempt (bypassing queue) |
|
||||
| `txq.batch_clear` | TxQ.cpp | -- | Batch clear of queued transactions for an account |
|
||||
| `txq.accept` | TxQ.cpp | `queue_size`, `ledger_changed` | Ledger-close accept loop over queued transactions |
|
||||
| `txq.accept_tx` | TxQ.cpp | `xrpl.tx.hash`, `retries_remaining`, `ter_code`, `txq_status` | Per-transaction apply during accept |
|
||||
| `txq.cleanup` | TxQ.cpp | `xrpl.ledger.seq` | Post-close cleanup of expired queue entries |
|
||||
| Span Name | Source File | Attributes | Description |
|
||||
| ------------------ | ----------- | -------------------------------------------------------- | -------------------------------------------------- |
|
||||
| `txq.enqueue` | TxQ.cpp | `tx_hash`, `tx_type` | Transaction enqueue decision (child of tx.process) |
|
||||
| `txq.apply_direct` | TxQ.cpp | -- | Direct apply attempt (bypassing queue) |
|
||||
| `txq.batch_clear` | TxQ.cpp | -- | Batch clear of queued transactions for an account |
|
||||
| `txq.accept` | TxQ.cpp | `queue_size`, `ledger_changed` | Ledger-close accept loop over queued transactions |
|
||||
| `txq.accept_tx` | TxQ.cpp | `tx_hash`, `retries_remaining`, `ter_code`, `txq_status` | Per-transaction apply during accept |
|
||||
| `txq.cleanup` | TxQ.cpp | `ledger_seq` | Post-close cleanup of expired queue entries |
|
||||
|
||||
### Consensus Spans (Phase 4)
|
||||
|
||||
| Span Name | Source File | Attributes | Description |
|
||||
| ------------------------------ | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `consensus.round` | RCLConsensus.cpp | `xrpl.consensus.ledger_id`, `xrpl.ledger.seq`, `xrpl.consensus.mode`, `trace_strategy`, `xrpl.consensus.round_id` | Root span for a consensus round (deterministic or random trace ID) |
|
||||
| `consensus.phase.open` | Consensus.h | -- | Open phase duration (child of round) |
|
||||
| `consensus.proposal.send` | RCLConsensus.cpp | `xrpl.consensus.round`, `is_bow_out` | Consensus proposal broadcast |
|
||||
| `consensus.ledger_close` | RCLConsensus.cpp | `xrpl.ledger.seq`, `xrpl.consensus.mode` | Ledger close event |
|
||||
| `consensus.establish` | Consensus.h | `converge_percent`, `establish_count`, `proposers` | Establish phase duration (child of round) |
|
||||
| `consensus.update_positions` | Consensus.h | `converge_percent`, `proposers`, `disputes_count` | Position update and dispute resolution (see Events below) |
|
||||
| `consensus.check` | Consensus.h | `agree_count`, `disagree_count`, `converge_percent`, `have_close_time_consensus`, `threshold_percent`, `consensus_result` | Consensus threshold check |
|
||||
| `consensus.accept` | RCLConsensus.cpp | `proposers`, `round_time_ms`, `quorum`, `disputes_count`, `consensus_state` | Ledger accepted by consensus |
|
||||
| `consensus.accept.apply` | RCLConsensus.cpp | `xrpl.ledger.seq`, `close_time`, `close_time_correct`, `close_resolution_ms`, `consensus_state`, `proposing`, `round_time_ms`, `parent_close_time`, `close_time_self`, `close_time_vote_bins`, `resolution_direction`, `tx_count` | Ledger application with close time details (see Events below) |
|
||||
| `consensus.validation.send` | RCLConsensus.cpp | `xrpl.ledger.seq`, `proposing`, `ledger_hash`, `full_validation`, `validation_sign_time` | Validation sent after accept (follows-from link) |
|
||||
| `consensus.mode_change` | RCLConsensus.cpp | `mode_old`, `mode_new` | Consensus mode transition |
|
||||
| `consensus.proposal.receive` | PeerImp.cpp | `trusted`, `xrpl.consensus.round` | Proposal received from peer (extracts parent context from TraceContext when present; falls back to standalone span for older peers) |
|
||||
| `consensus.validation.receive` | PeerImp.cpp | `trusted`, `xrpl.ledger.seq` | Validation received from peer (extracts parent context from TraceContext when present; falls back to standalone span for older peers) |
|
||||
| Span Name | Source File | Attributes | Description |
|
||||
| ------------------------------ | ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `consensus.round` | RCLConsensus.cpp | `consensus_ledger_id`, `ledger_seq`, `consensus_mode`, `trace_strategy`, `consensus_round_id` | Root span for a consensus round (deterministic or random trace ID) |
|
||||
| `consensus.phase.open` | Consensus.h | -- | Open phase duration (child of round) |
|
||||
| `consensus.proposal.send` | RCLConsensus.cpp | `consensus_round`, `is_bow_out` | Consensus proposal broadcast |
|
||||
| `consensus.ledger_close` | RCLConsensus.cpp | `ledger_seq`, `consensus_mode` | Ledger close event |
|
||||
| `consensus.establish` | Consensus.h | `converge_percent`, `establish_count`, `proposers` | Establish phase duration (child of round) |
|
||||
| `consensus.update_positions` | Consensus.h | `converge_percent`, `proposers`, `disputes_count` | Position update and dispute resolution (see Events below) |
|
||||
| `consensus.check` | Consensus.h | `agree_count`, `disagree_count`, `converge_percent`, `have_close_time_consensus`, `threshold_percent`, `consensus_result` | Consensus threshold check |
|
||||
| `consensus.accept` | RCLConsensus.cpp | `proposers`, `round_time_ms`, `quorum`, `disputes_count`, `consensus_state` | Ledger accepted by consensus |
|
||||
| `consensus.accept.apply` | RCLConsensus.cpp | `ledger_seq`, `close_time`, `close_time_correct`, `close_resolution_ms`, `consensus_state`, `proposing`, `round_time_ms`, `parent_close_time`, `close_time_self`, `close_time_vote_bins`, `resolution_direction`, `tx_count` | Ledger application with close time details (see Events below) |
|
||||
| `consensus.validation.send` | RCLConsensus.cpp | `ledger_seq`, `proposing`, `ledger_hash`, `full_validation`, `validation_sign_time` | Validation sent after accept (follows-from link) |
|
||||
| `consensus.mode_change` | RCLConsensus.cpp | `mode_old`, `mode_new` | Consensus mode transition |
|
||||
| `consensus.proposal.receive` | PeerImp.cpp | `trusted`, `consensus_round` | Proposal received from peer (extracts parent context from TraceContext when present; falls back to standalone span for older peers) |
|
||||
| `consensus.validation.receive` | PeerImp.cpp | `trusted`, `ledger_seq` | Validation received from peer (extracts parent context from TraceContext when present; falls back to standalone span for older peers) |
|
||||
|
||||
#### Consensus Span Events
|
||||
|
||||
| Parent Span | Event Name | Event Attributes | Description |
|
||||
| ---------------------------- | ----------------- | ---------------------------------------------------------------- | ------------------------------------------------------- |
|
||||
| `consensus.update_positions` | `dispute.resolve` | `xrpl.tx.id`, `dispute_our_vote`, `dispute_yays`, `dispute_nays` | Emitted per dispute when votes are tallied |
|
||||
| `consensus.accept.apply` | `tx.included` | `xrpl.tx.id` | Emitted per transaction included in the accepted ledger |
|
||||
| Parent Span | Event Name | Event Attributes | Description |
|
||||
| ---------------------------- | ----------------- | ----------------------------------------------------------- | ------------------------------------------------------- |
|
||||
| `consensus.update_positions` | `dispute.resolve` | `tx_id`, `dispute_our_vote`, `dispute_yays`, `dispute_nays` | Emitted per dispute when votes are tallied |
|
||||
| `consensus.accept.apply` | `tx.included` | `tx_id` | Emitted per transaction included in the accepted ledger |
|
||||
|
||||
#### Close Time Queries (Tempo TraceQL)
|
||||
|
||||
@@ -130,10 +130,10 @@ All spans instrumented in xrpld, grouped by subsystem:
|
||||
{name="consensus.accept.apply"} | duration > 5s
|
||||
|
||||
# Find specific ledger's consensus details
|
||||
{name="consensus.accept.apply"} | xrpl.ledger.seq = 92345678
|
||||
{name="consensus.accept.apply"} | ledger_seq = 92345678
|
||||
|
||||
# Find all spans in a consensus round (deterministic trace strategy)
|
||||
{name="consensus.round"} | xrpl.consensus.round_id = "<round_id>"
|
||||
{name="consensus.round"} | consensus_round_id = <round_id>
|
||||
|
||||
# Find dispute resolutions
|
||||
{name="consensus.update_positions"} >> {event:name="dispute.resolve"}
|
||||
@@ -141,18 +141,18 @@ All spans instrumented in xrpld, grouped by subsystem:
|
||||
|
||||
### Ledger Spans (Phase 6)
|
||||
|
||||
| Span Name | Source File | Attributes | Description |
|
||||
| ----------------- | -------------------- | ------------------------------------------ | ----------------------------- |
|
||||
| `ledger.build` | BuildLedger.cpp:31 | `xrpl.ledger.seq`, `tx_count`, `tx_failed` | Ledger build during consensus |
|
||||
| `ledger.validate` | LedgerMaster.cpp:915 | `xrpl.ledger.seq`, `validations` | Ledger promoted to validated |
|
||||
| `ledger.store` | LedgerMaster.cpp:409 | `xrpl.ledger.seq` | Ledger stored in history |
|
||||
| Span Name | Source File | Attributes | Description |
|
||||
| ----------------- | -------------------- | ------------------------------------- | ----------------------------- |
|
||||
| `ledger.build` | BuildLedger.cpp:31 | `ledger_seq`, `tx_count`, `tx_failed` | Ledger build during consensus |
|
||||
| `ledger.validate` | LedgerMaster.cpp:915 | `ledger_seq`, `validations` | Ledger promoted to validated |
|
||||
| `ledger.store` | LedgerMaster.cpp:409 | `ledger_seq` | Ledger stored in history |
|
||||
|
||||
### Peer Spans (Phase 6)
|
||||
|
||||
| Span Name | Source File | Attributes | Description |
|
||||
| ------------------------- | ---------------- | ------------------------------------ | ----------------------------- |
|
||||
| `peer.proposal.receive` | PeerImp.cpp:1667 | `xrpl.peer.id`, `proposal_trusted` | Proposal received from peer |
|
||||
| `peer.validation.receive` | PeerImp.cpp:2264 | `xrpl.peer.id`, `validation_trusted` | Validation received from peer |
|
||||
| Span Name | Source File | Attributes | Description |
|
||||
| ------------------------- | ---------------- | ------------------------------- | ----------------------------- |
|
||||
| `peer.proposal.receive` | PeerImp.cpp:1667 | `peer_id`, `proposal_trusted` | Proposal received from peer |
|
||||
| `peer.validation.receive` | PeerImp.cpp:2264 | `peer_id`, `validation_trusted` | Validation received from peer |
|
||||
|
||||
---
|
||||
|
||||
@@ -210,7 +210,7 @@ This section shows what questions you can answer using the span attributes, with
|
||||
{name="rpc.http_request"} | request_payload_size > 100000
|
||||
|
||||
# Find resource-heavy RPC commands (by load_type)
|
||||
{name=~"rpc.command.*"} | load_type = "exception_rpc"
|
||||
{name=~"rpc.command.*"} | load_type = "exceptioned RPC"
|
||||
|
||||
# Find a specific WebSocket command
|
||||
{name="rpc.ws_message"} | command = "subscribe"
|
||||
@@ -356,10 +356,10 @@ all its normal attributes, it just lacks a cross-node parent link.
|
||||
{name="consensus.proposal.receive"} && nestedSetParent > 0
|
||||
|
||||
# Trace a transaction across the network by its hash
|
||||
{name=~"tx\\..*"} | xrpl.tx.hash = "<hash>"
|
||||
{name=~"tx\\..*"} | tx_hash = "<hash>"
|
||||
|
||||
# Find all spans in a cross-node consensus trace
|
||||
{rootServiceName="xrpld"} | xrpl.consensus.round_id = "<round_id>"
|
||||
{rootServiceName="xrpld"} | consensus_round_id = <round_id>
|
||||
|
||||
# Compare latency between sender and receiver for validations
|
||||
{name="consensus.validation.send" || name="consensus.validation.receive"}
|
||||
@@ -389,16 +389,16 @@ Every metric carries these standard labels:
|
||||
| `service_name` | Resource attribute | `xrpld` |
|
||||
| `span_kind` | Span kind | `SPAN_KIND_INTERNAL` |
|
||||
|
||||
Additionally, span attributes configured as dimensions in the collector become metric labels (dots → underscores):
|
||||
Additionally, span attributes configured as dimensions in the collector become metric labels. The collector dimensions use the bare attribute keys emitted by the code, so the label name equals the attribute name:
|
||||
|
||||
| Span Attribute | Metric Label | Applies To |
|
||||
| --------------------- | ------------------------------ | ------------------------------- |
|
||||
| `command` | `xrpl_rpc_command` | `rpc.command.*` spans |
|
||||
| `rpc_status` | `xrpl_rpc_status` | `rpc.command.*` spans |
|
||||
| `xrpl.consensus.mode` | `xrpl_consensus_mode` | `consensus.ledger_close` spans |
|
||||
| `local` | `xrpl_tx_local` | `tx.process` spans |
|
||||
| `proposal_trusted` | `xrpl_peer_proposal_trusted` | `peer.proposal.receive` spans |
|
||||
| `validation_trusted` | `xrpl_peer_validation_trusted` | `peer.validation.receive` spans |
|
||||
| Span Attribute | Metric Label | Applies To |
|
||||
| -------------------- | -------------------- | ------------------------------- |
|
||||
| `command` | `command` | `rpc.command.*` spans |
|
||||
| `rpc_status` | `rpc_status` | `rpc.command.*` spans |
|
||||
| `consensus_mode` | `consensus_mode` | `consensus.ledger_close` spans |
|
||||
| `local` | `local` | `tx.process` spans |
|
||||
| `proposal_trusted` | `proposal_trusted` | `peer.proposal.receive` spans |
|
||||
| `validation_trusted` | `validation_trusted` | `peer.validation.receive` spans |
|
||||
|
||||
### Histogram Buckets
|
||||
|
||||
@@ -470,44 +470,44 @@ Ten dashboards are pre-provisioned in `docker/telemetry/grafana/dashboards/`:
|
||||
|
||||
### RPC Performance (`xrpld-rpc-perf`)
|
||||
|
||||
| Panel | Type | PromQL | Labels Used |
|
||||
| --------------------------- | ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------- |
|
||||
| RPC Request Rate by Command | timeseries | `sum by (xrpl_rpc_command) (rate(traces_span_metrics_calls_total{span_name=~"rpc.command.*"}[5m]))` | `xrpl_rpc_command` |
|
||||
| RPC Latency p95 by Command | timeseries | `histogram_quantile(0.95, sum by (le, xrpl_rpc_command) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=~"rpc.command.*"}[5m])))` | `xrpl_rpc_command` |
|
||||
| RPC Error Rate | bargauge | Error spans / total spans × 100, grouped by `xrpl_rpc_command` | `xrpl_rpc_command`, `status_code` |
|
||||
| RPC Latency Heatmap | heatmap | `sum(increase(traces_span_metrics_duration_milliseconds_bucket{span_name=~"rpc.command.*"}[5m])) by (le)` | `le` (bucket boundaries) |
|
||||
| Overall RPC Throughput | timeseries | `rpc.request` + `rpc.process` rate | — |
|
||||
| RPC Success vs Error | timeseries | by `status_code` (UNSET vs ERROR) | `status_code` |
|
||||
| Top Commands by Volume | bargauge | `topk(10, ...)` by `xrpl_rpc_command` | `xrpl_rpc_command` |
|
||||
| WebSocket Message Rate | stat | `rpc.ws_message` rate | — |
|
||||
| Panel | Type | PromQL | Labels Used |
|
||||
| --------------------------- | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------- | ------------------------ |
|
||||
| RPC Request Rate by Command | timeseries | `sum by (command) (rate(traces_span_metrics_calls_total{span_name=~"rpc.command.*"}[5m]))` | `command` |
|
||||
| RPC Latency p95 by Command | timeseries | `histogram_quantile(0.95, sum by (le, command) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=~"rpc.command.*"}[5m])))` | `command` |
|
||||
| RPC Error Rate | bargauge | Error spans / total spans × 100, grouped by `command` | `command`, `status_code` |
|
||||
| RPC Latency Heatmap | heatmap | `sum(increase(traces_span_metrics_duration_milliseconds_bucket{span_name=~"rpc.command.*"}[5m])) by (le)` | `le` (bucket boundaries) |
|
||||
| Overall RPC Throughput | timeseries | `rpc.request` + `rpc.process` rate | — |
|
||||
| RPC Success vs Error | timeseries | by `status_code` (UNSET vs ERROR) | `status_code` |
|
||||
| Top Commands by Volume | bargauge | `topk(10, ...)` by `command` | `command` |
|
||||
| WebSocket Message Rate | stat | `rpc.ws_message` rate | — |
|
||||
|
||||
### Transaction Overview (`xrpld-transactions`)
|
||||
|
||||
| Panel | Type | PromQL | Labels Used |
|
||||
| --------------------------------- | ---------- | -------------------------------------------------------------------------------------------- | --------------- |
|
||||
| Transaction Processing Rate | timeseries | `rate(traces_span_metrics_calls_total{span_name="tx.process"}[5m])` and `tx.receive` | `span_name` |
|
||||
| Transaction Processing Latency | timeseries | `histogram_quantile(0.95 / 0.50, ... {span_name="tx.process"})` | — |
|
||||
| Transaction Path Distribution | piechart | `sum by (xrpl_tx_local) (rate(traces_span_metrics_calls_total{span_name="tx.process"}[5m]))` | `xrpl_tx_local` |
|
||||
| Transaction Receive vs Suppressed | timeseries | `rate(traces_span_metrics_calls_total{span_name="tx.receive"}[5m])` | — |
|
||||
| TX Processing Duration Heatmap | heatmap | `tx.process` histogram buckets | `le` |
|
||||
| TX Apply Duration per Ledger | timeseries | p95/p50 of `tx.apply` | — |
|
||||
| Peer TX Receive Rate | timeseries | `tx.receive` rate | — |
|
||||
| TX Apply Failed Rate | stat | `tx.apply` with `STATUS_CODE_ERROR` | `status_code` |
|
||||
| Panel | Type | PromQL | Labels Used |
|
||||
| --------------------------------- | ---------- | ------------------------------------------------------------------------------------ | ------------- |
|
||||
| Transaction Processing Rate | timeseries | `rate(traces_span_metrics_calls_total{span_name="tx.process"}[5m])` and `tx.receive` | `span_name` |
|
||||
| Transaction Processing Latency | timeseries | `histogram_quantile(0.95 / 0.50, ... {span_name="tx.process"})` | — |
|
||||
| Transaction Path Distribution | piechart | `sum by (local) (rate(traces_span_metrics_calls_total{span_name="tx.process"}[5m]))` | `local` |
|
||||
| Transaction Receive vs Suppressed | timeseries | `rate(traces_span_metrics_calls_total{span_name="tx.receive"}[5m])` | — |
|
||||
| TX Processing Duration Heatmap | heatmap | `tx.process` histogram buckets | `le` |
|
||||
| TX Apply Duration per Ledger | timeseries | p95/p50 of `tx.apply` | — |
|
||||
| Peer TX Receive Rate | timeseries | `tx.receive` rate | — |
|
||||
| TX Apply Failed Rate | stat | `tx.apply` with `STATUS_CODE_ERROR` | `status_code` |
|
||||
|
||||
### Consensus Health (`xrpld-consensus`)
|
||||
|
||||
| Panel | Type | PromQL | Labels Used |
|
||||
| ----------------------------- | ---------- | ---------------------------------------------------------------------------------- | --------------------- |
|
||||
| Consensus Round Duration | timeseries | `histogram_quantile(0.95 / 0.50, ... {span_name="consensus.accept"})` | — |
|
||||
| Consensus Proposals Sent Rate | timeseries | `rate(traces_span_metrics_calls_total{span_name="consensus.proposal.send"}[5m])` | — |
|
||||
| Ledger Close Duration | timeseries | `histogram_quantile(0.95, ... {span_name="consensus.ledger_close"})` | — |
|
||||
| Validation Send Rate | stat | `rate(traces_span_metrics_calls_total{span_name="consensus.validation.send"}[5m])` | — |
|
||||
| Ledger Apply Duration | timeseries | `histogram_quantile(0.95 / 0.50, ... {span_name="consensus.accept.apply"})` | — |
|
||||
| Close Time Agreement | timeseries | `rate(traces_span_metrics_calls_total{span_name="consensus.accept.apply"}[5m])` | — |
|
||||
| Consensus Mode Over Time | timeseries | `consensus.ledger_close` by `xrpl_consensus_mode` | `xrpl_consensus_mode` |
|
||||
| Accept vs Close Rate | timeseries | `consensus.accept` vs `consensus.ledger_close` rate | — |
|
||||
| Validation vs Close Rate | timeseries | `consensus.validation.send` vs `consensus.ledger_close` | — |
|
||||
| Accept Duration Heatmap | heatmap | `consensus.accept` histogram buckets | `le` |
|
||||
| Panel | Type | PromQL | Labels Used |
|
||||
| ----------------------------- | ---------- | ---------------------------------------------------------------------------------- | ---------------- |
|
||||
| Consensus Round Duration | timeseries | `histogram_quantile(0.95 / 0.50, ... {span_name="consensus.accept"})` | — |
|
||||
| Consensus Proposals Sent Rate | timeseries | `rate(traces_span_metrics_calls_total{span_name="consensus.proposal.send"}[5m])` | — |
|
||||
| Ledger Close Duration | timeseries | `histogram_quantile(0.95, ... {span_name="consensus.ledger_close"})` | — |
|
||||
| Validation Send Rate | stat | `rate(traces_span_metrics_calls_total{span_name="consensus.validation.send"}[5m])` | — |
|
||||
| Ledger Apply Duration | timeseries | `histogram_quantile(0.95 / 0.50, ... {span_name="consensus.accept.apply"})` | — |
|
||||
| Close Time Agreement | timeseries | `rate(traces_span_metrics_calls_total{span_name="consensus.accept.apply"}[5m])` | — |
|
||||
| Consensus Mode Over Time | timeseries | `consensus.ledger_close` by `consensus_mode` | `consensus_mode` |
|
||||
| Accept vs Close Rate | timeseries | `consensus.accept` vs `consensus.ledger_close` rate | — |
|
||||
| Validation vs Close Rate | timeseries | `consensus.validation.send` vs `consensus.ledger_close` | — |
|
||||
| Accept Duration Heatmap | heatmap | `consensus.accept` histogram buckets | `le` |
|
||||
|
||||
### Ledger Operations (`xrpld-ledger-ops`)
|
||||
|
||||
@@ -526,12 +526,12 @@ Ten dashboards are pre-provisioned in `docker/telemetry/grafana/dashboards/`:
|
||||
|
||||
Requires `trace_peer=1` in the `[telemetry]` config section.
|
||||
|
||||
| Panel | Type | PromQL | Labels Used |
|
||||
| -------------------------------- | ---------- | --------------------------------- | ------------------------------ |
|
||||
| Proposal Receive Rate | timeseries | `peer.proposal.receive` rate | — |
|
||||
| Validation Receive Rate | timeseries | `peer.validation.receive` rate | — |
|
||||
| Proposals Trusted vs Untrusted | piechart | by `xrpl_peer_proposal_trusted` | `xrpl_peer_proposal_trusted` |
|
||||
| Validations Trusted vs Untrusted | piechart | by `xrpl_peer_validation_trusted` | `xrpl_peer_validation_trusted` |
|
||||
| Panel | Type | PromQL | Labels Used |
|
||||
| -------------------------------- | ---------- | ------------------------------ | -------------------- |
|
||||
| Proposal Receive Rate | timeseries | `peer.proposal.receive` rate | — |
|
||||
| Validation Receive Rate | timeseries | `peer.validation.receive` rate | — |
|
||||
| Proposals Trusted vs Untrusted | piechart | by `proposal_trusted` | `proposal_trusted` |
|
||||
| Validations Trusted vs Untrusted | piechart | by `validation_trusted` | `validation_trusted` |
|
||||
|
||||
### Node Health -- System Metrics (`xrpld-system-node-health`)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user