mirror of
https://github.com/XRPLF/rippled.git
synced 2026-06-06 10:16:45 +00:00
606 lines
18 KiB
JSON
606 lines
18 KiB
JSON
{
|
|
"annotations": {
|
|
"list": []
|
|
},
|
|
"description": "RPC and pathfinding metrics from beast::insight System Metrics, plus gRPC and pathfinding panels derived from trace span metrics. Requires [insight] server=otel in rippled config.",
|
|
"editable": true,
|
|
"fiscalYearStartMonth": 0,
|
|
"graphTooltip": 1,
|
|
"id": null,
|
|
"links": [],
|
|
"panels": [
|
|
{
|
|
"title": "RPC Request Rate (System Metrics)",
|
|
"description": "Rate of RPC requests as counted by the beast::insight counter. Sourced from rpc.requests (ServerHandler.cpp) which increments on every HTTP and WebSocket RPC request. Compare with the span-based rpc.request rate in the RPC Performance dashboard for cross-validation.",
|
|
"type": "stat",
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 0,
|
|
"y": 0
|
|
},
|
|
"options": {
|
|
"tooltip": {
|
|
"mode": "multi",
|
|
"sort": "desc"
|
|
}
|
|
},
|
|
"targets": [
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus"
|
|
},
|
|
"expr": "rate(xrpld_rpc_requests_total{exported_instance=~\"$node\"}[5m])",
|
|
"legendFormat": "Requests / Sec [{{exported_instance}}]"
|
|
}
|
|
],
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "reqps"
|
|
},
|
|
"overrides": []
|
|
}
|
|
},
|
|
{
|
|
"title": "RPC Response Time (System Metrics)",
|
|
"description": "P95 of RPC response time from the beast::insight timer. Sourced from the rpc.time event (ServerHandler.cpp) which records elapsed milliseconds for each RPC response. This measures the full HTTP handler time, not just command execution. Compare with span-based rpc.request duration.",
|
|
"type": "timeseries",
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 12,
|
|
"y": 0
|
|
},
|
|
"options": {
|
|
"tooltip": {
|
|
"mode": "multi",
|
|
"sort": "desc"
|
|
}
|
|
},
|
|
"targets": [
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus"
|
|
},
|
|
"expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(xrpld_rpc_time_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))",
|
|
"legendFormat": "P95 Response Time [{{exported_instance}}]"
|
|
}
|
|
],
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "ms",
|
|
"custom": {
|
|
"axisLabel": "Latency (ms)",
|
|
"spanNulls": true,
|
|
"insertNulls": false,
|
|
"showPoints": "auto",
|
|
"pointSize": 3
|
|
}
|
|
},
|
|
"overrides": []
|
|
}
|
|
},
|
|
{
|
|
"title": "RPC Response Size",
|
|
"description": "P95 of RPC response payload size in bytes. Sourced from the rpc.size event (ServerHandler.cpp) which records the byte length of each RPC JSON response. Large responses may indicate expensive queries (e.g. account_tx with many results) or API misuse.",
|
|
"type": "timeseries",
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 0,
|
|
"y": 8
|
|
},
|
|
"options": {
|
|
"tooltip": {
|
|
"mode": "multi",
|
|
"sort": "desc"
|
|
}
|
|
},
|
|
"targets": [
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus"
|
|
},
|
|
"expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(xrpld_rpc_size_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))",
|
|
"legendFormat": "P95 Response Size [{{exported_instance}}]"
|
|
}
|
|
],
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "decbytes",
|
|
"custom": {
|
|
"axisLabel": "Size (Bytes)",
|
|
"spanNulls": true,
|
|
"insertNulls": false,
|
|
"showPoints": "auto",
|
|
"pointSize": 3
|
|
}
|
|
},
|
|
"overrides": []
|
|
}
|
|
},
|
|
{
|
|
"title": "RPC Response Time Distribution",
|
|
"description": "Distribution of RPC response times from the beast::insight timer showing P90, P95, and P99 quantiles. Sourced from the rpc.time event (ServerHandler.cpp). Useful for detecting bimodal latency or long-tail requests.",
|
|
"type": "timeseries",
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 12,
|
|
"y": 8
|
|
},
|
|
"options": {
|
|
"tooltip": {
|
|
"mode": "multi",
|
|
"sort": "desc"
|
|
}
|
|
},
|
|
"targets": [
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus"
|
|
},
|
|
"expr": "histogram_quantile(0.9, sum by (le, exported_instance) (rate(xrpld_rpc_time_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))",
|
|
"legendFormat": "P90 [{{exported_instance}}]"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus"
|
|
},
|
|
"expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(xrpld_rpc_time_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))",
|
|
"legendFormat": "P95 [{{exported_instance}}]"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus"
|
|
},
|
|
"expr": "histogram_quantile(0.99, sum by (le, exported_instance) (rate(xrpld_rpc_time_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))",
|
|
"legendFormat": "P99 [{{exported_instance}}]"
|
|
}
|
|
],
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "ms",
|
|
"custom": {
|
|
"axisLabel": "Latency (ms)",
|
|
"spanNulls": true,
|
|
"insertNulls": false,
|
|
"showPoints": "auto",
|
|
"pointSize": 3
|
|
}
|
|
},
|
|
"overrides": []
|
|
}
|
|
},
|
|
{
|
|
"title": "Pathfinding Fast Duration",
|
|
"description": "P95 of fast pathfinding execution time. Sourced from the pathfind_fast event (PathRequests.h) which records the duration of the fast pathfinding algorithm. Fast pathfinding uses a simplified search that trades accuracy for speed.",
|
|
"type": "timeseries",
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 0,
|
|
"y": 16
|
|
},
|
|
"options": {
|
|
"tooltip": {
|
|
"mode": "multi",
|
|
"sort": "desc"
|
|
}
|
|
},
|
|
"targets": [
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus"
|
|
},
|
|
"expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(xrpld_pathfind_fast_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))",
|
|
"legendFormat": "P95 Fast Pathfind [{{exported_instance}}]"
|
|
}
|
|
],
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "ms",
|
|
"custom": {
|
|
"axisLabel": "Duration (ms)",
|
|
"spanNulls": true,
|
|
"insertNulls": false,
|
|
"showPoints": "auto",
|
|
"pointSize": 3
|
|
}
|
|
},
|
|
"overrides": []
|
|
}
|
|
},
|
|
{
|
|
"title": "Pathfinding Full Duration",
|
|
"description": "P95 of full pathfinding execution time. Sourced from the pathfind_full event (PathRequests.h) which records the duration of the exhaustive pathfinding search. Full pathfinding is more expensive and can take significantly longer than fast mode.",
|
|
"type": "timeseries",
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 12,
|
|
"y": 16
|
|
},
|
|
"options": {
|
|
"tooltip": {
|
|
"mode": "multi",
|
|
"sort": "desc"
|
|
}
|
|
},
|
|
"targets": [
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus"
|
|
},
|
|
"expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(xrpld_pathfind_full_milliseconds_bucket{exported_instance=~\"$node\"}[5m])))",
|
|
"legendFormat": "P95 Full Pathfind [{{exported_instance}}]"
|
|
}
|
|
],
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "ms",
|
|
"custom": {
|
|
"axisLabel": "Duration (ms)",
|
|
"spanNulls": true,
|
|
"insertNulls": false,
|
|
"showPoints": "auto",
|
|
"pointSize": 3
|
|
}
|
|
},
|
|
"overrides": []
|
|
}
|
|
},
|
|
{
|
|
"title": "Resource Warnings Rate",
|
|
"description": "Rate of resource warning events from the Resource Manager. Sourced from the warn meter (Logic.h) which increments when a consumer (peer or RPC client) exceeds the warning threshold for resource usage. A rising rate indicates aggressive clients that may need throttling. NOTE: This panel will show no data until the |m -> |c fix is applied in StatsDCollector.cpp (Phase 6 Task 6.1).",
|
|
"type": "stat",
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 0,
|
|
"y": 24
|
|
},
|
|
"options": {
|
|
"tooltip": {
|
|
"mode": "multi",
|
|
"sort": "desc"
|
|
}
|
|
},
|
|
"targets": [
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus"
|
|
},
|
|
"expr": "rate(xrpld_warn_total{exported_instance=~\"$node\"}[5m])",
|
|
"legendFormat": "Warnings / Sec [{{exported_instance}}]"
|
|
}
|
|
],
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "ops",
|
|
"thresholds": {
|
|
"steps": [
|
|
{
|
|
"color": "green",
|
|
"value": null
|
|
},
|
|
{
|
|
"color": "yellow",
|
|
"value": 0.1
|
|
},
|
|
{
|
|
"color": "red",
|
|
"value": 1
|
|
}
|
|
]
|
|
}
|
|
},
|
|
"overrides": []
|
|
}
|
|
},
|
|
{
|
|
"title": "Resource Drops Rate",
|
|
"description": "Rate of resource drop events from the Resource Manager. Sourced from the drop meter (Logic.h) which increments when a consumer is disconnected or blocked due to excessive resource usage. Non-zero values mean the node is actively rejecting abusive connections. NOTE: This panel will show no data until the |m -> |c fix is applied in StatsDCollector.cpp (Phase 6 Task 6.1).",
|
|
"type": "stat",
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 12,
|
|
"y": 24
|
|
},
|
|
"options": {
|
|
"tooltip": {
|
|
"mode": "multi",
|
|
"sort": "desc"
|
|
}
|
|
},
|
|
"targets": [
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus"
|
|
},
|
|
"expr": "rate(xrpld_drop_total{exported_instance=~\"$node\"}[5m])",
|
|
"legendFormat": "Drops / Sec [{{exported_instance}}]"
|
|
}
|
|
],
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "ops",
|
|
"thresholds": {
|
|
"steps": [
|
|
{
|
|
"color": "green",
|
|
"value": null
|
|
},
|
|
{
|
|
"color": "yellow",
|
|
"value": 0.01
|
|
},
|
|
{
|
|
"color": "red",
|
|
"value": 0.1
|
|
}
|
|
]
|
|
}
|
|
},
|
|
"overrides": []
|
|
}
|
|
},
|
|
{
|
|
"title": "gRPC Request Rate by Method (Spans)",
|
|
"description": "Per-method gRPC call rate derived from the grpc.{Method} spans (GRPCServer.cpp). Covers the gRPC API used by reporting/Clio. Populated only when the node serves gRPC traffic.",
|
|
"type": "timeseries",
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 0,
|
|
"y": 32
|
|
},
|
|
"options": {
|
|
"tooltip": {
|
|
"mode": "multi",
|
|
"sort": "desc"
|
|
}
|
|
},
|
|
"targets": [
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus"
|
|
},
|
|
"expr": "sum by (method, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", method=~\"$grpc_method\", span_name=~\"grpc\\\\..*\"}[5m]))",
|
|
"legendFormat": "{{method}} [{{exported_instance}}]"
|
|
}
|
|
],
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "ops",
|
|
"custom": {
|
|
"axisLabel": "Calls / Sec",
|
|
"spanNulls": true,
|
|
"insertNulls": false,
|
|
"showPoints": "auto",
|
|
"pointSize": 3
|
|
}
|
|
},
|
|
"overrides": []
|
|
}
|
|
},
|
|
{
|
|
"title": "gRPC Latency P95 by Method (Spans)",
|
|
"description": "P95 latency per gRPC method from grpc.{Method} span durations. Identifies slow gRPC read paths.",
|
|
"type": "timeseries",
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 12,
|
|
"y": 32
|
|
},
|
|
"options": {
|
|
"tooltip": {
|
|
"mode": "multi",
|
|
"sort": "desc"
|
|
}
|
|
},
|
|
"targets": [
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus"
|
|
},
|
|
"expr": "histogram_quantile(0.95, sum by (le, method, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", method=~\"$grpc_method\", span_name=~\"grpc\\\\..*\"}[5m])))",
|
|
"legendFormat": "{{method}} [{{exported_instance}}]"
|
|
}
|
|
],
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "ms",
|
|
"custom": {
|
|
"axisLabel": "Duration (ms)",
|
|
"spanNulls": true,
|
|
"insertNulls": false,
|
|
"showPoints": "auto",
|
|
"pointSize": 3
|
|
}
|
|
},
|
|
"overrides": []
|
|
}
|
|
},
|
|
{
|
|
"title": "gRPC Error Rate by Status (Spans)",
|
|
"description": "Rate of gRPC spans broken down by grpc_status (success/error/resource_exhausted/failed_precondition). A rising error or resource_exhausted rate indicates gRPC clients hitting limits.",
|
|
"type": "timeseries",
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 0,
|
|
"y": 40
|
|
},
|
|
"options": {
|
|
"tooltip": {
|
|
"mode": "multi",
|
|
"sort": "desc"
|
|
}
|
|
},
|
|
"targets": [
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus"
|
|
},
|
|
"expr": "sum by (grpc_status, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=~\"grpc\\\\..*\", grpc_status!=\"\"}[5m]))",
|
|
"legendFormat": "{{grpc_status}} [{{exported_instance}}]"
|
|
}
|
|
],
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "ops",
|
|
"custom": {
|
|
"axisLabel": "Calls / Sec",
|
|
"spanNulls": true,
|
|
"insertNulls": false,
|
|
"showPoints": "auto",
|
|
"pointSize": 3
|
|
}
|
|
},
|
|
"overrides": []
|
|
}
|
|
},
|
|
{
|
|
"title": "Pathfinding Compute Duration (Spans)",
|
|
"description": "P95 of the pathfind.compute span, the per-request path computation. Complements the StatsD pathfind_fast/full timers with span-level visibility. Populated under pathfinding (book/path) RPC load.",
|
|
"type": "timeseries",
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 12,
|
|
"y": 40
|
|
},
|
|
"options": {
|
|
"tooltip": {
|
|
"mode": "multi",
|
|
"sort": "desc"
|
|
}
|
|
},
|
|
"targets": [
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus"
|
|
},
|
|
"expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"pathfind.compute\"}[5m])))",
|
|
"legendFormat": "P95 Compute [{{exported_instance}}]"
|
|
}
|
|
],
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "ms",
|
|
"custom": {
|
|
"axisLabel": "Duration (ms)",
|
|
"spanNulls": true,
|
|
"insertNulls": false,
|
|
"showPoints": "auto",
|
|
"pointSize": 3
|
|
}
|
|
},
|
|
"overrides": []
|
|
}
|
|
},
|
|
{
|
|
"title": "Pathfinding Request & Discovery Rate (Spans)",
|
|
"description": "Rate of pathfind.request (client path requests) and pathfind.discover (path-discovery passes) spans. Shows pathfinding demand and the discovery cost driver for subscription-heavy nodes.",
|
|
"type": "timeseries",
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 24,
|
|
"x": 0,
|
|
"y": 48
|
|
},
|
|
"options": {
|
|
"tooltip": {
|
|
"mode": "multi",
|
|
"sort": "desc"
|
|
}
|
|
},
|
|
"targets": [
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus"
|
|
},
|
|
"expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"pathfind.request\"}[5m]))",
|
|
"legendFormat": "Requests / Sec [{{exported_instance}}]"
|
|
},
|
|
{
|
|
"datasource": {
|
|
"type": "prometheus"
|
|
},
|
|
"expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"pathfind.discover\"}[5m]))",
|
|
"legendFormat": "Discoveries / Sec [{{exported_instance}}]"
|
|
}
|
|
],
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "ops",
|
|
"custom": {
|
|
"axisLabel": "Operations / Sec",
|
|
"spanNulls": true,
|
|
"insertNulls": false,
|
|
"showPoints": "auto",
|
|
"pointSize": 3
|
|
}
|
|
},
|
|
"overrides": []
|
|
}
|
|
}
|
|
],
|
|
"schemaVersion": 39,
|
|
"tags": ["xrpld", "statsd", "rpc", "pathfinding"],
|
|
"templating": {
|
|
"list": [
|
|
{
|
|
"name": "node",
|
|
"label": "Node",
|
|
"description": "Filter by rippled node (service.instance.id)",
|
|
"type": "query",
|
|
"query": "label_values(xrpld_rpc_requests_total, exported_instance)",
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"includeAll": true,
|
|
"allValue": ".*",
|
|
"current": {
|
|
"text": "All",
|
|
"value": "$__all"
|
|
},
|
|
"multi": true,
|
|
"refresh": 2,
|
|
"sort": 1
|
|
},
|
|
{
|
|
"name": "grpc_method",
|
|
"label": "gRPC Method",
|
|
"description": "Filter by gRPC method (GetLedger, GetLedgerData, GetLedgerDiff, GetLedgerEntry)",
|
|
"type": "query",
|
|
"query": "label_values(traces_span_metrics_calls_total{span_name=~\"grpc\\\\..*\"}, method)",
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "prometheus"
|
|
},
|
|
"includeAll": true,
|
|
"allValue": ".*",
|
|
"current": {
|
|
"text": "All",
|
|
"value": "$__all"
|
|
},
|
|
"multi": true,
|
|
"refresh": 2,
|
|
"sort": 1
|
|
}
|
|
]
|
|
},
|
|
"time": {
|
|
"from": "now-1h",
|
|
"to": "now"
|
|
},
|
|
"title": "RPC & Pathfinding (System Metrics)",
|
|
"uid": "xrpld-system-rpc",
|
|
"refresh": "5s"
|
|
}
|