mirror of
https://github.com/XRPLF/rippled.git
synced 2026-06-02 16:26:48 +00:00
fix(telemetry): fix CI failures in phase-6 build, clang-tidy, and rename checks
Build fixes in PeerImp.cpp: - Rename duplicate `span` variable to `consSpan` in proposal and validation handlers to avoid redefinition error - Fix `->` on non-pointer SpanGuard (now correctly on shared_ptr) - Fix move-only type copy in lambda capture Clang-tidy fixes: - Concatenate nested namespaces in LedgerSpanNames.h and PeerSpanNames.h - Add missing SpanNames.h includes in BuildLedger.cpp, LedgerMaster.cpp, PeerImp.cpp for direct seg:: symbol usage - Add missing <chrono> and <cstdint> includes in BuildLedger.cpp - Remove unused Feature.h include from BuildLedger.cpp Rename check fix: - Run docs.sh to rename rippled_ metric prefixes to xrpld_ in 09-data-collection-reference.md and telemetry-runbook.md Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -42,7 +42,7 @@ graph LR
|
||||
BP -->|"OTLP/gRPC :4317"| D
|
||||
|
||||
SM -->|"span_calls_total<br/>span_duration_ms<br/>(6 dimension labels)"| E
|
||||
R2 -->|"rippled_* gauges<br/>rippled_* counters<br/>rippled_* summaries"| E
|
||||
R2 -->|"xrpld_* gauges<br/>xrpld_* counters<br/>xrpld_* summaries"| E
|
||||
|
||||
E -->|"Prometheus<br/>data source"| F
|
||||
D -->|"Tempo<br/>data source"| F
|
||||
@@ -400,59 +400,59 @@ These are system-level metrics emitted by xrpld's `beast::insight` framework via
|
||||
[insight]
|
||||
server=statsd
|
||||
address=127.0.0.1:8125
|
||||
prefix=rippled
|
||||
prefix=xrpld
|
||||
```
|
||||
|
||||
> **Note**: The `prefix` value is user-configurable — all metric names in the tables below assume `prefix=rippled` (matching the integration test and Grafana dashboards). If you change the prefix, replace `rippled_` with `{your_prefix}_` in all PromQL queries.
|
||||
> **Note**: The `prefix` value is user-configurable — all metric names in the tables below assume `prefix=xrpld` (matching the integration test and Grafana dashboards). If you change the prefix, replace `xrpld_` with `{your_prefix}_` in all PromQL queries.
|
||||
|
||||
### 2.1 Gauges
|
||||
|
||||
| Prometheus Metric | Source File | Description | Typical Range |
|
||||
| --------------------------------------------------- | --------------------- | ---------------------------------------- | ------------------------------- |
|
||||
| `rippled_LedgerMaster_Validated_Ledger_Age` | LedgerMaster.h | Seconds since last validated ledger | 0–10 (healthy), >30 (stale) |
|
||||
| `rippled_LedgerMaster_Published_Ledger_Age` | LedgerMaster.h | Seconds since last published ledger | 0–10 (healthy) |
|
||||
| `rippled_State_Accounting_Disconnected_duration` | NetworkOPs.cpp | Cumulative seconds in Disconnected state | Monotonic |
|
||||
| `rippled_State_Accounting_Connected_duration` | NetworkOPs.cpp | Cumulative seconds in Connected state | Monotonic |
|
||||
| `rippled_State_Accounting_Syncing_duration` | NetworkOPs.cpp | Cumulative seconds in Syncing state | Monotonic |
|
||||
| `rippled_State_Accounting_Tracking_duration` | NetworkOPs.cpp | Cumulative seconds in Tracking state | Monotonic |
|
||||
| `rippled_State_Accounting_Full_duration` | NetworkOPs.cpp | Cumulative seconds in Full state | Monotonic (should dominate) |
|
||||
| `rippled_State_Accounting_Disconnected_transitions` | NetworkOPs.cpp | Count of transitions to Disconnected | Low |
|
||||
| `rippled_State_Accounting_Connected_transitions` | NetworkOPs.cpp | Count of transitions to Connected | Low |
|
||||
| `rippled_State_Accounting_Syncing_transitions` | NetworkOPs.cpp | Count of transitions to Syncing | Low |
|
||||
| `rippled_State_Accounting_Tracking_transitions` | NetworkOPs.cpp | Count of transitions to Tracking | Low |
|
||||
| `rippled_State_Accounting_Full_transitions` | NetworkOPs.cpp | Count of transitions to Full | Low (should be 1 after startup) |
|
||||
| `rippled_Peer_Finder_Active_Inbound_Peers` | PeerfinderManager.cpp | Active inbound peer connections | 0–85 |
|
||||
| `rippled_Peer_Finder_Active_Outbound_Peers` | PeerfinderManager.cpp | Active outbound peer connections | 10–21 |
|
||||
| `rippled_Overlay_Peer_Disconnects` | OverlayImpl.cpp | Cumulative peer disconnection count | Low growth |
|
||||
| `rippled_job_count` | JobQueue.cpp | Current job queue depth | 0–100 (healthy) |
|
||||
| `rippled_Node_family_full_below_cache_size` | TaggedCache.h | FullBelowCache entry count | Varies |
|
||||
| `rippled_Node_family_full_below_cache_hit_rate` | TaggedCache.h | FullBelowCache hit rate percentage | 0–100 |
|
||||
| Prometheus Metric | Source File | Description | Typical Range |
|
||||
| ------------------------------------------------- | --------------------- | ---------------------------------------- | ------------------------------- |
|
||||
| `xrpld_LedgerMaster_Validated_Ledger_Age` | LedgerMaster.h | Seconds since last validated ledger | 0–10 (healthy), >30 (stale) |
|
||||
| `xrpld_LedgerMaster_Published_Ledger_Age` | LedgerMaster.h | Seconds since last published ledger | 0–10 (healthy) |
|
||||
| `xrpld_State_Accounting_Disconnected_duration` | NetworkOPs.cpp | Cumulative seconds in Disconnected state | Monotonic |
|
||||
| `xrpld_State_Accounting_Connected_duration` | NetworkOPs.cpp | Cumulative seconds in Connected state | Monotonic |
|
||||
| `xrpld_State_Accounting_Syncing_duration` | NetworkOPs.cpp | Cumulative seconds in Syncing state | Monotonic |
|
||||
| `xrpld_State_Accounting_Tracking_duration` | NetworkOPs.cpp | Cumulative seconds in Tracking state | Monotonic |
|
||||
| `xrpld_State_Accounting_Full_duration` | NetworkOPs.cpp | Cumulative seconds in Full state | Monotonic (should dominate) |
|
||||
| `xrpld_State_Accounting_Disconnected_transitions` | NetworkOPs.cpp | Count of transitions to Disconnected | Low |
|
||||
| `xrpld_State_Accounting_Connected_transitions` | NetworkOPs.cpp | Count of transitions to Connected | Low |
|
||||
| `xrpld_State_Accounting_Syncing_transitions` | NetworkOPs.cpp | Count of transitions to Syncing | Low |
|
||||
| `xrpld_State_Accounting_Tracking_transitions` | NetworkOPs.cpp | Count of transitions to Tracking | Low |
|
||||
| `xrpld_State_Accounting_Full_transitions` | NetworkOPs.cpp | Count of transitions to Full | Low (should be 1 after startup) |
|
||||
| `xrpld_Peer_Finder_Active_Inbound_Peers` | PeerfinderManager.cpp | Active inbound peer connections | 0–85 |
|
||||
| `xrpld_Peer_Finder_Active_Outbound_Peers` | PeerfinderManager.cpp | Active outbound peer connections | 10–21 |
|
||||
| `xrpld_Overlay_Peer_Disconnects` | OverlayImpl.cpp | Cumulative peer disconnection count | Low growth |
|
||||
| `xrpld_job_count` | JobQueue.cpp | Current job queue depth | 0–100 (healthy) |
|
||||
| `xrpld_Node_family_full_below_cache_size` | TaggedCache.h | FullBelowCache entry count | Varies |
|
||||
| `xrpld_Node_family_full_below_cache_hit_rate` | TaggedCache.h | FullBelowCache hit rate percentage | 0–100 |
|
||||
|
||||
**Grafana dashboard**: _Node Health (StatsD)_ (`xrpld-statsd-node-health`)
|
||||
|
||||
### 2.2 Counters
|
||||
|
||||
| Prometheus Metric | Source File | Description |
|
||||
| --------------------------------- | ------------------ | --------------------------------------------- |
|
||||
| `rippled_rpc_requests` | ServerHandler.cpp | Total RPC requests received |
|
||||
| `rippled_ledger_fetches` | InboundLedgers.cpp | Inbound ledger fetch attempts |
|
||||
| `rippled_ledger_history_mismatch` | LedgerHistory.cpp | Ledger hash mismatches detected |
|
||||
| `rippled_warn` | Logic.h | Resource manager warnings issued |
|
||||
| `rippled_drop` | Logic.h | Resource manager drops (connections rejected) |
|
||||
| Prometheus Metric | Source File | Description |
|
||||
| ------------------------------- | ------------------ | --------------------------------------------- |
|
||||
| `xrpld_rpc_requests` | ServerHandler.cpp | Total RPC requests received |
|
||||
| `xrpld_ledger_fetches` | InboundLedgers.cpp | Inbound ledger fetch attempts |
|
||||
| `xrpld_ledger_history_mismatch` | LedgerHistory.cpp | Ledger hash mismatches detected |
|
||||
| `xrpld_warn` | Logic.h | Resource manager warnings issued |
|
||||
| `xrpld_drop` | Logic.h | Resource manager drops (connections rejected) |
|
||||
|
||||
**Note**: `rippled_warn` and `rippled_drop` use non-standard StatsD meter type (`|m`). The OTel StatsD receiver only recognizes `|c`, `|g`, `|ms`, `|h`, `|s` — these metrics may be silently dropped. See Known Issues below.
|
||||
**Note**: `xrpld_warn` and `xrpld_drop` use the non-standard StatsD meter type (`|m`). The OTel StatsD receiver only recognizes `|c`, `|g`, `|ms`, `|h`, and `|s`, so these metrics may be silently dropped. See Known Issues below.
|
||||
|
||||
**Grafana dashboard**: _RPC & Pathfinding (StatsD)_ (`xrpld-statsd-rpc`)
|
||||
|
||||
### 2.3 Histograms (from StatsD timers)
|
||||
|
||||
| Prometheus Metric | Source File | Unit | Description |
|
||||
| ----------------------- | ----------------- | ----- | ------------------------------ |
|
||||
| `rippled_rpc_time` | ServerHandler.cpp | ms | RPC response time distribution |
|
||||
| `rippled_rpc_size` | ServerHandler.cpp | bytes | RPC response size distribution |
|
||||
| `rippled_ios_latency` | Application.cpp | ms | I/O service loop latency |
|
||||
| `rippled_pathfind_fast` | PathRequests.h | ms | Fast pathfinding duration |
|
||||
| `rippled_pathfind_full` | PathRequests.h | ms | Full pathfinding duration |
|
||||
| Prometheus Metric | Source File | Unit | Description |
|
||||
| --------------------- | ----------------- | ----- | ------------------------------ |
|
||||
| `xrpld_rpc_time` | ServerHandler.cpp | ms | RPC response time distribution |
|
||||
| `xrpld_rpc_size` | ServerHandler.cpp | bytes | RPC response size distribution |
|
||||
| `xrpld_ios_latency` | Application.cpp | ms | I/O service loop latency |
|
||||
| `xrpld_pathfind_fast` | PathRequests.h | ms | Fast pathfinding duration |
|
||||
| `xrpld_pathfind_full` | PathRequests.h | ms | Full pathfinding duration |
|
||||
|
||||
Quantiles collected: the 0th, 50th, 90th, 95th, 99th, and 100th percentiles.
|
||||
|
||||
@@ -462,10 +462,10 @@ Quantiles collected: 0th, 50th, 90th, 95th, 99th, 100th percentile.
|
||||
|
||||
For each of the 45+ overlay traffic categories (defined in `TrafficCount.h`), four gauges are emitted:
|
||||
|
||||
- `rippled_{category}_Bytes_In`
|
||||
- `rippled_{category}_Bytes_Out`
|
||||
- `rippled_{category}_Messages_In`
|
||||
- `rippled_{category}_Messages_Out`
|
||||
- `xrpld_{category}_Bytes_In`
|
||||
- `xrpld_{category}_Bytes_Out`
|
||||
- `xrpld_{category}_Messages_In`
|
||||
- `xrpld_{category}_Messages_Out`
|
||||
|
||||
**Key categories**:
|
||||
|
||||
@@ -490,8 +490,8 @@ For each of the 45+ overlay traffic categories (defined in `TrafficCount.h`), fo
|
||||
|
||||
For each of the 36 non-special job types (defined in `JobTypes.h`), two StatsD timer events are emitted:
|
||||
|
||||
- `rippled_{jobName}` — execution duration
|
||||
- `rippled_{jobName}_q` — dequeue wait time
|
||||
- `xrpld_{jobName}` — execution duration
|
||||
- `xrpld_{jobName}_q` — dequeue wait time
|
||||
|
||||
These produce summary metrics with quantiles (0th, 50th, 90th, 95th, 99th, 100th).
|
||||
|
||||
@@ -646,19 +646,19 @@ sum by (xrpl_peer_proposal_trusted) (rate(traces_span_metrics_calls_total{span_n
|
||||
|
||||
```promql
|
||||
# Validated ledger age (should be < 10s)
|
||||
rippled_LedgerMaster_Validated_Ledger_Age
|
||||
xrpld_LedgerMaster_Validated_Ledger_Age
|
||||
|
||||
# Active peer count
|
||||
rippled_Peer_Finder_Active_Inbound_Peers + rippled_Peer_Finder_Active_Outbound_Peers
|
||||
xrpld_Peer_Finder_Active_Inbound_Peers + xrpld_Peer_Finder_Active_Outbound_Peers
|
||||
|
||||
# RPC response time p95
|
||||
histogram_quantile(0.95, rippled_rpc_time_bucket)
|
||||
histogram_quantile(0.95, xrpld_rpc_time_bucket)
|
||||
|
||||
# Total network bytes in (rate)
|
||||
rate(rippled_total_Bytes_In[5m])
|
||||
rate(xrpld_total_Bytes_In[5m])
|
||||
|
||||
# Cumulative time in the "Full" operating mode (should keep increasing after startup)
|
||||
rippled_State_Accounting_Full_duration
|
||||
xrpld_State_Accounting_Full_duration
|
||||
```
|
||||
|
||||
---
|
||||
@@ -687,8 +687,8 @@ All span names and attributes are defined as compile-time constants in colocated
|
||||
| Issue | Impact | Status |
|
||||
| ------------------------------------------------------------------ | ------------------------------------------------ | -------------------------------------------------------------------- |
|
||||
| `warn` and `drop` metrics use non-standard StatsD `\|m` meter type | Metrics silently dropped by OTel StatsD receiver | Phase 6 Task 6.1 — needs `\|m` → `\|c` change in StatsDCollector.cpp |
|
||||
| `rippled_job_count` may not emit in standalone mode | Missing from Prometheus in some test configs | Requires active job queue activity |
|
||||
| `rippled_rpc_requests` depends on `[insight]` config | Zero series if StatsD not configured | Requires `[insight] server=statsd` in xrpld.cfg |
|
||||
| `xrpld_job_count` may not emit in standalone mode | Missing from Prometheus in some test configs | Requires active job queue activity |
|
||||
| `xrpld_rpc_requests` depends on `[insight]` config | Zero series if StatsD not configured | Requires `[insight] server=statsd` in xrpld.cfg |
|
||||
| Peer tracing disabled by default | No `peer.*` spans unless `trace_peer=1` | Intentional — high volume on mainnet |
|
||||
|
||||
---
|
||||
@@ -720,7 +720,7 @@ enabled=1
|
||||
[insight]
|
||||
server=statsd
|
||||
address=127.0.0.1:8125
|
||||
prefix=rippled
|
||||
prefix=xrpld
|
||||
```
|
||||
|
||||
### Production Setup
|
||||
@@ -737,7 +737,7 @@ max_queue_size=4096
|
||||
[insight]
|
||||
server=statsd
|
||||
address=otel-collector:8125
|
||||
prefix=rippled
|
||||
prefix=xrpld
|
||||
```
|
||||
|
||||
### Trace Category Toggle
|
||||
|
||||
@@ -288,7 +288,7 @@ Add to `xrpld.cfg`:
|
||||
[insight]
|
||||
server=statsd
|
||||
address=127.0.0.1:8125
|
||||
prefix=rippled
|
||||
prefix=xrpld
|
||||
```
|
||||
|
||||
The OTel Collector receives these via a `statsd` receiver on UDP port 8125 and exports them to Prometheus alongside spanmetrics.
|
||||
@@ -297,38 +297,38 @@ The OTel Collector receives these via a `statsd` receiver on UDP port 8125 and e
|
||||
|
||||
#### Gauges
|
||||
|
||||
| Prometheus Metric | Source | Description |
|
||||
| --------------------------------------------- | ------------------------- | -------------------------------------------------------------------------- |
|
||||
| `rippled_LedgerMaster_Validated_Ledger_Age` | LedgerMaster.h:373 | Age of validated ledger (seconds) |
|
||||
| `rippled_LedgerMaster_Published_Ledger_Age` | LedgerMaster.h:374 | Age of published ledger (seconds) |
|
||||
| `rippled_State_Accounting_{Mode}_duration` | NetworkOPs.cpp:774 | Time in each operating mode (Disconnected/Connected/Syncing/Tracking/Full) |
|
||||
| `rippled_State_Accounting_{Mode}_transitions` | NetworkOPs.cpp:780 | Transition count per mode |
|
||||
| `rippled_Peer_Finder_Active_Inbound_Peers` | PeerfinderManager.cpp:214 | Active inbound peer connections |
|
||||
| `rippled_Peer_Finder_Active_Outbound_Peers` | PeerfinderManager.cpp:215 | Active outbound peer connections |
|
||||
| `rippled_Overlay_Peer_Disconnects` | OverlayImpl.h:557 | Peer disconnect count |
|
||||
| `rippled_job_count` | JobQueue.cpp:26 | Current job queue depth |
|
||||
| `rippled_{category}_Bytes_In/Out` | OverlayImpl.h:535 | Overlay traffic bytes per category (57 categories) |
|
||||
| `rippled_{category}_Messages_In/Out` | OverlayImpl.h:535 | Overlay traffic messages per category |
|
||||
| Prometheus Metric | Source | Description |
|
||||
| ------------------------------------------- | ------------------------- | -------------------------------------------------------------------------- |
|
||||
| `xrpld_LedgerMaster_Validated_Ledger_Age` | LedgerMaster.h:373 | Age of validated ledger (seconds) |
|
||||
| `xrpld_LedgerMaster_Published_Ledger_Age` | LedgerMaster.h:374 | Age of published ledger (seconds) |
|
||||
| `xrpld_State_Accounting_{Mode}_duration` | NetworkOPs.cpp:774 | Time in each operating mode (Disconnected/Connected/Syncing/Tracking/Full) |
|
||||
| `xrpld_State_Accounting_{Mode}_transitions` | NetworkOPs.cpp:780 | Transition count per mode |
|
||||
| `xrpld_Peer_Finder_Active_Inbound_Peers` | PeerfinderManager.cpp:214 | Active inbound peer connections |
|
||||
| `xrpld_Peer_Finder_Active_Outbound_Peers` | PeerfinderManager.cpp:215 | Active outbound peer connections |
|
||||
| `xrpld_Overlay_Peer_Disconnects` | OverlayImpl.h:557 | Peer disconnect count |
|
||||
| `xrpld_job_count` | JobQueue.cpp:26 | Current job queue depth |
|
||||
| `xrpld_{category}_Bytes_In/Out` | OverlayImpl.h:535 | Overlay traffic bytes per category (57 categories) |
|
||||
| `xrpld_{category}_Messages_In/Out` | OverlayImpl.h:535 | Overlay traffic messages per category |
|
||||
|
||||
#### Counters
|
||||
|
||||
| Prometheus Metric | Source | Description |
|
||||
| --------------------------------- | --------------------- | ------------------------------ |
|
||||
| `rippled_rpc_requests` | ServerHandler.cpp:108 | Total RPC request count |
|
||||
| `rippled_ledger_fetches` | InboundLedgers.cpp:44 | Ledger fetch request count |
|
||||
| `rippled_ledger_history_mismatch` | LedgerHistory.cpp:16 | Ledger hash mismatch count |
|
||||
| `rippled_warn` | Logic.h:33 | Resource manager warning count |
|
||||
| `rippled_drop` | Logic.h:34 | Resource manager drop count |
|
||||
| Prometheus Metric | Source | Description |
|
||||
| ------------------------------- | --------------------- | ------------------------------ |
|
||||
| `xrpld_rpc_requests` | ServerHandler.cpp:108 | Total RPC request count |
|
||||
| `xrpld_ledger_fetches` | InboundLedgers.cpp:44 | Ledger fetch request count |
|
||||
| `xrpld_ledger_history_mismatch` | LedgerHistory.cpp:16 | Ledger hash mismatch count |
|
||||
| `xrpld_warn` | Logic.h:33 | Resource manager warning count |
|
||||
| `xrpld_drop` | Logic.h:34 | Resource manager drop count |
|
||||
|
||||
#### Histograms (from StatsD timers)
|
||||
|
||||
| Prometheus Metric | Source | Description |
|
||||
| ----------------------- | --------------------- | ------------------------------ |
|
||||
| `rippled_rpc_time` | ServerHandler.cpp:110 | RPC response time (ms) |
|
||||
| `rippled_rpc_size` | ServerHandler.cpp:109 | RPC response size (bytes) |
|
||||
| `rippled_ios_latency` | Application.cpp:438 | I/O service loop latency (ms) |
|
||||
| `rippled_pathfind_fast` | PathRequests.h:23 | Fast pathfinding duration (ms) |
|
||||
| `rippled_pathfind_full` | PathRequests.h:24 | Full pathfinding duration (ms) |
|
||||
| Prometheus Metric | Source | Description |
|
||||
| --------------------- | --------------------- | ------------------------------ |
|
||||
| `xrpld_rpc_time` | ServerHandler.cpp:110 | RPC response time (ms) |
|
||||
| `xrpld_rpc_size` | ServerHandler.cpp:109 | RPC response size (bytes) |
|
||||
| `xrpld_ios_latency` | Application.cpp:438 | I/O service loop latency (ms) |
|
||||
| `xrpld_pathfind_fast` | PathRequests.h:23 | Fast pathfinding duration (ms) |
|
||||
| `xrpld_pathfind_full` | PathRequests.h:24 | Full pathfinding duration (ms) |
|
||||
|
||||
## Grafana Dashboards
|
||||
|
||||
@@ -401,52 +401,52 @@ Requires `trace_peer=1` in the `[telemetry]` config section.
|
||||
|
||||
### Node Health -- StatsD (`xrpld-statsd-node-health`)
|
||||
|
||||
| Panel | Type | PromQL | Labels Used |
|
||||
| -------------------------------------- | ---------- | ----------------------------------------------------------------- | ----------- |
|
||||
| Validated Ledger Age | stat | `rippled_LedgerMaster_Validated_Ledger_Age` | — |
|
||||
| Published Ledger Age | stat | `rippled_LedgerMaster_Published_Ledger_Age` | — |
|
||||
| Operating Mode Duration | timeseries | `rippled_State_Accounting_*_duration` | — |
|
||||
| Operating Mode Transitions | timeseries | `rippled_State_Accounting_*_transitions` | — |
|
||||
| I/O Latency | timeseries | `histogram_quantile(0.95, rippled_ios_latency_bucket)` | — |
|
||||
| Job Queue Depth | timeseries | `rippled_job_count` | — |
|
||||
| Ledger Fetch Rate | stat | `rate(rippled_ledger_fetches[5m])` | — |
|
||||
| Ledger History Mismatches | stat | `rate(rippled_ledger_history_mismatch[5m])` | — |
|
||||
| Key Jobs Execution Time | timeseries | `rippled_acceptLedger{quantile="$quantile"}` (+ 10 more key jobs) | `quantile` |
|
||||
| Key Jobs Dequeue Wait Time | timeseries | `rippled_acceptLedger_q{quantile="$quantile"}` (+ 10 more) | `quantile` |
|
||||
| FullBelowCache Size | timeseries | `rippled_Node_family_full_below_cache_size` | — |
|
||||
| FullBelowCache Hit Rate | gauge | `rippled_Node_family_full_below_cache_hit_rate` | — |
|
||||
| Ledger Publish Gap | stat | `Published_Ledger_Age - Validated_Ledger_Age` | — |
|
||||
| State Duration Rate (Full vs Tracking) | timeseries | `rate(rippled_State_Accounting_Full_duration[5m]) / 1000000` | — |
|
||||
| All Jobs Execution Time (Detail) | timeseries | `{__name__=~"rippled_<all_jobs>", quantile="$quantile"}` | `quantile` |
|
||||
| All Jobs Dequeue Wait (Detail) | timeseries | `{__name__=~"rippled_<all_jobs>_q", quantile="$quantile"}` | `quantile` |
|
||||
| Panel | Type | PromQL | Labels Used |
|
||||
| -------------------------------------- | ---------- | --------------------------------------------------------------- | ----------- |
|
||||
| Validated Ledger Age | stat | `xrpld_LedgerMaster_Validated_Ledger_Age` | — |
|
||||
| Published Ledger Age | stat | `xrpld_LedgerMaster_Published_Ledger_Age` | — |
|
||||
| Operating Mode Duration | timeseries | `xrpld_State_Accounting_*_duration` | — |
|
||||
| Operating Mode Transitions | timeseries | `xrpld_State_Accounting_*_transitions` | — |
|
||||
| I/O Latency | timeseries | `histogram_quantile(0.95, xrpld_ios_latency_bucket)` | — |
|
||||
| Job Queue Depth | timeseries | `xrpld_job_count` | — |
|
||||
| Ledger Fetch Rate | stat | `rate(xrpld_ledger_fetches[5m])` | — |
|
||||
| Ledger History Mismatches | stat | `rate(xrpld_ledger_history_mismatch[5m])` | — |
|
||||
| Key Jobs Execution Time | timeseries | `xrpld_acceptLedger{quantile="$quantile"}` (+ 10 more key jobs) | `quantile` |
|
||||
| Key Jobs Dequeue Wait Time | timeseries | `xrpld_acceptLedger_q{quantile="$quantile"}` (+ 10 more) | `quantile` |
|
||||
| FullBelowCache Size | timeseries | `xrpld_Node_family_full_below_cache_size` | — |
|
||||
| FullBelowCache Hit Rate | gauge | `xrpld_Node_family_full_below_cache_hit_rate` | — |
|
||||
| Ledger Publish Gap | stat | `Published_Ledger_Age - Validated_Ledger_Age` | — |
|
||||
| State Duration Rate (Full vs Tracking) | timeseries | `rate(xrpld_State_Accounting_Full_duration[5m]) / 1000000` | — |
|
||||
| All Jobs Execution Time (Detail) | timeseries | `{__name__=~"xrpld_<all_jobs>", quantile="$quantile"}` | `quantile` |
|
||||
| All Jobs Dequeue Wait (Detail) | timeseries | `{__name__=~"xrpld_<all_jobs>_q", quantile="$quantile"}` | `quantile` |
|
||||
|
||||
### Network Traffic -- StatsD (`xrpld-statsd-network`)
|
||||
|
||||
| Panel | Type | PromQL | Labels Used |
|
||||
| ------------------------------------ | ---------- | -------------------------------------------- | ----------- |
|
||||
| Active Peers | timeseries | `rippled_Peer_Finder_Active_*_Peers` | — |
|
||||
| Peer Disconnects | timeseries | `rippled_Overlay_Peer_Disconnects` | — |
|
||||
| Total Network Bytes | timeseries | `rate(rippled_total_Bytes_In/Out[5m])` | — |
|
||||
| Total Network Messages | timeseries | `rippled_total_Messages_In/Out` | — |
|
||||
| Transaction Traffic | timeseries | `rippled_transactions_Messages_In/Out` | — |
|
||||
| Proposal Traffic | timeseries | `rippled_proposals_Messages_In/Out` | — |
|
||||
| Validation Traffic | timeseries | `rippled_validations_Messages_In/Out` | — |
|
||||
| Traffic by Category | bargauge | `topk(10, rippled_*_Bytes_In)` | — |
|
||||
| Duplicate Traffic (Wasted Bandwidth) | timeseries | `rate(rippled_*_duplicate_Bytes_In/Out[5m])` | — |
|
||||
| All Traffic Categories (Detail) | timeseries | `topk(15, rate(rippled_*_Bytes_In[5m]))` | — |
|
||||
| Panel | Type | PromQL | Labels Used |
|
||||
| ------------------------------------ | ---------- | ------------------------------------------ | ----------- |
|
||||
| Active Peers | timeseries | `xrpld_Peer_Finder_Active_*_Peers` | — |
|
||||
| Peer Disconnects | timeseries | `xrpld_Overlay_Peer_Disconnects` | — |
|
||||
| Total Network Bytes | timeseries | `rate(xrpld_total_Bytes_In/Out[5m])` | — |
|
||||
| Total Network Messages | timeseries | `xrpld_total_Messages_In/Out` | — |
|
||||
| Transaction Traffic | timeseries | `xrpld_transactions_Messages_In/Out` | — |
|
||||
| Proposal Traffic | timeseries | `xrpld_proposals_Messages_In/Out` | — |
|
||||
| Validation Traffic | timeseries | `xrpld_validations_Messages_In/Out` | — |
|
||||
| Traffic by Category | bargauge | `topk(10, xrpld_*_Bytes_In)` | — |
|
||||
| Duplicate Traffic (Wasted Bandwidth) | timeseries | `rate(xrpld_*_duplicate_Bytes_In/Out[5m])` | — |
|
||||
| All Traffic Categories (Detail) | timeseries | `topk(15, rate(xrpld_*_Bytes_In[5m]))` | — |
|
||||
|
||||
### RPC & Pathfinding -- StatsD (`xrpld-statsd-rpc`)
|
||||
|
||||
| Panel | Type | PromQL | Labels Used |
|
||||
| ------------------------- | ---------- | -------------------------------------------------------- | ----------- |
|
||||
| RPC Request Rate | stat | `rate(rippled_rpc_requests[5m])` | — |
|
||||
| RPC Response Time | timeseries | `histogram_quantile(0.95, rippled_rpc_time_bucket)` | — |
|
||||
| RPC Response Size | timeseries | `histogram_quantile(0.95, rippled_rpc_size_bucket)` | — |
|
||||
| RPC Response Time Heatmap | heatmap | `rippled_rpc_time_bucket` | — |
|
||||
| Pathfinding Fast Duration | timeseries | `histogram_quantile(0.95, rippled_pathfind_fast_bucket)` | — |
|
||||
| Pathfinding Full Duration | timeseries | `histogram_quantile(0.95, rippled_pathfind_full_bucket)` | — |
|
||||
| Resource Warnings Rate | stat | `rate(rippled_warn[5m])` | — |
|
||||
| Resource Drops Rate | stat | `rate(rippled_drop[5m])` | — |
|
||||
| Panel | Type | PromQL | Labels Used |
|
||||
| ------------------------- | ---------- | ------------------------------------------------------ | ----------- |
|
||||
| RPC Request Rate | stat | `rate(xrpld_rpc_requests[5m])` | — |
|
||||
| RPC Response Time | timeseries | `histogram_quantile(0.95, xrpld_rpc_time_bucket)` | — |
|
||||
| RPC Response Size | timeseries | `histogram_quantile(0.95, xrpld_rpc_size_bucket)` | — |
|
||||
| RPC Response Time Heatmap | heatmap | `xrpld_rpc_time_bucket` | — |
|
||||
| Pathfinding Fast Duration | timeseries | `histogram_quantile(0.95, xrpld_pathfind_fast_bucket)` | — |
|
||||
| Pathfinding Full Duration | timeseries | `histogram_quantile(0.95, xrpld_pathfind_full_bucket)` | — |
|
||||
| Resource Warnings Rate | stat | `rate(xrpld_warn[5m])` | — |
|
||||
| Resource Drops Rate | stat | `rate(xrpld_drop[5m])` | — |
|
||||
|
||||
### Span → Metric → Dashboard Summary
|
||||
|
||||
|
||||
@@ -14,13 +14,15 @@
|
||||
#include <xrpl/ledger/Ledger.h>
|
||||
#include <xrpl/ledger/OpenView.h>
|
||||
#include <xrpl/nodestore/NodeObject.h>
|
||||
#include <xrpl/protocol/Feature.h>
|
||||
#include <xrpl/protocol/LedgerHeader.h>
|
||||
#include <xrpl/protocol/Protocol.h>
|
||||
#include <xrpl/telemetry/SpanGuard.h>
|
||||
#include <xrpl/telemetry/SpanNames.h>
|
||||
#include <xrpl/tx/apply.h>
|
||||
|
||||
#include <chrono>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <exception>
|
||||
#include <memory>
|
||||
#include <set>
|
||||
|
||||
@@ -57,6 +57,7 @@
|
||||
#include <xrpl/shamap/SHAMapMissingNode.h>
|
||||
#include <xrpl/shamap/SHAMapTreeNode.h>
|
||||
#include <xrpl/telemetry/SpanGuard.h>
|
||||
#include <xrpl/telemetry/SpanNames.h>
|
||||
|
||||
#include <boost/icl/concept/interval_set.hpp>
|
||||
|
||||
|
||||
@@ -15,9 +15,7 @@
|
||||
|
||||
#include <xrpl/telemetry/SpanNames.h>
|
||||
|
||||
namespace xrpl {
|
||||
namespace telemetry {
|
||||
namespace ledger_span {
|
||||
namespace xrpl::telemetry::ledger_span {
|
||||
|
||||
// ===== Span operation suffixes ===============================================
|
||||
|
||||
@@ -49,6 +47,4 @@ inline constexpr auto txFailed = join(xrplLedger, makeStr("tx_failed"));
|
||||
inline constexpr auto validations = join(xrplLedger, makeStr("validations"));
|
||||
} // namespace attr
|
||||
|
||||
} // namespace ledger_span
|
||||
} // namespace telemetry
|
||||
} // namespace xrpl
|
||||
} // namespace xrpl::telemetry::ledger_span
|
||||
|
||||
@@ -68,6 +68,7 @@
|
||||
#include <xrpl/server/NetworkOPs.h>
|
||||
#include <xrpl/shamap/SHAMapNodeID.h>
|
||||
#include <xrpl/telemetry/SpanGuard.h>
|
||||
#include <xrpl/telemetry/SpanNames.h>
|
||||
#include <xrpl/tx/apply.h>
|
||||
|
||||
#include <boost/algorithm/string/predicate.hpp>
|
||||
@@ -1966,17 +1967,16 @@ PeerImp::onMessage(std::shared_ptr<protocol::TMProposeSet> const& m)
|
||||
app_.getTimeKeeper().closeTime(),
|
||||
calcNodeID(app_.getValidatorManifests().getMasterKey(publicKey))});
|
||||
|
||||
// Create a receive span that links to the sender's trace context
|
||||
// (if propagated). shared_ptr keeps it alive across the job boundary.
|
||||
auto span = std::make_shared<telemetry::SpanGuard>(telemetry::proposalReceiveSpan(set));
|
||||
span->setAttribute(telemetry::cons_span::attr::trusted, isTrusted);
|
||||
span->setAttribute(telemetry::cons_span::attr::round, static_cast<int64_t>(set.proposeseq()));
|
||||
auto consSpan = std::make_shared<telemetry::SpanGuard>(telemetry::proposalReceiveSpan(set));
|
||||
consSpan->setAttribute(telemetry::cons_span::attr::trusted, isTrusted);
|
||||
consSpan->setAttribute(
|
||||
telemetry::cons_span::attr::round, static_cast<int64_t>(set.proposeseq()));
|
||||
|
||||
std::weak_ptr<PeerImp> const weak = shared_from_this();
|
||||
app_.getJobQueue().addJob(
|
||||
isTrusted ? jtPROPOSAL_t : jtPROPOSAL_ut,
|
||||
"checkPropose",
|
||||
[weak, isTrusted, m, proposal, sp = std::move(span)]() {
|
||||
[weak, isTrusted, m, proposal, sp = std::move(consSpan)]() {
|
||||
if (auto peer = weak.lock())
|
||||
peer->checkPropose(isTrusted, m, proposal);
|
||||
});
|
||||
@@ -2560,13 +2560,12 @@ PeerImp::onMessage(std::shared_ptr<protocol::TMValidation> const& m)
|
||||
return;
|
||||
}
|
||||
|
||||
// Create a receive span that links to the sender's trace context
|
||||
// (if propagated). shared_ptr keeps it alive across the job boundary.
|
||||
auto span = std::make_shared<telemetry::SpanGuard>(telemetry::validationReceiveSpan(*m));
|
||||
span->setAttribute(telemetry::cons_span::attr::trusted, isTrusted);
|
||||
auto consSpan =
|
||||
std::make_shared<telemetry::SpanGuard>(telemetry::validationReceiveSpan(*m));
|
||||
consSpan->setAttribute(telemetry::cons_span::attr::trusted, isTrusted);
|
||||
if (val->isFieldPresent(sfLedgerSequence))
|
||||
{
|
||||
span->setAttribute(
|
||||
consSpan->setAttribute(
|
||||
telemetry::cons_span::attr::ledgerSeq,
|
||||
static_cast<int64_t>(val->getFieldU32(sfLedgerSequence)));
|
||||
}
|
||||
@@ -2583,7 +2582,7 @@ PeerImp::onMessage(std::shared_ptr<protocol::TMValidation> const& m)
|
||||
app_.getJobQueue().addJob(
|
||||
isTrusted ? jtVALIDATION_t : jtVALIDATION_ut,
|
||||
name,
|
||||
[weak, val, m, key, sp = std::move(span)]() {
|
||||
[weak, val, m, key, sp = std::move(consSpan)]() {
|
||||
if (auto peer = weak.lock())
|
||||
peer->checkValidation(val, key, m);
|
||||
});
|
||||
|
||||
@@ -13,9 +13,7 @@
|
||||
|
||||
#include <xrpl/telemetry/SpanNames.h>
|
||||
|
||||
namespace xrpl {
|
||||
namespace telemetry {
|
||||
namespace peer_span {
|
||||
namespace xrpl::telemetry::peer_span {
|
||||
|
||||
// ===== Span operation suffixes ===============================================
|
||||
|
||||
@@ -45,6 +43,4 @@ inline constexpr auto validationTrusted =
|
||||
join(join(xrplPeer, makeStr("validation")), makeStr("trusted"));
|
||||
} // namespace attr
|
||||
|
||||
} // namespace peer_span
|
||||
} // namespace telemetry
|
||||
} // namespace xrpl
|
||||
} // namespace xrpl::telemetry::peer_span
|
||||
|
||||
Reference in New Issue
Block a user