diff --git a/OpenTelemetryPlan/09-data-collection-reference.md b/OpenTelemetryPlan/09-data-collection-reference.md index ebfb58b7eb..0ce4f363eb 100644 --- a/OpenTelemetryPlan/09-data-collection-reference.md +++ b/OpenTelemetryPlan/09-data-collection-reference.md @@ -42,7 +42,7 @@ graph LR BP -->|"OTLP/gRPC :4317"| D SM -->|"span_calls_total
span_duration_ms
(6 dimension labels)"| E - R2 -->|"rippled_* gauges
rippled_* counters
rippled_* summaries"| E + R2 -->|"xrpld_* gauges
xrpld_* counters
xrpld_* summaries"| E E -->|"Prometheus
data source"| F D -->|"Tempo
data source"| F @@ -400,59 +400,59 @@ These are system-level metrics emitted by xrpld's `beast::insight` framework via [insight] server=statsd address=127.0.0.1:8125 -prefix=rippled +prefix=xrpld ``` -> **Note**: The `prefix` value is user-configurable — all metric names in the tables below assume `prefix=rippled` (matching the integration test and Grafana dashboards). If you change the prefix, replace `rippled_` with `{your_prefix}_` in all PromQL queries. +> **Note**: The `prefix` value is user-configurable — all metric names in the tables below assume `prefix=xrpld` (matching the integration test and Grafana dashboards). If you change the prefix, replace `xrpld_` with `{your_prefix}_` in all PromQL queries. ### 2.1 Gauges -| Prometheus Metric | Source File | Description | Typical Range | -| --------------------------------------------------- | --------------------- | ---------------------------------------- | ------------------------------- | -| `rippled_LedgerMaster_Validated_Ledger_Age` | LedgerMaster.h | Seconds since last validated ledger | 0–10 (healthy), >30 (stale) | -| `rippled_LedgerMaster_Published_Ledger_Age` | LedgerMaster.h | Seconds since last published ledger | 0–10 (healthy) | -| `rippled_State_Accounting_Disconnected_duration` | NetworkOPs.cpp | Cumulative seconds in Disconnected state | Monotonic | -| `rippled_State_Accounting_Connected_duration` | NetworkOPs.cpp | Cumulative seconds in Connected state | Monotonic | -| `rippled_State_Accounting_Syncing_duration` | NetworkOPs.cpp | Cumulative seconds in Syncing state | Monotonic | -| `rippled_State_Accounting_Tracking_duration` | NetworkOPs.cpp | Cumulative seconds in Tracking state | Monotonic | -| `rippled_State_Accounting_Full_duration` | NetworkOPs.cpp | Cumulative seconds in Full state | Monotonic (should dominate) | -| `rippled_State_Accounting_Disconnected_transitions` | NetworkOPs.cpp | Count of transitions to Disconnected | Low | -| `rippled_State_Accounting_Connected_transitions` | NetworkOPs.cpp | Count of transitions to Connected | Low | -| `rippled_State_Accounting_Syncing_transitions` | NetworkOPs.cpp | Count of transitions to Syncing | Low | -| `rippled_State_Accounting_Tracking_transitions` | NetworkOPs.cpp | Count of transitions to Tracking | Low | -| `rippled_State_Accounting_Full_transitions` | NetworkOPs.cpp | Count of transitions to Full | Low (should be 1 after startup) | -| `rippled_Peer_Finder_Active_Inbound_Peers` | PeerfinderManager.cpp | Active inbound peer connections | 0–85 | -| `rippled_Peer_Finder_Active_Outbound_Peers` | PeerfinderManager.cpp | Active outbound peer connections | 10–21 | -| `rippled_Overlay_Peer_Disconnects` | OverlayImpl.cpp | Cumulative peer disconnection count | Low growth | -| `rippled_job_count` | JobQueue.cpp | Current job queue depth | 0–100 (healthy) | -| `rippled_Node_family_full_below_cache_size` | TaggedCache.h | FullBelowCache entry count | Varies | -| `rippled_Node_family_full_below_cache_hit_rate` | TaggedCache.h | FullBelowCache hit rate percentage | 0–100 | +| Prometheus Metric | Source File | Description | Typical Range | +| ------------------------------------------------- | --------------------- | ---------------------------------------- | ------------------------------- | +| `xrpld_LedgerMaster_Validated_Ledger_Age` | LedgerMaster.h | Seconds since last validated ledger | 0–10 (healthy), >30 (stale) | +| `xrpld_LedgerMaster_Published_Ledger_Age` | LedgerMaster.h | Seconds since last published ledger | 0–10 (healthy) | +| `xrpld_State_Accounting_Disconnected_duration` | NetworkOPs.cpp | Cumulative seconds in Disconnected state | Monotonic | +| `xrpld_State_Accounting_Connected_duration` | NetworkOPs.cpp | Cumulative seconds in Connected state | Monotonic | +| `xrpld_State_Accounting_Syncing_duration` | NetworkOPs.cpp | Cumulative seconds in Syncing state | Monotonic | +| `xrpld_State_Accounting_Tracking_duration` | NetworkOPs.cpp | Cumulative seconds in Tracking state | Monotonic | +| `xrpld_State_Accounting_Full_duration` | NetworkOPs.cpp | Cumulative seconds in Full state | Monotonic (should dominate) | +| `xrpld_State_Accounting_Disconnected_transitions` | NetworkOPs.cpp | Count of transitions to Disconnected | Low | +| `xrpld_State_Accounting_Connected_transitions` | NetworkOPs.cpp | Count of transitions to Connected | Low | +| `xrpld_State_Accounting_Syncing_transitions` | NetworkOPs.cpp | Count of transitions to Syncing | Low | +| `xrpld_State_Accounting_Tracking_transitions` | NetworkOPs.cpp | Count of transitions to Tracking | Low | +| `xrpld_State_Accounting_Full_transitions` | NetworkOPs.cpp | Count of transitions to Full | Low (should be 1 after startup) | +| `xrpld_Peer_Finder_Active_Inbound_Peers` | PeerfinderManager.cpp | Active inbound peer connections | 0–85 | +| `xrpld_Peer_Finder_Active_Outbound_Peers` | PeerfinderManager.cpp | Active outbound peer connections | 10–21 | +| `xrpld_Overlay_Peer_Disconnects` | OverlayImpl.cpp | Cumulative peer disconnection count | Low growth | +| `xrpld_job_count` | JobQueue.cpp | Current job queue depth | 0–100 (healthy) | +| `xrpld_Node_family_full_below_cache_size` | TaggedCache.h | FullBelowCache entry count | Varies | +| `xrpld_Node_family_full_below_cache_hit_rate` | TaggedCache.h | FullBelowCache hit rate percentage | 0–100 | **Grafana dashboard**: _Node Health (StatsD)_ (`xrpld-statsd-node-health`) ### 2.2 Counters -| Prometheus Metric | Source File | Description | -| --------------------------------- | ------------------ | --------------------------------------------- | -| `rippled_rpc_requests` | ServerHandler.cpp | Total RPC requests received | -| `rippled_ledger_fetches` | InboundLedgers.cpp | Inbound ledger fetch attempts | -| `rippled_ledger_history_mismatch` | LedgerHistory.cpp | Ledger hash mismatches detected | -| `rippled_warn` | Logic.h | Resource manager warnings issued | -| `rippled_drop` | Logic.h | Resource manager drops (connections rejected) | +| Prometheus Metric | Source File | Description | +| ------------------------------- | ------------------ | --------------------------------------------- | +| `xrpld_rpc_requests` | ServerHandler.cpp | Total RPC requests received | +| `xrpld_ledger_fetches` | InboundLedgers.cpp | Inbound ledger fetch attempts | +| `xrpld_ledger_history_mismatch` | LedgerHistory.cpp | Ledger hash mismatches detected | +| `xrpld_warn` | Logic.h | Resource manager warnings issued | +| `xrpld_drop` | Logic.h | Resource manager drops (connections rejected) | -**Note**: `rippled_warn` and `rippled_drop` use non-standard StatsD meter type (`|m`). The OTel StatsD receiver only recognizes `|c`, `|g`, `|ms`, `|h`, `|s` — these metrics may be silently dropped. See Known Issues below. +**Note**: `xrpld_warn` and `xrpld_drop` use non-standard StatsD meter type (`|m`). The OTel StatsD receiver only recognizes `|c`, `|g`, `|ms`, `|h`, `|s` — these metrics may be silently dropped. See Known Issues below. **Grafana dashboard**: _RPC & Pathfinding (StatsD)_ (`xrpld-statsd-rpc`) ### 2.3 Histograms (from StatsD timers) -| Prometheus Metric | Source File | Unit | Description | -| ----------------------- | ----------------- | ----- | ------------------------------ | -| `rippled_rpc_time` | ServerHandler.cpp | ms | RPC response time distribution | -| `rippled_rpc_size` | ServerHandler.cpp | bytes | RPC response size distribution | -| `rippled_ios_latency` | Application.cpp | ms | I/O service loop latency | -| `rippled_pathfind_fast` | PathRequests.h | ms | Fast pathfinding duration | -| `rippled_pathfind_full` | PathRequests.h | ms | Full pathfinding duration | +| Prometheus Metric | Source File | Unit | Description | +| --------------------- | ----------------- | ----- | ------------------------------ | +| `xrpld_rpc_time` | ServerHandler.cpp | ms | RPC response time distribution | +| `xrpld_rpc_size` | ServerHandler.cpp | bytes | RPC response size distribution | +| `xrpld_ios_latency` | Application.cpp | ms | I/O service loop latency | +| `xrpld_pathfind_fast` | PathRequests.h | ms | Fast pathfinding duration | +| `xrpld_pathfind_full` | PathRequests.h | ms | Full pathfinding duration | Quantiles collected: 0th, 50th, 90th, 95th, 99th, 100th percentile. @@ -462,10 +462,10 @@ Quantiles collected: 0th, 50th, 90th, 95th, 99th, 100th percentile. For each of the 45+ overlay traffic categories (defined in `TrafficCount.h`), four gauges are emitted: -- `rippled_{category}_Bytes_In` -- `rippled_{category}_Bytes_Out` -- `rippled_{category}_Messages_In` -- `rippled_{category}_Messages_Out` +- `xrpld_{category}_Bytes_In` +- `xrpld_{category}_Bytes_Out` +- `xrpld_{category}_Messages_In` +- `xrpld_{category}_Messages_Out` **Key categories**: @@ -490,8 +490,8 @@ For each of the 45+ overlay traffic categories (defined in `TrafficCount.h`), fo For each of the 36 non-special job types (defined in `JobTypes.h`), two StatsD timer events are emitted: -- `rippled_{jobName}` — execution duration -- `rippled_{jobName}_q` — dequeue wait time +- `xrpld_{jobName}` — execution duration +- `xrpld_{jobName}_q` — dequeue wait time These produce summary metrics with quantiles (0th, 50th, 90th, 95th, 99th, 100th). @@ -646,19 +646,19 @@ sum by (xrpl_peer_proposal_trusted) (rate(traces_span_metrics_calls_total{span_n ```promql # Validated ledger age (should be < 10s) -rippled_LedgerMaster_Validated_Ledger_Age +xrpld_LedgerMaster_Validated_Ledger_Age # Active peer count -rippled_Peer_Finder_Active_Inbound_Peers + rippled_Peer_Finder_Active_Outbound_Peers +xrpld_Peer_Finder_Active_Inbound_Peers + xrpld_Peer_Finder_Active_Outbound_Peers # RPC response time p95 -histogram_quantile(0.95, rippled_rpc_time_bucket) +histogram_quantile(0.95, xrpld_rpc_time_bucket) # Total network bytes in (rate) -rate(rippled_total_Bytes_In[5m]) +rate(xrpld_total_Bytes_In[5m]) # Operating mode (should be "Full" after startup) -rippled_State_Accounting_Full_duration +xrpld_State_Accounting_Full_duration ``` --- @@ -687,8 +687,8 @@ All span names and attributes are defined as compile-time constants in colocated | Issue | Impact | Status | | ------------------------------------------------------------------ | ------------------------------------------------ | -------------------------------------------------------------------- | | `warn` and `drop` metrics use non-standard StatsD `\|m` meter type | Metrics silently dropped by OTel StatsD receiver | Phase 6 Task 6.1 — needs `\|m` → `\|c` change in StatsDCollector.cpp | -| `rippled_job_count` may not emit in standalone mode | Missing from Prometheus in some test configs | Requires active job queue activity | -| `rippled_rpc_requests` depends on `[insight]` config | Zero series if StatsD not configured | Requires `[insight] server=statsd` in xrpld.cfg | +| `xrpld_job_count` may not emit in standalone mode | Missing from Prometheus in some test configs | Requires active job queue activity | +| `xrpld_rpc_requests` depends on `[insight]` config | Zero series if StatsD not configured | Requires `[insight] server=statsd` in xrpld.cfg | | Peer tracing disabled by default | No `peer.*` spans unless `trace_peer=1` | Intentional — high volume on mainnet | --- @@ -720,7 +720,7 @@ enabled=1 [insight] server=statsd address=127.0.0.1:8125 -prefix=rippled +prefix=xrpld ``` ### Production Setup @@ -737,7 +737,7 @@ max_queue_size=4096 [insight] server=statsd address=otel-collector:8125 -prefix=rippled +prefix=xrpld ``` ### Trace Category Toggle diff --git a/docs/telemetry-runbook.md b/docs/telemetry-runbook.md index d159d44a2f..7429a6cf13 100644 --- a/docs/telemetry-runbook.md +++ b/docs/telemetry-runbook.md @@ -288,7 +288,7 @@ Add to `xrpld.cfg`: [insight] server=statsd address=127.0.0.1:8125 -prefix=rippled +prefix=xrpld ``` The OTel Collector receives these via a `statsd` receiver on UDP port 8125 and exports them to Prometheus alongside spanmetrics. @@ -297,38 +297,38 @@ The OTel Collector receives these via a `statsd` receiver on UDP port 8125 and e #### Gauges -| Prometheus Metric | Source | Description | -| --------------------------------------------- | ------------------------- | -------------------------------------------------------------------------- | -| `rippled_LedgerMaster_Validated_Ledger_Age` | LedgerMaster.h:373 | Age of validated ledger (seconds) | -| `rippled_LedgerMaster_Published_Ledger_Age` | LedgerMaster.h:374 | Age of published ledger (seconds) | -| `rippled_State_Accounting_{Mode}_duration` | NetworkOPs.cpp:774 | Time in each operating mode (Disconnected/Connected/Syncing/Tracking/Full) | -| `rippled_State_Accounting_{Mode}_transitions` | NetworkOPs.cpp:780 | Transition count per mode | -| `rippled_Peer_Finder_Active_Inbound_Peers` | PeerfinderManager.cpp:214 | Active inbound peer connections | -| `rippled_Peer_Finder_Active_Outbound_Peers` | PeerfinderManager.cpp:215 | Active outbound peer connections | -| `rippled_Overlay_Peer_Disconnects` | OverlayImpl.h:557 | Peer disconnect count | -| `rippled_job_count` | JobQueue.cpp:26 | Current job queue depth | -| `rippled_{category}_Bytes_In/Out` | OverlayImpl.h:535 | Overlay traffic bytes per category (57 categories) | -| `rippled_{category}_Messages_In/Out` | OverlayImpl.h:535 | Overlay traffic messages per category | +| Prometheus Metric | Source | Description | +| ------------------------------------------- | ------------------------- | -------------------------------------------------------------------------- | +| `xrpld_LedgerMaster_Validated_Ledger_Age` | LedgerMaster.h:373 | Age of validated ledger (seconds) | +| `xrpld_LedgerMaster_Published_Ledger_Age` | LedgerMaster.h:374 | Age of published ledger (seconds) | +| `xrpld_State_Accounting_{Mode}_duration` | NetworkOPs.cpp:774 | Time in each operating mode (Disconnected/Connected/Syncing/Tracking/Full) | +| `xrpld_State_Accounting_{Mode}_transitions` | NetworkOPs.cpp:780 | Transition count per mode | +| `xrpld_Peer_Finder_Active_Inbound_Peers` | PeerfinderManager.cpp:214 | Active inbound peer connections | +| `xrpld_Peer_Finder_Active_Outbound_Peers` | PeerfinderManager.cpp:215 | Active outbound peer connections | +| `xrpld_Overlay_Peer_Disconnects` | OverlayImpl.h:557 | Peer disconnect count | +| `xrpld_job_count` | JobQueue.cpp:26 | Current job queue depth | +| `xrpld_{category}_Bytes_In/Out` | OverlayImpl.h:535 | Overlay traffic bytes per category (57 categories) | +| `xrpld_{category}_Messages_In/Out` | OverlayImpl.h:535 | Overlay traffic messages per category | #### Counters -| Prometheus Metric | Source | Description | -| --------------------------------- | --------------------- | ------------------------------ | -| `rippled_rpc_requests` | ServerHandler.cpp:108 | Total RPC request count | -| `rippled_ledger_fetches` | InboundLedgers.cpp:44 | Ledger fetch request count | -| `rippled_ledger_history_mismatch` | LedgerHistory.cpp:16 | Ledger hash mismatch count | -| `rippled_warn` | Logic.h:33 | Resource manager warning count | -| `rippled_drop` | Logic.h:34 | Resource manager drop count | +| Prometheus Metric | Source | Description | +| ------------------------------- | --------------------- | ------------------------------ | +| `xrpld_rpc_requests` | ServerHandler.cpp:108 | Total RPC request count | +| `xrpld_ledger_fetches` | InboundLedgers.cpp:44 | Ledger fetch request count | +| `xrpld_ledger_history_mismatch` | LedgerHistory.cpp:16 | Ledger hash mismatch count | +| `xrpld_warn` | Logic.h:33 | Resource manager warning count | +| `xrpld_drop` | Logic.h:34 | Resource manager drop count | #### Histograms (from StatsD timers) -| Prometheus Metric | Source | Description | -| ----------------------- | --------------------- | ------------------------------ | -| `rippled_rpc_time` | ServerHandler.cpp:110 | RPC response time (ms) | -| `rippled_rpc_size` | ServerHandler.cpp:109 | RPC response size (bytes) | -| `rippled_ios_latency` | Application.cpp:438 | I/O service loop latency (ms) | -| `rippled_pathfind_fast` | PathRequests.h:23 | Fast pathfinding duration (ms) | -| `rippled_pathfind_full` | PathRequests.h:24 | Full pathfinding duration (ms) | +| Prometheus Metric | Source | Description | +| --------------------- | --------------------- | ------------------------------ | +| `xrpld_rpc_time` | ServerHandler.cpp:110 | RPC response time (ms) | +| `xrpld_rpc_size` | ServerHandler.cpp:109 | RPC response size (bytes) | +| `xrpld_ios_latency` | Application.cpp:438 | I/O service loop latency (ms) | +| `xrpld_pathfind_fast` | PathRequests.h:23 | Fast pathfinding duration (ms) | +| `xrpld_pathfind_full` | PathRequests.h:24 | Full pathfinding duration (ms) | ## Grafana Dashboards @@ -401,52 +401,52 @@ Requires `trace_peer=1` in the `[telemetry]` config section. ### Node Health -- StatsD (`xrpld-statsd-node-health`) -| Panel | Type | PromQL | Labels Used | -| -------------------------------------- | ---------- | ----------------------------------------------------------------- | ----------- | -| Validated Ledger Age | stat | `rippled_LedgerMaster_Validated_Ledger_Age` | — | -| Published Ledger Age | stat | `rippled_LedgerMaster_Published_Ledger_Age` | — | -| Operating Mode Duration | timeseries | `rippled_State_Accounting_*_duration` | — | -| Operating Mode Transitions | timeseries | `rippled_State_Accounting_*_transitions` | — | -| I/O Latency | timeseries | `histogram_quantile(0.95, rippled_ios_latency_bucket)` | — | -| Job Queue Depth | timeseries | `rippled_job_count` | — | -| Ledger Fetch Rate | stat | `rate(rippled_ledger_fetches[5m])` | — | -| Ledger History Mismatches | stat | `rate(rippled_ledger_history_mismatch[5m])` | — | -| Key Jobs Execution Time | timeseries | `rippled_acceptLedger{quantile="$quantile"}` (+ 10 more key jobs) | `quantile` | -| Key Jobs Dequeue Wait Time | timeseries | `rippled_acceptLedger_q{quantile="$quantile"}` (+ 10 more) | `quantile` | -| FullBelowCache Size | timeseries | `rippled_Node_family_full_below_cache_size` | — | -| FullBelowCache Hit Rate | gauge | `rippled_Node_family_full_below_cache_hit_rate` | — | -| Ledger Publish Gap | stat | `Published_Ledger_Age - Validated_Ledger_Age` | — | -| State Duration Rate (Full vs Tracking) | timeseries | `rate(rippled_State_Accounting_Full_duration[5m]) / 1000000` | — | -| All Jobs Execution Time (Detail) | timeseries | `{__name__=~"rippled_", quantile="$quantile"}` | `quantile` | -| All Jobs Dequeue Wait (Detail) | timeseries | `{__name__=~"rippled__q", quantile="$quantile"}` | `quantile` | +| Panel | Type | PromQL | Labels Used | +| -------------------------------------- | ---------- | --------------------------------------------------------------- | ----------- | +| Validated Ledger Age | stat | `xrpld_LedgerMaster_Validated_Ledger_Age` | — | +| Published Ledger Age | stat | `xrpld_LedgerMaster_Published_Ledger_Age` | — | +| Operating Mode Duration | timeseries | `xrpld_State_Accounting_*_duration` | — | +| Operating Mode Transitions | timeseries | `xrpld_State_Accounting_*_transitions` | — | +| I/O Latency | timeseries | `histogram_quantile(0.95, xrpld_ios_latency_bucket)` | — | +| Job Queue Depth | timeseries | `xrpld_job_count` | — | +| Ledger Fetch Rate | stat | `rate(xrpld_ledger_fetches[5m])` | — | +| Ledger History Mismatches | stat | `rate(xrpld_ledger_history_mismatch[5m])` | — | +| Key Jobs Execution Time | timeseries | `xrpld_acceptLedger{quantile="$quantile"}` (+ 10 more key jobs) | `quantile` | +| Key Jobs Dequeue Wait Time | timeseries | `xrpld_acceptLedger_q{quantile="$quantile"}` (+ 10 more) | `quantile` | +| FullBelowCache Size | timeseries | `xrpld_Node_family_full_below_cache_size` | — | +| FullBelowCache Hit Rate | gauge | `xrpld_Node_family_full_below_cache_hit_rate` | — | +| Ledger Publish Gap | stat | `Published_Ledger_Age - Validated_Ledger_Age` | — | +| State Duration Rate (Full vs Tracking) | timeseries | `rate(xrpld_State_Accounting_Full_duration[5m]) / 1000000` | — | +| All Jobs Execution Time (Detail) | timeseries | `{__name__=~"xrpld_", quantile="$quantile"}` | `quantile` | +| All Jobs Dequeue Wait (Detail) | timeseries | `{__name__=~"xrpld__q", quantile="$quantile"}` | `quantile` | ### Network Traffic -- StatsD (`xrpld-statsd-network`) -| Panel | Type | PromQL | Labels Used | -| ------------------------------------ | ---------- | -------------------------------------------- | ----------- | -| Active Peers | timeseries | `rippled_Peer_Finder_Active_*_Peers` | — | -| Peer Disconnects | timeseries | `rippled_Overlay_Peer_Disconnects` | — | -| Total Network Bytes | timeseries | `rate(rippled_total_Bytes_In/Out[5m])` | — | -| Total Network Messages | timeseries | `rippled_total_Messages_In/Out` | — | -| Transaction Traffic | timeseries | `rippled_transactions_Messages_In/Out` | — | -| Proposal Traffic | timeseries | `rippled_proposals_Messages_In/Out` | — | -| Validation Traffic | timeseries | `rippled_validations_Messages_In/Out` | — | -| Traffic by Category | bargauge | `topk(10, rippled_*_Bytes_In)` | — | -| Duplicate Traffic (Wasted Bandwidth) | timeseries | `rate(rippled_*_duplicate_Bytes_In/Out[5m])` | — | -| All Traffic Categories (Detail) | timeseries | `topk(15, rate(rippled_*_Bytes_In[5m]))` | — | +| Panel | Type | PromQL | Labels Used | +| ------------------------------------ | ---------- | ------------------------------------------ | ----------- | +| Active Peers | timeseries | `xrpld_Peer_Finder_Active_*_Peers` | — | +| Peer Disconnects | timeseries | `xrpld_Overlay_Peer_Disconnects` | — | +| Total Network Bytes | timeseries | `rate(xrpld_total_Bytes_In/Out[5m])` | — | +| Total Network Messages | timeseries | `xrpld_total_Messages_In/Out` | — | +| Transaction Traffic | timeseries | `xrpld_transactions_Messages_In/Out` | — | +| Proposal Traffic | timeseries | `xrpld_proposals_Messages_In/Out` | — | +| Validation Traffic | timeseries | `xrpld_validations_Messages_In/Out` | — | +| Traffic by Category | bargauge | `topk(10, xrpld_*_Bytes_In)` | — | +| Duplicate Traffic (Wasted Bandwidth) | timeseries | `rate(xrpld_*_duplicate_Bytes_In/Out[5m])` | — | +| All Traffic Categories (Detail) | timeseries | `topk(15, rate(xrpld_*_Bytes_In[5m]))` | — | ### RPC & Pathfinding -- StatsD (`xrpld-statsd-rpc`) -| Panel | Type | PromQL | Labels Used | -| ------------------------- | ---------- | -------------------------------------------------------- | ----------- | -| RPC Request Rate | stat | `rate(rippled_rpc_requests[5m])` | — | -| RPC Response Time | timeseries | `histogram_quantile(0.95, rippled_rpc_time_bucket)` | — | -| RPC Response Size | timeseries | `histogram_quantile(0.95, rippled_rpc_size_bucket)` | — | -| RPC Response Time Heatmap | heatmap | `rippled_rpc_time_bucket` | — | -| Pathfinding Fast Duration | timeseries | `histogram_quantile(0.95, rippled_pathfind_fast_bucket)` | — | -| Pathfinding Full Duration | timeseries | `histogram_quantile(0.95, rippled_pathfind_full_bucket)` | — | -| Resource Warnings Rate | stat | `rate(rippled_warn[5m])` | — | -| Resource Drops Rate | stat | `rate(rippled_drop[5m])` | — | +| Panel | Type | PromQL | Labels Used | +| ------------------------- | ---------- | ------------------------------------------------------ | ----------- | +| RPC Request Rate | stat | `rate(xrpld_rpc_requests[5m])` | — | +| RPC Response Time | timeseries | `histogram_quantile(0.95, xrpld_rpc_time_bucket)` | — | +| RPC Response Size | timeseries | `histogram_quantile(0.95, xrpld_rpc_size_bucket)` | — | +| RPC Response Time Heatmap | heatmap | `xrpld_rpc_time_bucket` | — | +| Pathfinding Fast Duration | timeseries | `histogram_quantile(0.95, xrpld_pathfind_fast_bucket)` | — | +| Pathfinding Full Duration | timeseries | `histogram_quantile(0.95, xrpld_pathfind_full_bucket)` | — | +| Resource Warnings Rate | stat | `rate(xrpld_warn[5m])` | — | +| Resource Drops Rate | stat | `rate(xrpld_drop[5m])` | — | ### Span → Metric → Dashboard Summary diff --git a/src/xrpld/app/ledger/detail/BuildLedger.cpp b/src/xrpld/app/ledger/detail/BuildLedger.cpp index d7221e2c21..95f72bde15 100644 --- a/src/xrpld/app/ledger/detail/BuildLedger.cpp +++ b/src/xrpld/app/ledger/detail/BuildLedger.cpp @@ -14,13 +14,15 @@ #include #include #include -#include #include #include #include +#include #include +#include #include +#include #include #include #include diff --git a/src/xrpld/app/ledger/detail/LedgerMaster.cpp b/src/xrpld/app/ledger/detail/LedgerMaster.cpp index df62dc36f1..0305ce7c4e 100644 --- a/src/xrpld/app/ledger/detail/LedgerMaster.cpp +++ b/src/xrpld/app/ledger/detail/LedgerMaster.cpp @@ -57,6 +57,7 @@ #include #include #include +#include #include diff --git a/src/xrpld/app/ledger/detail/LedgerSpanNames.h b/src/xrpld/app/ledger/detail/LedgerSpanNames.h index f6b5af6c51..4d24a60b2e 100644 --- a/src/xrpld/app/ledger/detail/LedgerSpanNames.h +++ b/src/xrpld/app/ledger/detail/LedgerSpanNames.h @@ -15,9 +15,7 @@ #include -namespace xrpl { -namespace telemetry { -namespace ledger_span { +namespace xrpl::telemetry::ledger_span { // ===== Span operation suffixes =============================================== @@ -49,6 +47,4 @@ inline constexpr auto txFailed = join(xrplLedger, makeStr("tx_failed")); inline constexpr auto validations = join(xrplLedger, makeStr("validations")); } // namespace attr -} // namespace ledger_span -} // namespace telemetry -} // namespace xrpl +} // namespace xrpl::telemetry::ledger_span diff --git a/src/xrpld/overlay/detail/PeerImp.cpp b/src/xrpld/overlay/detail/PeerImp.cpp index ef3d456055..888aa1b8cc 100644 --- a/src/xrpld/overlay/detail/PeerImp.cpp +++ b/src/xrpld/overlay/detail/PeerImp.cpp @@ -68,6 +68,7 @@ #include #include #include +#include #include #include @@ -1966,17 +1967,16 @@ PeerImp::onMessage(std::shared_ptr const& m) app_.getTimeKeeper().closeTime(), calcNodeID(app_.getValidatorManifests().getMasterKey(publicKey))}); - // Create a receive span that links to the sender's trace context - // (if propagated). shared_ptr keeps it alive across the job boundary. - auto span = std::make_shared(telemetry::proposalReceiveSpan(set)); - span->setAttribute(telemetry::cons_span::attr::trusted, isTrusted); - span->setAttribute(telemetry::cons_span::attr::round, static_cast(set.proposeseq())); + auto consSpan = std::make_shared(telemetry::proposalReceiveSpan(set)); + consSpan->setAttribute(telemetry::cons_span::attr::trusted, isTrusted); + consSpan->setAttribute( + telemetry::cons_span::attr::round, static_cast(set.proposeseq())); std::weak_ptr const weak = shared_from_this(); app_.getJobQueue().addJob( isTrusted ? jtPROPOSAL_t : jtPROPOSAL_ut, "checkPropose", - [weak, isTrusted, m, proposal, sp = std::move(span)]() { + [weak, isTrusted, m, proposal, sp = std::move(consSpan)]() { if (auto peer = weak.lock()) peer->checkPropose(isTrusted, m, proposal); }); @@ -2560,13 +2560,12 @@ PeerImp::onMessage(std::shared_ptr const& m) return; } - // Create a receive span that links to the sender's trace context - // (if propagated). shared_ptr keeps it alive across the job boundary. - auto span = std::make_shared(telemetry::validationReceiveSpan(*m)); - span->setAttribute(telemetry::cons_span::attr::trusted, isTrusted); + auto consSpan = + std::make_shared(telemetry::validationReceiveSpan(*m)); + consSpan->setAttribute(telemetry::cons_span::attr::trusted, isTrusted); if (val->isFieldPresent(sfLedgerSequence)) { - span->setAttribute( + consSpan->setAttribute( telemetry::cons_span::attr::ledgerSeq, static_cast(val->getFieldU32(sfLedgerSequence))); } @@ -2583,7 +2582,7 @@ PeerImp::onMessage(std::shared_ptr const& m) app_.getJobQueue().addJob( isTrusted ? jtVALIDATION_t : jtVALIDATION_ut, name, - [weak, val, m, key, sp = std::move(span)]() { + [weak, val, m, key, sp = std::move(consSpan)]() { if (auto peer = weak.lock()) peer->checkValidation(val, key, m); }); diff --git a/src/xrpld/overlay/detail/PeerSpanNames.h b/src/xrpld/overlay/detail/PeerSpanNames.h index cbeeed528b..9697ea3fa4 100644 --- a/src/xrpld/overlay/detail/PeerSpanNames.h +++ b/src/xrpld/overlay/detail/PeerSpanNames.h @@ -13,9 +13,7 @@ #include -namespace xrpl { -namespace telemetry { -namespace peer_span { +namespace xrpl::telemetry::peer_span { // ===== Span operation suffixes =============================================== @@ -45,6 +43,4 @@ inline constexpr auto validationTrusted = join(join(xrplPeer, makeStr("validation")), makeStr("trusted")); } // namespace attr -} // namespace peer_span -} // namespace telemetry -} // namespace xrpl +} // namespace xrpl::telemetry::peer_span