From b65f91117f00e0a0e45686f8589f60ef38373f02 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Wed, 29 Apr 2026 20:03:22 +0100 Subject: [PATCH] fix: address CI checks (prettier, docs.sh rename, levelization) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Prettier formatting for markdown docs and OTelCollector header - docs.sh rippled→xrpld renames in OTelCollector.cpp comments/strings - Updated levelization ordering with new dependency edges Co-Authored-By: Claude Opus 4.6 (1M context) --- .../scripts/levelization/results/ordering.txt | 3 + .../09-data-collection-reference.md | 190 ++++----- OpenTelemetryPlan/Phase7_taskList.md | 62 +-- ...-03-30-external-dashboard-parity-design.md | 381 +++++++++--------- docs/telemetry-runbook.md | 134 +++--- include/xrpl/beast/insight/OTelCollector.h | 2 +- src/libxrpl/beast/insight/OTelCollector.cpp | 23 +- 7 files changed, 408 insertions(+), 387 deletions(-) diff --git a/.github/scripts/levelization/results/ordering.txt b/.github/scripts/levelization/results/ordering.txt index 775645a53b..5a2307b1be 100644 --- a/.github/scripts/levelization/results/ordering.txt +++ b/.github/scripts/levelization/results/ordering.txt @@ -103,6 +103,7 @@ test.csf > xrpld.consensus test.csf > xrpl.json test.csf > xrpl.ledger test.csf > xrpl.protocol +test.csf > xrpl.telemetry test.json > test.jtx test.json > xrpl.json test.jtx > xrpl.basics @@ -190,6 +191,7 @@ test.toplevel > xrpl.json test.unit_test > xrpl.basics test.unit_test > xrpl.protocol tests.libxrpl > xrpl.basics +tests.libxrpl > xrpld.telemetry tests.libxrpl > xrpl.json tests.libxrpl > xrpl.net tests.libxrpl > xrpl.protocol @@ -300,4 +302,5 @@ xrpld.shamap > xrpld.core xrpld.shamap > xrpl.protocol xrpld.shamap > xrpl.shamap xrpld.telemetry > xrpl.basics +xrpld.telemetry > xrpl.protocol xrpld.telemetry > xrpl.telemetry diff --git a/OpenTelemetryPlan/09-data-collection-reference.md 
b/OpenTelemetryPlan/09-data-collection-reference.md index 4a5807f884..33f9e7810d 100644 --- a/OpenTelemetryPlan/09-data-collection-reference.md +++ b/OpenTelemetryPlan/09-data-collection-reference.md @@ -1,6 +1,6 @@ # Observability Data Collection Reference -> **Audience**: Developers and operators. This is the single source of truth for all telemetry data collected by rippled's observability stack. +> **Audience**: Developers and operators. This is the single source of truth for all telemetry data collected by xrpld's observability stack. > > **Related docs**: [docs/telemetry-runbook.md](../docs/telemetry-runbook.md) (operator runbook with alerting and troubleshooting) | [03-implementation-strategy.md](./03-implementation-strategy.md) (code structure and performance optimization) | [04-code-samples.md](./04-code-samples.md) (C++ instrumentation examples) @@ -8,7 +8,7 @@ ```mermaid graph LR - subgraph rippledNode["rippled Node"] + subgraph xrpldNode["xrpld Node"] A["Trace Macros
XRPL_TRACE_SPAN
(OTLP/HTTP exporter)"] B["beast::insight
OTel native metrics
(OTLP/HTTP exporter)"] end @@ -41,7 +41,7 @@ graph LR BP -->|"OTLP/gRPC :4317"| D SM -->|"span_calls_total
span_duration_ms
(6 dimension labels)"| E - R1 -->|"rippled_* gauges
rippled_* counters
rippled_* histograms"| E + R1 -->|"xrpld_* gauges
xrpld_* counters
xrpld_* histograms"| E E -->|"Prometheus
data source"| F D -->|"Tempo
data source"| F @@ -54,7 +54,7 @@ graph LR style D fill:#f0ad4e,color:#000,stroke:#c78c2e style E fill:#f0ad4e,color:#000,stroke:#c78c2e style F fill:#5bc0de,color:#000,stroke:#3aa8c1 - style rippledNode fill:#1a2633,color:#ccc,stroke:#4a90d9 + style xrpldNode fill:#1a2633,color:#ccc,stroke:#4a90d9 style collector fill:#1a3320,color:#ccc,stroke:#5cb85c style backends fill:#332a1a,color:#ccc,stroke:#f0ad4e style metrics fill:#332a1a,color:#ccc,stroke:#f0ad4e @@ -91,9 +91,9 @@ Controlled by `trace_rpc=1` in `[telemetry]` config. | `rpc.ws_message` | — | ServerHandler.cpp | WebSocket message handling | | `rpc.command.` | `rpc.process` | RPCHandler.cpp | Per-command span (e.g., `rpc.command.server_info`, `rpc.command.ledger`) | -**Where to find**: Tempo → TraceQL: `{resource.service.name="rippled" && name=~"rpc.request|rpc.command.*"}` +**Where to find**: Tempo → TraceQL: `{resource.service.name="xrpld" && name=~"rpc.request|rpc.command.*"}` -**Grafana dashboard**: _RPC Performance_ (`rippled-rpc-perf`) +**Grafana dashboard**: _RPC Performance_ (`xrpld-rpc-perf`) #### Transaction Spans @@ -105,9 +105,9 @@ Controlled by `trace_transactions=1` in `[telemetry]` config. | `tx.receive` | — | PeerImp.cpp | Raw transaction received from peer overlay (before deduplication) | | `tx.apply` | `ledger.build` | BuildLedger.cpp | Transaction set applied to new ledger during consensus | -**Where to find**: Tempo → TraceQL: `{resource.service.name="rippled" && name=~"tx.process|tx.receive"}` +**Where to find**: Tempo → TraceQL: `{resource.service.name="xrpld" && name=~"tx.process|tx.receive"}` -**Grafana dashboard**: _Transaction Overview_ (`rippled-transactions`) +**Grafana dashboard**: _Transaction Overview_ (`xrpld-transactions`) #### Consensus Spans @@ -121,9 +121,9 @@ Controlled by `trace_consensus=1` in `[telemetry]` config. 
| `consensus.validation.send` | — | RCLConsensus.cpp | Validation message sent after ledger accepted | | `consensus.accept.apply` | — | RCLConsensus.cpp | Ledger application with close time details | -**Where to find**: Tempo → TraceQL: `{resource.service.name="rippled" && name=~"consensus.*"}` +**Where to find**: Tempo → TraceQL: `{resource.service.name="xrpld" && name=~"consensus.*"}` -**Grafana dashboard**: _Consensus Health_ (`rippled-consensus`) +**Grafana dashboard**: _Consensus Health_ (`xrpld-consensus`) #### Ledger Spans @@ -135,9 +135,9 @@ Controlled by `trace_ledger=1` in `[telemetry]` config. | `ledger.validate` | — | LedgerMaster.cpp | Ledger promoted to validated status | | `ledger.store` | — | LedgerMaster.cpp | Ledger stored to database/history | -**Where to find**: Tempo → TraceQL: `{resource.service.name="rippled" && name=~"ledger.*"}` +**Where to find**: Tempo → TraceQL: `{resource.service.name="xrpld" && name=~"ledger.*"}` -**Grafana dashboard**: _Ledger Operations_ (`rippled-ledger-ops`) +**Grafana dashboard**: _Ledger Operations_ (`xrpld-ledger-ops`) #### Peer Spans @@ -148,9 +148,9 @@ Controlled by `trace_peer=1` in `[telemetry]` config. **Disabled by default** (h | `peer.proposal.receive` | — | PeerImp.cpp | Consensus proposal received from peer | | `peer.validation.receive` | — | PeerImp.cpp | Validation message received from peer | -**Where to find**: Tempo → TraceQL: `{resource.service.name="rippled" && name=~"peer.*"}` +**Where to find**: Tempo → TraceQL: `{resource.service.name="xrpld" && name=~"peer.*"}` -**Grafana dashboard**: _Peer Network_ (`rippled-peer-net`) +**Grafana dashboard**: _Peer Network_ (`xrpld-peer-net`) --- @@ -235,7 +235,7 @@ Every span can carry key-value attributes that provide context for filtering and > **See also**: [01-architecture-analysis.md](./01-architecture-analysis.md) §1.8.2 for how span-derived metrics map to operational insights. 
-The OTel Collector's SpanMetrics connector automatically generates RED (Rate, Errors, Duration) metrics from every span. No custom metrics code in rippled is needed. +The OTel Collector's SpanMetrics connector automatically generates RED (Rate, Errors, Duration) metrics from every span. No custom metrics code in xrpld is needed. | Prometheus Metric | Type | Description | | -------------------------------------------------- | --------- | ------------------------------------------------------------------------------ | @@ -267,7 +267,7 @@ The OTel Collector's SpanMetrics connector automatically generates RED (Rate, Er > > **Migration complete**: Phase 7 replaced the StatsD UDP transport with native OTel Metrics SDK export via OTLP/HTTP. The `beast::insight::Collector` interface and all metric names are preserved — only the wire protocol changed. `[insight] server=statsd` remains as a fallback. -These are system-level metrics emitted by rippled's `beast::insight` framework via OTel OTLP/HTTP. They cover operational data that doesn't map to individual trace spans. +These are system-level metrics emitted by xrpld's `beast::insight` framework via OTel OTLP/HTTP. They cover operational data that doesn't map to individual trace spans. 
### Configuration @@ -276,7 +276,7 @@ These are system-level metrics emitted by rippled's `beast::insight` framework v [insight] server=otel endpoint=http://localhost:4318/v1/metrics -prefix=rippled +prefix=xrpld ``` Fallback (StatsD): @@ -285,56 +285,56 @@ Fallback (StatsD): [insight] server=statsd address=127.0.0.1:8125 -prefix=rippled +prefix=xrpld ``` ### 2.1 Gauges -| Prometheus Metric | Source File | Description | Typical Range | -| --------------------------------------------------- | --------------------- | ----------------------------------------- | ------------------------------- | -| `rippled_LedgerMaster_Validated_Ledger_Age` | LedgerMaster.h | Seconds since last validated ledger | 0–10 (healthy), >30 (stale) | -| `rippled_LedgerMaster_Published_Ledger_Age` | LedgerMaster.h | Seconds since last published ledger | 0–10 (healthy) | -| `rippled_State_Accounting_Disconnected_duration` | NetworkOPs.cpp | Cumulative seconds in Disconnected state | Monotonic | -| `rippled_State_Accounting_Connected_duration` | NetworkOPs.cpp | Cumulative seconds in Connected state | Monotonic | -| `rippled_State_Accounting_Syncing_duration` | NetworkOPs.cpp | Cumulative seconds in Syncing state | Monotonic | -| `rippled_State_Accounting_Tracking_duration` | NetworkOPs.cpp | Cumulative seconds in Tracking state | Monotonic | -| `rippled_State_Accounting_Full_duration` | NetworkOPs.cpp | Cumulative seconds in Full state | Monotonic (should dominate) | -| `rippled_State_Accounting_Disconnected_transitions` | NetworkOPs.cpp | Count of transitions to Disconnected | Low | -| `rippled_State_Accounting_Connected_transitions` | NetworkOPs.cpp | Count of transitions to Connected | Low | -| `rippled_State_Accounting_Syncing_transitions` | NetworkOPs.cpp | Count of transitions to Syncing | Low | -| `rippled_State_Accounting_Tracking_transitions` | NetworkOPs.cpp | Count of transitions to Tracking | Low | -| `rippled_State_Accounting_Full_transitions` | NetworkOPs.cpp | Count of 
transitions to Full | Low (should be 1 after startup) | -| `rippled_Peer_Finder_Active_Inbound_Peers` | PeerfinderManager.cpp | Active inbound peer connections | 0–85 | -| `rippled_Peer_Finder_Active_Outbound_Peers` | PeerfinderManager.cpp | Active outbound peer connections | 10–21 | -| `rippled_Overlay_Peer_Disconnects` | OverlayImpl.cpp | Cumulative peer disconnection count | Low growth | -| `rippled_Overlay_Peer_Disconnects_Charges` | OverlayImpl.cpp | Disconnects due to resource limit charges | Low growth (subset of above) | -| `rippled_job_count` | JobQueue.cpp | Current job queue depth | 0–100 (healthy) | +| Prometheus Metric | Source File | Description | Typical Range | +| ------------------------------------------------- | --------------------- | ----------------------------------------- | ------------------------------- | +| `xrpld_LedgerMaster_Validated_Ledger_Age` | LedgerMaster.h | Seconds since last validated ledger | 0–10 (healthy), >30 (stale) | +| `xrpld_LedgerMaster_Published_Ledger_Age` | LedgerMaster.h | Seconds since last published ledger | 0–10 (healthy) | +| `xrpld_State_Accounting_Disconnected_duration` | NetworkOPs.cpp | Cumulative seconds in Disconnected state | Monotonic | +| `xrpld_State_Accounting_Connected_duration` | NetworkOPs.cpp | Cumulative seconds in Connected state | Monotonic | +| `xrpld_State_Accounting_Syncing_duration` | NetworkOPs.cpp | Cumulative seconds in Syncing state | Monotonic | +| `xrpld_State_Accounting_Tracking_duration` | NetworkOPs.cpp | Cumulative seconds in Tracking state | Monotonic | +| `xrpld_State_Accounting_Full_duration` | NetworkOPs.cpp | Cumulative seconds in Full state | Monotonic (should dominate) | +| `xrpld_State_Accounting_Disconnected_transitions` | NetworkOPs.cpp | Count of transitions to Disconnected | Low | +| `xrpld_State_Accounting_Connected_transitions` | NetworkOPs.cpp | Count of transitions to Connected | Low | +| `xrpld_State_Accounting_Syncing_transitions` | NetworkOPs.cpp | Count of 
transitions to Syncing | Low | +| `xrpld_State_Accounting_Tracking_transitions` | NetworkOPs.cpp | Count of transitions to Tracking | Low | +| `xrpld_State_Accounting_Full_transitions` | NetworkOPs.cpp | Count of transitions to Full | Low (should be 1 after startup) | +| `xrpld_Peer_Finder_Active_Inbound_Peers` | PeerfinderManager.cpp | Active inbound peer connections | 0–85 | +| `xrpld_Peer_Finder_Active_Outbound_Peers` | PeerfinderManager.cpp | Active outbound peer connections | 10–21 | +| `xrpld_Overlay_Peer_Disconnects` | OverlayImpl.cpp | Cumulative peer disconnection count | Low growth | +| `xrpld_Overlay_Peer_Disconnects_Charges` | OverlayImpl.cpp | Disconnects due to resource limit charges | Low growth (subset of above) | +| `xrpld_job_count` | JobQueue.cpp | Current job queue depth | 0–100 (healthy) | -**Grafana dashboard**: _Node Health (System Metrics)_ (`rippled-system-node-health`) +**Grafana dashboard**: _Node Health (System Metrics)_ (`xrpld-system-node-health`) ### 2.2 Counters -| Prometheus Metric | Source File | Description | -| --------------------------------- | ------------------ | --------------------------------------------- | -| `rippled_rpc_requests` | ServerHandler.cpp | Total RPC requests received | -| `rippled_ledger_fetches` | InboundLedgers.cpp | Inbound ledger fetch attempts | -| `rippled_ledger_history_mismatch` | LedgerHistory.cpp | Ledger hash mismatches detected | -| `rippled_warn` | Logic.h | Resource manager warnings issued | -| `rippled_drop` | Logic.h | Resource manager drops (connections rejected) | +| Prometheus Metric | Source File | Description | +| ------------------------------- | ------------------ | --------------------------------------------- | +| `xrpld_rpc_requests` | ServerHandler.cpp | Total RPC requests received | +| `xrpld_ledger_fetches` | InboundLedgers.cpp | Inbound ledger fetch attempts | +| `xrpld_ledger_history_mismatch` | LedgerHistory.cpp | Ledger hash mismatches detected | +| `xrpld_warn` | Logic.h | 
Resource manager warnings issued | +| `xrpld_drop` | Logic.h | Resource manager drops (connections rejected) | -**Note**: With `server=otel`, `rippled_warn` and `rippled_drop` are properly exported as OTel Counter instruments. The previous StatsD `|m` type limitation no longer applies. +**Note**: With `server=otel`, `xrpld_warn` and `xrpld_drop` are properly exported as OTel Counter instruments. The previous StatsD `|m` type limitation no longer applies. -**Grafana dashboard**: _RPC & Pathfinding (System Metrics)_ (`rippled-system-rpc`) +**Grafana dashboard**: _RPC & Pathfinding (System Metrics)_ (`xrpld-system-rpc`) ### 2.3 Histograms (Event timers) -| Prometheus Metric | Source File | Unit | Description | -| ----------------------- | ----------------- | ----- | ------------------------------ | -| `rippled_rpc_time` | ServerHandler.cpp | ms | RPC response time distribution | -| `rippled_rpc_size` | ServerHandler.cpp | bytes | RPC response size distribution | -| `rippled_ios_latency` | Application.cpp | ms | I/O service loop latency | -| `rippled_pathfind_fast` | PathRequests.h | ms | Fast pathfinding duration | -| `rippled_pathfind_full` | PathRequests.h | ms | Full pathfinding duration | +| Prometheus Metric | Source File | Unit | Description | +| --------------------- | ----------------- | ----- | ------------------------------ | +| `xrpld_rpc_time` | ServerHandler.cpp | ms | RPC response time distribution | +| `xrpld_rpc_size` | ServerHandler.cpp | bytes | RPC response size distribution | +| `xrpld_ios_latency` | Application.cpp | ms | I/O service loop latency | +| `xrpld_pathfind_fast` | PathRequests.h | ms | Fast pathfinding duration | +| `xrpld_pathfind_full` | PathRequests.h | ms | Full pathfinding duration | Quantiles collected: 0th, 50th, 90th, 95th, 99th, 100th percentile. @@ -344,10 +344,10 @@ Quantiles collected: 0th, 50th, 90th, 95th, 99th, 100th percentile. 
For each of the 45+ overlay traffic categories (defined in `TrafficCount.h`), four gauges are emitted: -- `rippled_{category}_Bytes_In` -- `rippled_{category}_Bytes_Out` -- `rippled_{category}_Messages_In` -- `rippled_{category}_Messages_Out` +- `xrpld_{category}_Bytes_In` +- `xrpld_{category}_Bytes_Out` +- `xrpld_{category}_Messages_In` +- `xrpld_{category}_Messages_Out` **Key categories**: @@ -366,7 +366,7 @@ For each of the 45+ overlay traffic categories (defined in `TrafficCount.h`), fo | `ping` / `status` | Keepalive and status | | `set_get` | Set requests | -**Grafana dashboards**: _Network Traffic_ (`rippled-system-network`), _Overlay Traffic Detail_ (`rippled-system-overlay-detail`), _Ledger Data & Sync_ (`rippled-system-ledger-sync`) +**Grafana dashboards**: _Network Traffic_ (`xrpld-system-network`), _Overlay Traffic Detail_ (`xrpld-system-overlay-detail`), _Ledger Data & Sync_ (`xrpld-system-ledger-sync`) --- @@ -376,28 +376,28 @@ For each of the 45+ overlay traffic categories (defined in `TrafficCount.h`), fo ### 3.1 Span-Derived Dashboards (5) -| Dashboard | UID | Data Source | Key Panels | -| -------------------- | ---------------------- | ------------------------ | ---------------------------------------------------------------------------------- | -| RPC Performance | `rippled-rpc-perf` | Prometheus (SpanMetrics) | Request rate by command, p95 latency by command, error rate, heatmap, top commands | -| Transaction Overview | `rippled-transactions` | Prometheus (SpanMetrics) | Processing rate, latency p95/p50, local vs relay split, apply duration, heatmap | -| Consensus Health | `rippled-consensus` | Prometheus (SpanMetrics) | Round duration p95/p50, proposals rate, close duration, mode timeline, heatmap | -| Ledger Operations | `rippled-ledger-ops` | Prometheus (SpanMetrics) | Build rate, build duration, validation rate, store rate, build vs close comparison | -| Peer Network | `rippled-peer-net` | Prometheus (SpanMetrics) | Proposal receive rate, 
validation receive rate, trusted vs untrusted breakdown | +| Dashboard | UID | Data Source | Key Panels | +| -------------------- | -------------------- | ------------------------ | ---------------------------------------------------------------------------------- | +| RPC Performance | `xrpld-rpc-perf` | Prometheus (SpanMetrics) | Request rate by command, p95 latency by command, error rate, heatmap, top commands | +| Transaction Overview | `xrpld-transactions` | Prometheus (SpanMetrics) | Processing rate, latency p95/p50, local vs relay split, apply duration, heatmap | +| Consensus Health | `xrpld-consensus` | Prometheus (SpanMetrics) | Round duration p95/p50, proposals rate, close duration, mode timeline, heatmap | +| Ledger Operations | `xrpld-ledger-ops` | Prometheus (SpanMetrics) | Build rate, build duration, validation rate, store rate, build vs close comparison | +| Peer Network | `xrpld-peer-net` | Prometheus (SpanMetrics) | Proposal receive rate, validation receive rate, trusted vs untrusted breakdown | ### 3.2 System Metrics Dashboards (5) -| Dashboard | UID | Data Source | Key Panels | -| ---------------------- | ------------------------------- | ----------------- | --------------------------------------------------------------------------------- | -| Node Health | `rippled-system-node-health` | Prometheus (OTLP) | Ledger age, operating mode, I/O latency, job queue, fetch rate | -| Network Traffic | `rippled-system-network` | Prometheus (OTLP) | Active peers, disconnects, bytes in/out, messages in/out, traffic by category | -| RPC & Pathfinding | `rippled-system-rpc` | Prometheus (OTLP) | RPC rate, response time/size, pathfinding duration, resource warnings/drops | -| Overlay Traffic Detail | `rippled-system-overlay-detail` | Prometheus (OTLP) | Squelch, overhead, validator lists, set get/share, have/requested tx, proof paths | -| Ledger Data & Sync | `rippled-system-ledger-sync` | Prometheus (OTLP) | Ledger data exchange, legacy ledger share/get, 
getobject by type, traffic heatmap | +| Dashboard | UID | Data Source | Key Panels | +| ---------------------- | ----------------------------- | ----------------- | --------------------------------------------------------------------------------- | +| Node Health | `xrpld-system-node-health` | Prometheus (OTLP) | Ledger age, operating mode, I/O latency, job queue, fetch rate | +| Network Traffic | `xrpld-system-network` | Prometheus (OTLP) | Active peers, disconnects, bytes in/out, messages in/out, traffic by category | +| RPC & Pathfinding | `xrpld-system-rpc` | Prometheus (OTLP) | RPC rate, response time/size, pathfinding duration, resource warnings/drops | +| Overlay Traffic Detail | `xrpld-system-overlay-detail` | Prometheus (OTLP) | Squelch, overhead, validator lists, set get/share, have/requested tx, proof paths | +| Ledger Data & Sync | `xrpld-system-ledger-sync` | Prometheus (OTLP) | Ledger data exchange, legacy ledger share/get, getobject by type, traffic heatmap | ### 3.3 Accessing the Dashboards 1. Open Grafana at **http://localhost:3000** -2. Navigate to **Dashboards → rippled** folder +2. Navigate to **Dashboards → xrpld** folder 3. 
All 10 dashboards are auto-provisioned from `docker/telemetry/grafana/dashboards/` --- @@ -408,18 +408,18 @@ For each of the 45+ overlay traffic categories (defined in `TrafficCount.h`), fo ### Finding Traces by Type -| What to Find | Tempo TraceQL Query | -| ------------------------ | -------------------------------------------------------------------------------- | -| All RPC calls | `{resource.service.name="rippled" && name="rpc.request"}` | -| Specific RPC command | `{resource.service.name="rippled" && name="rpc.command.server_info"}` | -| Slow RPC calls | `{resource.service.name="rippled" && name=~"rpc.command.*"} \| duration > 100ms` | -| Failed RPC calls | `{span.xrpl.rpc.status="error"}` | -| Specific transaction | `{span.xrpl.tx.hash=""}` | -| Local transactions only | `{span.xrpl.tx.local=true}` | -| Consensus rounds | `{resource.service.name="rippled" && name="consensus.accept"}` | -| Rounds by mode | `{span.xrpl.consensus.mode="proposing"}` | -| Specific ledger | `{span.xrpl.ledger.seq=12345}` | -| Peer proposals (trusted) | `{span.xrpl.peer.proposal.trusted=true}` | +| What to Find | Tempo TraceQL Query | +| ------------------------ | ------------------------------------------------------------------------------ | +| All RPC calls | `{resource.service.name="xrpld" && name="rpc.request"}` | +| Specific RPC command | `{resource.service.name="xrpld" && name="rpc.command.server_info"}` | +| Slow RPC calls | `{resource.service.name="xrpld" && name=~"rpc.command.*"} \| duration > 100ms` | +| Failed RPC calls | `{span.xrpl.rpc.status="error"}` | +| Specific transaction | `{span.xrpl.tx.hash=""}` | +| Local transactions only | `{span.xrpl.tx.local=true}` | +| Consensus rounds | `{resource.service.name="xrpld" && name="consensus.accept"}` | +| Rounds by mode | `{span.xrpl.consensus.mode="proposing"}` | +| Specific ledger | `{span.xrpl.ledger.seq=12345}` | +| Peer proposals (trusted) | `{span.xrpl.peer.proposal.trusted=true}` | ### Trace Structure @@ -473,19 
+473,19 @@ sum by (xrpl_peer_proposal_trusted) (rate(traces_span_metrics_calls_total{span_n ```promql # Validated ledger age (should be < 10s) -rippled_LedgerMaster_Validated_Ledger_Age +xrpld_LedgerMaster_Validated_Ledger_Age # Active peer count -rippled_Peer_Finder_Active_Inbound_Peers + rippled_Peer_Finder_Active_Outbound_Peers +xrpld_Peer_Finder_Active_Inbound_Peers + xrpld_Peer_Finder_Active_Outbound_Peers # RPC response time p95 -histogram_quantile(0.95, rippled_rpc_time_bucket) +histogram_quantile(0.95, xrpld_rpc_time_bucket) # Total network bytes in (rate) -rate(rippled_total_Bytes_In[5m]) +rate(xrpld_total_Bytes_In[5m]) # Operating mode (should be "Full" after startup) -rippled_State_Accounting_Full_duration +xrpld_State_Accounting_Full_duration ``` --- @@ -495,8 +495,8 @@ rippled_State_Accounting_Full_duration | Issue | Impact | Status | | ------------------------------------------------------------------ | ------------------------------------------------ | -------------------------------------------------------------------- | | `warn` and `drop` metrics use non-standard StatsD `\|m` meter type | Metrics silently dropped by OTel StatsD receiver | Phase 6 Task 6.1 — needs `\|m` → `\|c` change in StatsDCollector.cpp | -| `rippled_job_count` may not emit in standalone mode | Missing from Prometheus in some test configs | Requires active job queue activity | -| `rippled_rpc_requests` depends on `[insight]` config | Zero series if StatsD not configured | Requires `[insight] server=statsd` in xrpld.cfg | +| `xrpld_job_count` may not emit in standalone mode | Missing from Prometheus in some test configs | Requires active job queue activity | +| `xrpld_rpc_requests` depends on `[insight]` config | Zero series if StatsD not configured | Requires `[insight] server=statsd` in xrpld.cfg | | Peer tracing disabled by default | No `peer.*` spans unless `trace_peer=1` | Intentional — high volume on mainnet | --- @@ -528,7 +528,7 @@ enabled=1 [insight] server=statsd 
address=127.0.0.1:8125 -prefix=rippled +prefix=xrpld ``` ### Production Setup @@ -545,7 +545,7 @@ max_queue_size=4096 [insight] server=statsd address=otel-collector:8125 -prefix=rippled +prefix=xrpld ``` ### Trace Category Toggle diff --git a/OpenTelemetryPlan/Phase7_taskList.md b/OpenTelemetryPlan/Phase7_taskList.md index 2bd93f8131..28463dad38 100644 --- a/OpenTelemetryPlan/Phase7_taskList.md +++ b/OpenTelemetryPlan/Phase7_taskList.md @@ -130,7 +130,7 @@ - Edit `docker/telemetry/docker-compose.yml`: - Remove UDP :8125 port mapping from otel-collector service - - Update rippled service config: change `[insight] server=statsd` to `server=otel` + - Update xrpld service config: change `[insight] server=statsd` to `server=otel` **Key modified files**: @@ -148,14 +148,14 @@ **What to do**: - In `OTelCollector.cpp`, construct OTel instrument names to match existing Prometheus metric names: - - beast::insight `make_gauge("LedgerMaster", "Validated_Ledger_Age")` → OTel instrument name: `rippled_LedgerMaster_Validated_Ledger_Age` + - beast::insight `make_gauge("LedgerMaster", "Validated_Ledger_Age")` → OTel instrument name: `xrpld_LedgerMaster_Validated_Ledger_Age` - The prefix + group + name concatenation must produce the same string as `StatsDCollector`'s format - Use underscores as separators (matching StatsD convention) - Verify in integration test that key Prometheus queries still return data: - - `rippled_LedgerMaster_Validated_Ledger_Age` - - `rippled_Peer_Finder_Active_Inbound_Peers` - - `rippled_rpc_requests` + - `xrpld_LedgerMaster_Validated_Ledger_Age` + - `xrpld_Peer_Finder_Active_Inbound_Peers` + - `xrpld_rpc_requests` **Key consideration**: OTel Prometheus exporter may normalize metric names differently than StatsD receiver. Test this early (Task 7.2) and adjust naming strategy if needed. The OTel SDK's Prometheus exporter adds `_total` suffix to counters and converts dots to underscores — match existing conventions. 
@@ -321,7 +321,7 @@ struct WindowEvent { ```cpp validatorHealthGauge_ = meter_->CreateDoubleObservableGauge( - "rippled_validator_health", "Validator health indicators"); + "xrpld_validator_health", "Validator health indicators"); ``` **Gauge label values**: @@ -346,7 +346,7 @@ candidates for removal from the UNL. ```cpp validatorParticipationGauge_ = meter_->CreateInt64ObservableGauge( - "rippled_validator_participation", + "xrpld_validator_participation", "Per-validator validation count over the last 256 ledgers"); ``` @@ -369,7 +369,7 @@ validatorParticipationGauge_ = meter_->CreateInt64ObservableGauge( for each. The UNL list is from `app_.getValidators().getTrustedMasterKeys()`. - **Dashboard panel**: Add a table panel to the Validator Health dashboard - showing `rippled_validator_participation` grouped by `validator` label, + showing `xrpld_validator_participation` grouped by `validator` label, with a threshold color (green >= 240, yellow >= 200, red < 200). **Key modified files**: `src/xrpld/telemetry/MetricsRegistry.h/.cpp` @@ -413,7 +413,7 @@ validatorParticipationGauge_ = meter_->CreateInt64ObservableGauge( | -------------------------- | ------ | ------------------------------------- | | `peer_latency_p90_ms` | double | P90 from sorted peer latencies | | `peers_insane_count` | int64 | Peers with diverged tracking status | -| `peers_higher_version_pct` | double | % of peers on newer rippled version | +| `peers_higher_version_pct` | double | % of peers on newer xrpld version | | `upgrade_recommended` | int64 | 1 if `peers_higher_version_pct > 60%` | **Implementation note**: The callback runs every 10s on the metrics reader thread. Iterating ~50-200 peers is acceptable overhead. 
@@ -424,7 +424,7 @@ validatorParticipationGauge_ = meter_->CreateInt64ObservableGauge( - [ ] P90 latency computed correctly - [ ] Insane count matches `peers` RPC output -- [ ] Version comparison handles format variations (e.g., "rippled-2.4.0-rc1") +- [ ] Version comparison handles format variations (e.g., "xrpld-2.4.0-rc1") --- @@ -486,10 +486,10 @@ validatorParticipationGauge_ = meter_->CreateInt64ObservableGauge( **Gauge label values**: -| Gauge Name | Label `metric=` | Type | Source | -| ------------------------ | ------------------------------- | ------ | ----------------------------- | -| `rippled_storage_detail` | `nudb_bytes` | int64 | NuDB backend file size | -| `rippled_sync_info` | `initial_sync_duration_seconds` | double | Time from start to first FULL | +| Gauge Name | Label `metric=` | Type | Source | +| ---------------------- | ------------------------------- | ------ | ----------------------------- | +| `xrpld_storage_detail` | `nudb_bytes` | int64 | NuDB backend file size | +| `xrpld_sync_info` | `initial_sync_duration_seconds` | double | Time from start to first FULL | **Key modified files**: `src/xrpld/telemetry/MetricsRegistry.h/.cpp` @@ -506,15 +506,15 @@ validatorParticipationGauge_ = meter_->CreateInt64ObservableGauge( **Objective**: Add 7 new event counters incremented at their respective instrumentation sites. 
-| Counter Name | Increment Site | Source File | -| ------------------------------------- | -------------------------------- | --------------------- | -| `rippled_ledgers_closed_total` | `onAccept()` in consensus | RCLConsensus.cpp | -| `rippled_validations_sent_total` | `validate()` in consensus | RCLConsensus.cpp | -| `rippled_validations_checked_total` | Network validation received | LedgerMaster.cpp | -| `rippled_validation_agreements_total` | ValidationTracker reconciliation | ValidationTracker.cpp | -| `rippled_validation_missed_total` | ValidationTracker reconciliation | ValidationTracker.cpp | -| `rippled_state_changes_total` | `setMode()` in NetworkOPs | NetworkOPs.cpp | -| `rippled_jq_trans_overflow_total` | Job queue overflow path | JobQueue.cpp | +| Counter Name | Increment Site | Source File | +| ----------------------------------- | -------------------------------- | --------------------- | +| `xrpld_ledgers_closed_total` | `onAccept()` in consensus | RCLConsensus.cpp | +| `xrpld_validations_sent_total` | `validate()` in consensus | RCLConsensus.cpp | +| `xrpld_validations_checked_total` | Network validation received | LedgerMaster.cpp | +| `xrpld_validation_agreements_total` | ValidationTracker reconciliation | ValidationTracker.cpp | +| `xrpld_validation_missed_total` | ValidationTracker reconciliation | ValidationTracker.cpp | +| `xrpld_state_changes_total` | `setMode()` in NetworkOPs | NetworkOPs.cpp | +| `xrpld_jq_trans_overflow_total` | Job queue overflow path | JobQueue.cpp | **Key modified files**: `src/xrpld/telemetry/MetricsRegistry.h/.cpp` (declarations), plus recording sites in RCLConsensus.cpp, LedgerMaster.cpp, NetworkOPs.cpp, JobQueue.cpp @@ -533,14 +533,14 @@ validatorParticipationGauge_ = meter_->CreateInt64ObservableGauge( **Gauge label values**: -| Gauge Name | Label `metric=` | Type | Source | -| ------------------------------ | ------------------- | ------ | --------------------------- | -| `rippled_validation_agreement` | 
`agreement_pct_1h` | double | `tracker.agreementPct1h()` | -| | `agreements_1h` | int64 | `tracker.agreements1h()` | -| | `missed_1h` | int64 | `tracker.missed1h()` | -| | `agreement_pct_24h` | double | `tracker.agreementPct24h()` | -| | `agreements_24h` | int64 | `tracker.agreements24h()` | -| | `missed_24h` | int64 | `tracker.missed24h()` | +| Gauge Name | Label `metric=` | Type | Source | +| ---------------------------- | ------------------- | ------ | --------------------------- | +| `xrpld_validation_agreement` | `agreement_pct_1h` | double | `tracker.agreementPct1h()` | +| | `agreements_1h` | int64 | `tracker.agreements1h()` | +| | `missed_1h` | int64 | `tracker.missed1h()` | +| | `agreement_pct_24h` | double | `tracker.agreementPct24h()` | +| | `agreements_24h` | int64 | `tracker.agreements24h()` | +| | `missed_24h` | int64 | `tracker.missed24h()` | **Key modified files**: `src/xrpld/telemetry/MetricsRegistry.cpp` diff --git a/docs/superpowers/specs/2026-03-30-external-dashboard-parity-design.md b/docs/superpowers/specs/2026-03-30-external-dashboard-parity-design.md index fbe4dda696..1223cbdbe7 100644 --- a/docs/superpowers/specs/2026-03-30-external-dashboard-parity-design.md +++ b/docs/superpowers/specs/2026-03-30-external-dashboard-parity-design.md @@ -7,38 +7,38 @@ ## Summary -Integrate 29 missing metrics, 18 alert rules, and enriched span attributes from the community `xrpl-validator-dashboard` into rippled's native OpenTelemetry instrumentation. Changes are distributed across phases 2, 3, 4, 6, 7, 9, 10, and 11 of the OTel PR chain. +Integrate 29 missing metrics, 18 alert rules, and enriched span attributes from the community `xrpl-validator-dashboard` into xrpld's native OpenTelemetry instrumentation. Changes are distributed across phases 2, 3, 4, 6, 7, 9, 10, and 11 of the OTel PR chain. 
## Gap Analysis ### Coverage Breakdown (86 external metrics) -| Status | Count | Notes | -| ------------------ | ----- | ------------------------------------------------------------- | -| Already covered | 30 | peer_count, load_factor, io_latency, uptime, overlay traffic | -| Partially covered | 3 | state_value encoding, NuDB granularity, validation_quorum | -| Missing | 29 | Validation agreement, ledger economy, peer quality, UNL health | -| N/A (external) | 24 | Monitor health, realtime duplicates, system metrics | +| Status | Count | Notes | +| ----------------- | ----- | -------------------------------------------------------------- | +| Already covered | 30 | peer_count, load_factor, io_latency, uptime, overlay traffic | +| Partially covered | 3 | state_value encoding, NuDB granularity, validation_quorum | +| Missing | 29 | Validation agreement, ledger economy, peer quality, UNL health | +| N/A (external) | 24 | Monitor health, realtime duplicates, system metrics | ### Missing Metrics by Category -| Category | Metrics | Count | -| ----------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----- | -| Validation Agreement | `validations_sent_total`, `validations_checked_total`, `validation_agreements_total`, `validation_missed_total`, `validation_agreement_pct_1h/24h`, `validation_agreements_1h/24h`, `validation_missed_1h/24h`, `validation_event` | 11 | -| Ledger Economy | `ledgers_closed_total`, `ledger_age_seconds`, `base_fee_xrp`, `reserve_base_xrp`, `reserve_inc_xrp`, `transaction_rate` | 6 | -| State Tracking | `time_in_current_state_seconds`, `state_changes_total`, `validator_state_info` | 3 | -| Peer Quality | `peers_insane`, `peer_latency_p90_ms` | 2 | -| Validator Health | `amendment_blocked`, `unl_expiry_days` | 2 | -| Upgrade Awareness | `peers_higher_version_pct`, `upgrade_recommended` | 2 | -| 
Storage / Other | `ledger_nudb_bytes`, `jq_trans_overflow_total`, `initial_sync_duration_seconds` | 3 | +| Category | Metrics | Count | +| -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----- | +| Validation Agreement | `validations_sent_total`, `validations_checked_total`, `validation_agreements_total`, `validation_missed_total`, `validation_agreement_pct_1h/24h`, `validation_agreements_1h/24h`, `validation_missed_1h/24h`, `validation_event` | 11 | +| Ledger Economy | `ledgers_closed_total`, `ledger_age_seconds`, `base_fee_xrp`, `reserve_base_xrp`, `reserve_inc_xrp`, `transaction_rate` | 6 | +| State Tracking | `time_in_current_state_seconds`, `state_changes_total`, `validator_state_info` | 3 | +| Peer Quality | `peers_insane`, `peer_latency_p90_ms` | 2 | +| Validator Health | `amendment_blocked`, `unl_expiry_days` | 2 | +| Upgrade Awareness | `peers_higher_version_pct`, `upgrade_recommended` | 2 | +| Storage / Other | `ledger_nudb_bytes`, `jq_trans_overflow_total`, `initial_sync_duration_seconds` | 3 | ### Alert Rules (18 total, from external dashboard) -| Group | Count | Rules | -| ----------- | ----- | ---------------------------------------------------------------------------------------------------- | +| Group | Count | Rules | +| ----------- | ----- | ----------------------------------------------------------------------------------------------------------------------- | | Critical | 8 | Agreement <90%, not proposing, unhealthy state, amendment blocked, UNL expiring, IO latency, load factor, peer count <5 | -| Network | 3 | Peer drop >10%/30%, P90 latency + disconnect correlation | -| Performance | 7 | CPU >80%, memory >90%, disk >85%, job queue overflow, upgrade recommended, tx rate drop, stale ledger | +| Network | 3 | Peer drop >10%/30%, P90 latency + 
disconnect correlation | +| Performance | 7 | CPU >80%, memory >90%, disk >85%, job queue overflow, upgrade recommended, tx rate drop, stale ledger | --- @@ -54,16 +54,17 @@ Add node-level health context to every `rpc.command.*` span so operators can cor New span attributes on `rpc.command.*`: -| Attribute | Type | Source | Value Example | -| ----------------------------- | ------ | ---------------------------------- | ------------------------ | -| `xrpl.node.amendment_blocked` | bool | `app_.getOPs().isAmendmentBlocked()` | `true` | -| `xrpl.node.server_state` | string | `app_.getOPs().strOperatingMode()` | `"full"`, `"syncing"` | +| Attribute | Type | Source | Value Example | +| ----------------------------- | ------ | ------------------------------------ | --------------------- | +| `xrpl.node.amendment_blocked` | bool | `app_.getOPs().isAmendmentBlocked()` | `true` | +| `xrpl.node.server_state` | string | `app_.getOPs().strOperatingMode()` | `"full"`, `"syncing"` | **File**: `src/xrpld/rpc/detail/RPCHandler.cpp` (in the `rpc.command.*` span creation block, after existing setAttribute calls) **Rationale**: RPC is the operator's primary interaction point. When a node is amendment-blocked or degraded, every RPC response is suspect. Tagging spans with this state enables Jaeger queries like `{name=~"rpc.command.*"} | xrpl.node.amendment_blocked = true` to find all RPCs served during a blocked period. **Exit Criteria**: + - [ ] `rpc.command.server_info` spans carry `xrpl.node.amendment_blocked` and `xrpl.node.server_state` attributes - [ ] No measurable latency impact (attribute values are cached atomics, not computed per-call) @@ -75,19 +76,20 @@ New span attributes on `rpc.command.*`: **Task 3.7: Transaction Span Peer Version Attribute** -Add the relaying peer's rippled version to transaction receive spans to enable version-mismatch correlation. +Add the relaying peer's xrpld version to transaction receive spans to enable version-mismatch correlation. 
New span attribute on `tx.receive`: -| Attribute | Type | Source | Value Example | -| ------------------- | ------ | ------------------- | ------------------ | -| `xrpl.peer.version` | string | `peer->getVersion()` | `"rippled-2.4.0"` | +| Attribute | Type | Source | Value Example | +| ------------------- | ------ | -------------------- | --------------- | +| `xrpl.peer.version` | string | `peer->getVersion()` | `"xrpld-2.4.0"` | **File**: `src/xrpld/overlay/detail/PeerImp.cpp` (in the `tx.receive` span block, after existing `xrpl.peer.id` setAttribute) **Rationale**: Transaction relay is where version mismatches cause subtle serialization or validation bugs. Tracing "this tx came from a v2.3.0 peer" helps diagnose compatibility issues during network upgrades. **Exit Criteria**: + - [ ] `tx.receive` spans carry `xrpl.peer.version` attribute with a non-empty version string - [ ] Attribute is omitted (not empty-string) when `getVersion()` returns empty @@ -103,32 +105,34 @@ Add ledger hash and validation type to validation spans on both send and receive New span attributes on `consensus.validation.send`: -| Attribute | Type | Source | Value Example | -| ---------------------------- | ------ | --------------------------------------- | -------------------------- | +| Attribute | Type | Source | Value Example | +| ----------------------------- | ------ | --------------------------------------- | --------------------------- | | `xrpl.validation.ledger_hash` | string | Ledger hash from `validate()` call args | `"A1B2C3..."` (64-char hex) | -| `xrpl.validation.full` | bool | Whether this is a full validation | `true` | +| `xrpl.validation.full` | bool | Whether this is a full validation | `true` | New span attributes on `peer.validation.receive`: -| Attribute | Type | Source | Value Example | -| --------------------------------- | ------ | --------------------------------------- | -------------------------- | -| `xrpl.peer.validation.ledger_hash` | string | From 
deserialized STValidation object | `"A1B2C3..."` (64-char hex) | -| `xrpl.peer.validation.full` | bool | From STValidation flags | `true` | +| Attribute | Type | Source | Value Example | +| ---------------------------------- | ------ | ------------------------------------- | --------------------------- | +| `xrpl.peer.validation.ledger_hash` | string | From deserialized STValidation object | `"A1B2C3..."` (64-char hex) | +| `xrpl.peer.validation.full` | bool | From STValidation flags | `true` | New span attributes on `consensus.accept`: -| Attribute | Type | Source | Value Example | -| ------------------------------------ | ----- | -------------------------------------------- | ------------- | -| `xrpl.consensus.validation_quorum` | int64 | `app_.validators().quorum()` | `28` | -| `xrpl.consensus.proposers_validated` | int64 | `result.proposers` from consensus result | `35` | +| Attribute | Type | Source | Value Example | +| ------------------------------------ | ----- | ---------------------------------------- | ------------- | +| `xrpl.consensus.validation_quorum` | int64 | `app_.validators().quorum()` | `28` | +| `xrpl.consensus.proposers_validated` | int64 | `result.proposers` from consensus result | `35` | **Files**: + - `src/xrpld/app/consensus/RCLConsensus.cpp` (validation.send and accept spans) - `src/xrpld/overlay/detail/PeerImp.cpp` (peer.validation.receive span) **Rationale**: The external dashboard's most valuable feature is validation agreement tracking. By recording the ledger hash on both outgoing and incoming validation spans, we create the raw data for agreement analysis at the trace level. Phase 7's ValidationTracker builds the metric-level aggregation on top of this. 
**Exit Criteria**: + - [ ] `consensus.validation.send` spans carry `xrpl.validation.ledger_hash` and `xrpl.validation.full` - [ ] `peer.validation.receive` spans carry `xrpl.peer.validation.ledger_hash` and `xrpl.peer.validation.full` - [ ] `consensus.accept` spans carry `xrpl.consensus.validation_quorum` and `xrpl.consensus.proposers_validated` @@ -145,12 +149,13 @@ New span attributes on `consensus.accept`: The overlay already tracks resource-limit disconnects via `OverlayImpl::Stats::peerDisconnectsCharges_` (a `beast::insight::Gauge`). This metric is registered but not included in the StatsD bridge mapping. **What to do**: -- Ensure `rippled_Overlay_Peer_Disconnects_Charges` appears in the StatsD-to-Prometheus metric name mapping + +- Ensure `xrpld_Overlay_Peer_Disconnects_Charges` appears in the StatsD-to-Prometheus metric name mapping - Verify the metric appears in Prometheus after StatsD bridge is active **File**: `src/xrpld/overlay/detail/OverlayImpl.cpp` -**Prometheus name**: `rippled_Overlay_Peer_Disconnects_Charges` +**Prometheus name**: `xrpld_Overlay_Peer_Disconnects_Charges` --- @@ -225,23 +230,26 @@ class ValidationTracker **Recording sites** (modifications to consensus code from Phase 7 branch): -| Hook Point | File | What to Record | -| --- | --- | --- | -| `validate()` in `doAccept()` | RCLConsensus.cpp | `tracker.recordOurValidation(ledgerHash, seq)` | -| `onValidation()` callback | RCLValidations path | `tracker.recordNetworkValidation(...)` — increment `validationsChecked` | -| LedgerMaster fully-validated | LedgerMaster.cpp | `tracker.recordNetworkValidation(validatedHash, seq)` | +| Hook Point | File | What to Record | +| ---------------------------- | ------------------- | ----------------------------------------------------------------------- | +| `validate()` in `doAccept()` | RCLConsensus.cpp | `tracker.recordOurValidation(ledgerHash, seq)` | +| `onValidation()` callback | RCLValidations path | `tracker.recordNetworkValidation(...)` — 
increment `validationsChecked` | +| LedgerMaster fully-validated | LedgerMaster.cpp | `tracker.recordNetworkValidation(validatedHash, seq)` | **Key new files**: + - `src/xrpld/telemetry/ValidationTracker.h` - `src/xrpld/telemetry/detail/ValidationTracker.cpp` **Key modified files**: + - `src/xrpld/telemetry/MetricsRegistry.h` (add ValidationTracker member) - `src/xrpld/telemetry/MetricsRegistry.cpp` (add gauge callback reading from tracker) - `src/xrpld/app/consensus/RCLConsensus.cpp` (add recording hooks) - `src/xrpld/app/ledger/detail/LedgerMaster.cpp` (add recording hook) **Exit Criteria**: + - [ ] `ValidationTracker` correctly tracks agreement with 8s grace period - [ ] 5-minute late repair corrects false-positive misses - [ ] Thread-safe (atomics + mutex for window deques) @@ -254,16 +262,17 @@ class ValidationTracker New MetricsRegistry observable gauge for amendment, UNL, and quorum health. -| Gauge Name | Label `metric=` | Type | Source | -| ------------------------- | ----------------------- | ------- | ----------------------------------------- | -| `rippled_validator_health` | `amendment_blocked` | int64 | `app_.getOPs().isAmendmentBlocked()` → 0/1 | -| | `unl_blocked` | int64 | `app_.getOPs().isUNLBlocked()` → 0/1 | -| | `unl_expiry_days` | double | `app_.validators().expires()` → days until expiry | -| | `validation_quorum` | int64 | `app_.validators().quorum()` | +| Gauge Name | Label `metric=` | Type | Source | +| ------------------------ | ------------------- | ------ | ------------------------------------------------- | +| `xrpld_validator_health` | `amendment_blocked` | int64 | `app_.getOPs().isAmendmentBlocked()` → 0/1 | +| | `unl_blocked` | int64 | `app_.getOPs().isUNLBlocked()` → 0/1 | +| | `unl_expiry_days` | double | `app_.validators().expires()` → days until expiry | +| | `validation_quorum` | int64 | `app_.validators().quorum()` | **File**: `src/xrpld/telemetry/MetricsRegistry.cpp` (new gauge callback in `registerAsyncGauges()`) **Exit 
Criteria**: + - [ ] All 4 label values emitted every 10s - [ ] `unl_expiry_days` is negative when expired, positive when active - [ ] Values visible in Prometheus @@ -274,21 +283,22 @@ New MetricsRegistry observable gauge for amendment, UNL, and quorum health. New MetricsRegistry observable gauge for peer health aggregates. -| Gauge Name | Label `metric=` | Type | Source | -| ----------------------- | ------------------------- | ------- | ---------------------------------------------- | -| `rippled_peer_quality` | `peer_latency_p90_ms` | double | Iterate peers, compute P90 from `latency_` | -| | `peers_insane_count` | int64 | Count peers with `tracking_ == diverged` | -| | `peers_higher_version_pct` | double | Compare `getVersion()` to own version | -| | `upgrade_recommended` | int64 | 1 if `peers_higher_version_pct > 60%` | +| Gauge Name | Label `metric=` | Type | Source | +| -------------------- | -------------------------- | ------ | ------------------------------------------ | +| `xrpld_peer_quality` | `peer_latency_p90_ms` | double | Iterate peers, compute P90 from `latency_` | +| | `peers_insane_count` | int64 | Count peers with `tracking_ == diverged` | +| | `peers_higher_version_pct` | double | Compare `getVersion()` to own version | +| | `upgrade_recommended` | int64 | 1 if `peers_higher_version_pct > 60%` | **Implementation note**: The callback iterates `app_.overlay().foreach(...)` to collect per-peer latency and version data. This runs every 10s on the metrics reader thread — acceptable overhead for ~50-200 peers. 
**File**: `src/xrpld/telemetry/MetricsRegistry.cpp` **Exit Criteria**: + - [ ] P90 latency computed correctly (sort peer latencies, pick 90th percentile) - [ ] Insane count matches `peers` RPC output -- [ ] Version comparison handles format variations (e.g., "rippled-2.4.0-rc1") +- [ ] Version comparison handles format variations (e.g., "xrpld-2.4.0-rc1") - [ ] Values visible in Prometheus --- @@ -297,17 +307,18 @@ New MetricsRegistry observable gauge for peer health aggregates. New MetricsRegistry observable gauge for fee and ledger metrics. -| Gauge Name | Label `metric=` | Type | Source | -| ------------------------ | ---------------------- | ------- | ----------------------------------------------- | -| `rippled_ledger_economy` | `base_fee_xrp` | double | `app_.getFeeTrack().getBaseFee()` → drops | -| | `reserve_base_xrp` | double | From validated ledger fee settings | -| | `reserve_inc_xrp` | double | From validated ledger fee settings | -| | `ledger_age_seconds` | double | `now - lastValidatedCloseTime` | -| | `transaction_rate` | double | Derived: tx count delta / time delta | +| Gauge Name | Label `metric=` | Type | Source | +| ---------------------- | -------------------- | ------ | ----------------------------------------- | +| `xrpld_ledger_economy` | `base_fee_xrp` | double | `app_.getFeeTrack().getBaseFee()` → drops | +| | `reserve_base_xrp` | double | From validated ledger fee settings | +| | `reserve_inc_xrp` | double | From validated ledger fee settings | +| | `ledger_age_seconds` | double | `now - lastValidatedCloseTime` | +| | `transaction_rate` | double | Derived: tx count delta / time delta | **File**: `src/xrpld/telemetry/MetricsRegistry.cpp` **Exit Criteria**: + - [ ] Fee values match `server_info` RPC output - [ ] `ledger_age_seconds` increases monotonically between ledger closes, resets on close - [ ] `transaction_rate` is smoothed (rolling average, not instantaneous) @@ -318,30 +329,31 @@ New MetricsRegistry observable gauge for fee and 
ledger metrics. New MetricsRegistry observable gauge for node state duration. -| Gauge Name | Label `metric=` | Type | Source | -| ------------------------- | -------------------------------- | ------- | ---------------------------------------------- | -| `rippled_state_tracking` | `state_value` | int64 | 0-7 numeric encoding matching external dashboard | -| | `time_in_current_state_seconds` | double | `now - lastModeChangeTime` | +| Gauge Name | Label `metric=` | Type | Source | +| ---------------------- | ------------------------------- | ------ | ------------------------------------------------ | +| `xrpld_state_tracking` | `state_value` | int64 | 0-6 numeric encoding matching external dashboard | +| | `time_in_current_state_seconds` | double | `now - lastModeChangeTime` | **State value encoding**: -rippled's `OperatingMode` enum maps 0-4 (DISCONNECTED through FULL). The external dashboard extends this to 0-6 by combining operating mode with consensus participation: +xrpld's `OperatingMode` enum maps 0-4 (DISCONNECTED through FULL).
The external dashboard extends this to 0-6 by combining operating mode with consensus participation: -| Value | State | Source | -| ----- | ------------ | ---------------------------------------------------------------- | -| 0 | disconnected | `OperatingMode::DISCONNECTED` | -| 1 | connected | `OperatingMode::CONNECTED` | -| 2 | syncing | `OperatingMode::SYNCING` | -| 3 | tracking | `OperatingMode::TRACKING` | -| 4 | full | `OperatingMode::FULL` and not validating | -| 5 | validating | `OperatingMode::FULL` and `mConsensus.validating()` is true | -| 6 | proposing | `OperatingMode::FULL` and consensus mode is `proposing` | +| Value | State | Source | +| ----- | ------------ | ----------------------------------------------------------- | +| 0 | disconnected | `OperatingMode::DISCONNECTED` | +| 1 | connected | `OperatingMode::CONNECTED` | +| 2 | syncing | `OperatingMode::SYNCING` | +| 3 | tracking | `OperatingMode::TRACKING` | +| 4 | full | `OperatingMode::FULL` and not validating | +| 5 | validating | `OperatingMode::FULL` and `mConsensus.validating()` is true | +| 6 | proposing | `OperatingMode::FULL` and consensus mode is `proposing` | **Note**: Values 5-6 require checking both `OperatingMode` and `ConsensusMode`. The callback should derive these from `app_.getOPs().getOperatingMode()` combined with `mConsensus.mode()`. If operating mode is FULL and consensus is proposing → 6; if FULL and validating → 5; otherwise use the raw OperatingMode enum value. **File**: `src/xrpld/telemetry/MetricsRegistry.cpp` **Exit Criteria**: + - [ ] `state_value` matches external dashboard encoding - [ ] `time_in_current_state_seconds` resets on mode change @@ -349,13 +361,14 @@ rippled's `OperatingMode` enum maps 0-4 (DISCONNECTED through FULL). 
The externa **Task 7.13: Storage Detail Observable Gauge** -| Gauge Name | Label `metric=` | Type | Source | -| -------------------------- | ---------------- | ----- | ----------------------------------------- | -| `rippled_storage_detail` | `nudb_bytes` | int64 | NuDB backend file size (filesystem stat) | +| Gauge Name | Label `metric=` | Type | Source | +| ---------------------- | --------------- | ----- | ---------------------------------------- | +| `xrpld_storage_detail` | `nudb_bytes` | int64 | NuDB backend file size (filesystem stat) | **File**: `src/xrpld/telemetry/MetricsRegistry.cpp` **Exit Criteria**: + - [ ] NuDB file size reported in bytes - [ ] Gracefully returns 0 if NuDB not configured @@ -365,23 +378,25 @@ rippled's `OperatingMode` enum maps 0-4 (DISCONNECTED through FULL). The externa New counters incremented at event sites. Declared in MetricsRegistry, recording sites added in consensus/overlay/network code. -| Counter Name | Increment Site | Source File | -| -------------------------------------- | --------------------------------- | ---------------------- | -| `rippled_ledgers_closed_total` | `onAccept()` in consensus | RCLConsensus.cpp | -| `rippled_validations_sent_total` | `validate()` in consensus | RCLConsensus.cpp | -| `rippled_validations_checked_total` | Network validation received | LedgerMaster.cpp | -| `rippled_validation_agreements_total` | ValidationTracker reconciliation | ValidationTracker.cpp | -| `rippled_validation_missed_total` | ValidationTracker reconciliation | ValidationTracker.cpp | -| `rippled_state_changes_total` | `setMode()` in NetworkOPs | NetworkOPs.cpp | -| `rippled_jq_trans_overflow_total` | Job queue overflow path | JobQueue.cpp | +| Counter Name | Increment Site | Source File | +| ----------------------------------- | -------------------------------- | --------------------- | +| `xrpld_ledgers_closed_total` | `onAccept()` in consensus | RCLConsensus.cpp | +| `xrpld_validations_sent_total` | `validate()` in 
consensus | RCLConsensus.cpp | +| `xrpld_validations_checked_total` | Network validation received | LedgerMaster.cpp | +| `xrpld_validation_agreements_total` | ValidationTracker reconciliation | ValidationTracker.cpp | +| `xrpld_validation_missed_total` | ValidationTracker reconciliation | ValidationTracker.cpp | +| `xrpld_state_changes_total` | `setMode()` in NetworkOPs | NetworkOPs.cpp | +| `xrpld_jq_trans_overflow_total` | Job queue overflow path | JobQueue.cpp | **Key modified files**: + - `src/xrpld/telemetry/MetricsRegistry.h/.cpp` (counter declarations) - `src/xrpld/app/consensus/RCLConsensus.cpp` (recording: ledgers_closed, validations_sent) - `src/xrpld/app/ledger/detail/LedgerMaster.cpp` (recording: validations_checked) - `src/xrpld/app/misc/NetworkOPs.cpp` (recording: state_changes) **Exit Criteria**: + - [ ] All 7 counters monotonically increase during normal operation - [ ] Counter values match expected rates (e.g., ledgers_closed ≈ 1 per 3-5s) - [ ] Values visible in Prometheus @@ -392,18 +407,19 @@ New counters incremented at event sites. Declared in MetricsRegistry, recording Reads from the `ValidationTracker` (Task 7.8) to export rolling window stats. 
-| Gauge Name | Label `metric=` | Type | Source | -| --------------------------------- | -------------------------- | ------ | ------------------------------- | -| `rippled_validation_agreement` | `agreement_pct_1h` | double | `tracker.agreementPct1h()` | -| | `agreements_1h` | int64 | `tracker.agreements1h()` | -| | `missed_1h` | int64 | `tracker.missed1h()` | -| | `agreement_pct_24h` | double | `tracker.agreementPct24h()` | -| | `agreements_24h` | int64 | `tracker.agreements24h()` | -| | `missed_24h` | int64 | `tracker.missed24h()` | +| Gauge Name | Label `metric=` | Type | Source | +| ---------------------------- | ------------------- | ------ | --------------------------- | +| `xrpld_validation_agreement` | `agreement_pct_1h` | double | `tracker.agreementPct1h()` | +| | `agreements_1h` | int64 | `tracker.agreements1h()` | +| | `missed_1h` | int64 | `tracker.missed1h()` | +| | `agreement_pct_24h` | double | `tracker.agreementPct24h()` | +| | `agreements_24h` | int64 | `tracker.agreements24h()` | +| | `missed_24h` | int64 | `tracker.missed24h()` | **File**: `src/xrpld/telemetry/MetricsRegistry.cpp` **Exit Criteria**: + - [ ] Agreement percentages in range [0.0, 100.0] - [ ] Window stats match manual count from validation counters - [ ] Percentages stabilize after 1h/24h of operation @@ -416,23 +432,23 @@ Reads from the `ValidationTracker` (Task 7.8) to export rolling window stats. 
**Task 9.11: Validator Health Dashboard** -New Grafana dashboard: `rippled-validator-health.json` +New Grafana dashboard: `xrpld-validator-health.json` -| Panel | Type | PromQL | -| --------------------------- | ---------- | -------------------------------------------------------------------- | -| Agreement % (1h) | stat | `rippled_validation_agreement{metric="agreement_pct_1h"}` | -| Agreement % (24h) | stat | `rippled_validation_agreement{metric="agreement_pct_24h"}` | -| Agreements vs Missed (1h) | bargauge | `agreements_1h` and `missed_1h` side by side | -| Agreements vs Missed (24h) | bargauge | `agreements_24h` and `missed_24h` side by side | -| Validation Rate | stat | `rate(rippled_validations_sent_total[5m]) * 60` | -| Validations Checked Rate | stat | `rate(rippled_validations_checked_total[5m]) * 60` | -| Amendment Blocked | stat | `rippled_validator_health{metric="amendment_blocked"}` | -| UNL Expiry (days) | stat | `rippled_validator_health{metric="unl_expiry_days"}` | -| Validation Quorum | stat | `rippled_validator_health{metric="validation_quorum"}` | -| State Value Timeline | timeseries | `rippled_state_tracking{metric="state_value"}` | -| Time in Current State | stat | `rippled_state_tracking{metric="time_in_current_state_seconds"}` | -| State Changes Rate | stat | `rate(rippled_state_changes_total[1h])` | -| Ledgers Closed Rate | stat | `rate(rippled_ledgers_closed_total[5m]) * 60` | +| Panel | Type | PromQL | +| -------------------------- | ---------- | -------------------------------------------------------------- | +| Agreement % (1h) | stat | `xrpld_validation_agreement{metric="agreement_pct_1h"}` | +| Agreement % (24h) | stat | `xrpld_validation_agreement{metric="agreement_pct_24h"}` | +| Agreements vs Missed (1h) | bargauge | `agreements_1h` and `missed_1h` side by side | +| Agreements vs Missed (24h) | bargauge | `agreements_24h` and `missed_24h` side by side | +| Validation Rate | stat | `rate(xrpld_validations_sent_total[5m]) * 60` | +| 
Validations Checked Rate | stat | `rate(xrpld_validations_checked_total[5m]) * 60` | +| Amendment Blocked | stat | `xrpld_validator_health{metric="amendment_blocked"}` | +| UNL Expiry (days) | stat | `xrpld_validator_health{metric="unl_expiry_days"}` | +| Validation Quorum | stat | `xrpld_validator_health{metric="validation_quorum"}` | +| State Value Timeline | timeseries | `xrpld_state_tracking{metric="state_value"}` | +| Time in Current State | stat | `xrpld_state_tracking{metric="time_in_current_state_seconds"}` | +| State Changes Rate | stat | `rate(xrpld_state_changes_total[1h])` | +| Ledgers Closed Rate | stat | `rate(xrpld_ledgers_closed_total[5m]) * 60` | **Dashboard conventions**: `$node` template variable for `exported_instance` filtering, dark theme, matching existing panel sizes and color schemes. @@ -440,16 +456,16 @@ New Grafana dashboard: `rippled-validator-health.json` **Task 9.12: Peer Quality Dashboard** -New Grafana dashboard: `rippled-peer-quality.json` +New Grafana dashboard: `xrpld-peer-quality.json` -| Panel | Type | PromQL | -| --------------------------- | ---------- | --------------------------------------------------------------------- | -| P90 Peer Latency | timeseries | `rippled_peer_quality{metric="peer_latency_p90_ms"}` | -| Insane/Diverged Peers | stat | `rippled_peer_quality{metric="peers_insane_count"}` | -| Higher Version Peers % | stat | `rippled_peer_quality{metric="peers_higher_version_pct"}` | -| Upgrade Recommended | stat | `rippled_peer_quality{metric="upgrade_recommended"}` | -| Resource Disconnects | timeseries | `rippled_Overlay_Peer_Disconnects_Charges` | -| Inbound vs Outbound | bargauge | `rippled_Peer_Finder_Active_Inbound_Peers`, `..._Outbound_Peers` | +| Panel | Type | PromQL | +| ---------------------- | ---------- | -------------------------------------------------------------- | +| P90 Peer Latency | timeseries | `xrpld_peer_quality{metric="peer_latency_p90_ms"}` | +| Insane/Diverged Peers | stat | 
`xrpld_peer_quality{metric="peers_insane_count"}` | +| Higher Version Peers % | stat | `xrpld_peer_quality{metric="peers_higher_version_pct"}` | +| Upgrade Recommended | stat | `xrpld_peer_quality{metric="upgrade_recommended"}` | +| Resource Disconnects | timeseries | `xrpld_Overlay_Peer_Disconnects_Charges` | +| Inbound vs Outbound | bargauge | `xrpld_Peer_Finder_Active_Inbound_Peers`, `..._Outbound_Peers` | --- @@ -457,13 +473,13 @@ New Grafana dashboard: `rippled-peer-quality.json` Add a "Ledger Economy" row to the existing `system-node-health.json` dashboard: -| Panel | Type | PromQL | -| --------------------- | ---------- | -------------------------------------------------------------- | -| Base Fee (drops) | stat | `rippled_ledger_economy{metric="base_fee_xrp"}` | -| Reserve Base (drops) | stat | `rippled_ledger_economy{metric="reserve_base_xrp"}` | -| Reserve Inc (drops) | stat | `rippled_ledger_economy{metric="reserve_inc_xrp"}` | -| Ledger Age | stat | `rippled_ledger_economy{metric="ledger_age_seconds"}` | -| Transaction Rate | timeseries | `rippled_ledger_economy{metric="transaction_rate"}` | +| Panel | Type | PromQL | +| -------------------- | ---------- | --------------------------------------------------- | +| Base Fee (drops) | stat | `xrpld_ledger_economy{metric="base_fee_xrp"}` | +| Reserve Base (drops) | stat | `xrpld_ledger_economy{metric="reserve_base_xrp"}` | +| Reserve Inc (drops) | stat | `xrpld_ledger_economy{metric="reserve_inc_xrp"}` | +| Ledger Age | stat | `xrpld_ledger_economy{metric="ledger_age_seconds"}` | +| Transaction Rate | timeseries | `xrpld_ledger_economy{metric="transaction_rate"}` | --- @@ -480,7 +496,7 @@ Add checks to `validate_telemetry.py` for all new span attributes and metrics. 
| Span Name | New Attribute | | --------------------------- | ------------------------------------ | | `rpc.command.server_info` | `xrpl.node.amendment_blocked` | -| `rpc.command.server_info` | `xrpl.node.server_state` | +| `rpc.command.server_info` | `xrpl.node.server_state` | | `tx.receive` | `xrpl.peer.version` | | `consensus.validation.send` | `xrpl.validation.ledger_hash` | | `consensus.validation.send` | `xrpl.validation.full` | @@ -490,38 +506,38 @@ Add checks to `validate_telemetry.py` for all new span attributes and metrics. **New metric existence checks (~13)**: -| Metric Name | -| ------------------------------------------------------------- | -| `rippled_validation_agreement{metric="agreement_pct_1h"}` | -| `rippled_validation_agreement{metric="agreement_pct_24h"}` | -| `rippled_validator_health{metric="amendment_blocked"}` | -| `rippled_validator_health{metric="unl_expiry_days"}` | -| `rippled_peer_quality{metric="peer_latency_p90_ms"}` | -| `rippled_peer_quality{metric="peers_insane_count"}` | -| `rippled_ledger_economy{metric="base_fee_xrp"}` | -| `rippled_ledger_economy{metric="transaction_rate"}` | -| `rippled_state_tracking{metric="state_value"}` | -| `rippled_ledgers_closed_total` | -| `rippled_validations_sent_total` | -| `rippled_state_changes_total` | -| `rippled_storage_detail{metric="nudb_bytes"}` | +| Metric Name | +| -------------------------------------------------------- | +| `xrpld_validation_agreement{metric="agreement_pct_1h"}` | +| `xrpld_validation_agreement{metric="agreement_pct_24h"}` | +| `xrpld_validator_health{metric="amendment_blocked"}` | +| `xrpld_validator_health{metric="unl_expiry_days"}` | +| `xrpld_peer_quality{metric="peer_latency_p90_ms"}` | +| `xrpld_peer_quality{metric="peers_insane_count"}` | +| `xrpld_ledger_economy{metric="base_fee_xrp"}` | +| `xrpld_ledger_economy{metric="transaction_rate"}` | +| `xrpld_state_tracking{metric="state_value"}` | +| `xrpld_ledgers_closed_total` | +| `xrpld_validations_sent_total` | 
+| `xrpld_state_changes_total` | +| `xrpld_storage_detail{metric="nudb_bytes"}` | **New dashboard load checks (~3)**: -| Dashboard | -| --------------------------- | -| `rippled-validator-health` | -| `rippled-peer-quality` | +| Dashboard | +| ------------------------------ | +| `xrpld-validator-health` | +| `xrpld-peer-quality` | | `system-node-health` (updated) | **New metric value sanity checks (~4)**: -| Check | Condition | -| -------------------------------------------------------- | -------------------- | -| `validation_agreement_pct_1h` | in [0, 100] | -| `unl_expiry_days` | > 0 (not expired) | -| `peer_latency_p90_ms` | > 0 (peers exist) | -| `state_value` | in [0, 7] | +| Check | Condition | +| ----------------------------- | ----------------- | +| `validation_agreement_pct_1h` | in [0, 100] | +| `unl_expiry_days` | > 0 (not expired) | +| `peer_latency_p90_ms` | > 0 (peers exist) | +| `state_value` | in [0, 7] | **Total new checks: ~28** (bringing total from 73 to ~101) @@ -537,40 +553,41 @@ Port 18 alert rules from the external `xrpl-validator-dashboard` to Grafana aler **Critical Group** (8 rules, eval interval 10s): -| Rule | Condition | For | -| ------------------------- | ----------------------------------------------------------------- | ---- | -| Agreement Below 90% | `rippled_validation_agreement{metric="agreement_pct_24h"} < 90` | 30s | -| Not Proposing | `rippled_state_tracking{metric="state_value"} < 6` | 10s | -| Unhealthy State | `rippled_state_tracking{metric="state_value"} < 4` | 10s | -| Amendment Blocked | `rippled_validator_health{metric="amendment_blocked"} == 1` | 1m | -| UNL Expiring | `rippled_validator_health{metric="unl_expiry_days"} < 14` | 1h | -| High IO Latency | `histogram_quantile(0.95, rippled_ios_latency_bucket) > 50` | 1m | -| High Load Factor | `rippled_load_factor_metrics{metric="load_factor"} > 1000` | 1m | -| Peer Count Critical | `rippled_server_info{metric="peers"} < 5` | 1m | +| Rule | Condition | For | +| 
------------------- | ------------------------------------------------------------- | --- | +| Agreement Below 90% | `xrpld_validation_agreement{metric="agreement_pct_24h"} < 90` | 30s | +| Not Proposing | `xrpld_state_tracking{metric="state_value"} < 6` | 10s | +| Unhealthy State | `xrpld_state_tracking{metric="state_value"} < 4` | 10s | +| Amendment Blocked | `xrpld_validator_health{metric="amendment_blocked"} == 1` | 1m | +| UNL Expiring | `xrpld_validator_health{metric="unl_expiry_days"} < 14` | 1h | +| High IO Latency | `histogram_quantile(0.95, xrpld_ios_latency_bucket) > 50` | 1m | +| High Load Factor | `xrpld_load_factor_metrics{metric="load_factor"} > 1000` | 1m | +| Peer Count Critical | `xrpld_server_info{metric="peers"} < 5` | 1m | **Network Group** (3 rules, eval interval 10s): -| Rule | Condition | For | -| ---------------------- | --------------------------------------------------------------------- | ---- | -| Peer Drop >10% | `delta(rippled_server_info{metric="peers"}[30s]) / ... * 100 < -10` | 30s | -| Peer Drop >30% | Same formula, threshold -30 | 30s | -| P90 Latency + Disconnects | `peer_latency_p90_ms > 500 AND rate(disconnects) > 0` | 2m | +| Rule | Condition | For | +| ------------------------- | ----------------------------------------------------------------- | --- | +| Peer Drop >10% | `delta(xrpld_server_info{metric="peers"}[30s]) / ... 
* 100 < -10` | 30s | +| Peer Drop >30% | Same formula, threshold -30 | 30s | +| P90 Latency + Disconnects | `peer_latency_p90_ms > 500 AND rate(disconnects) > 0` | 2m | **Performance Group** (7 rules, eval interval 10s): -| Rule | Condition | For | -| -------------------- | ------------------------------------------------------------ | ---- | -| CPU High | Per-core CPU > 80% | 2m | -| Memory Critical | Memory usage > 90% | 1m | -| Disk Warning | Disk usage > 85% | 2m | -| Job Queue Overflow | `rate(rippled_jq_trans_overflow_total[5m]) > 0` | 1m | -| Upgrade Recommended | `rippled_peer_quality{metric="peers_higher_version_pct"} > 60` | 1m | -| TX Rate Drop | Transaction rate dropped > 50% in 5m window | 5m | -| Stale Ledger | `rippled_ledger_economy{metric="ledger_age_seconds"} > 30` | 1m | +| Rule | Condition | For | +| ------------------- | ------------------------------------------------------------ | --- | +| CPU High | Per-core CPU > 80% | 2m | +| Memory Critical | Memory usage > 90% | 1m | +| Disk Warning | Disk usage > 85% | 2m | +| Job Queue Overflow | `rate(xrpld_jq_trans_overflow_total[5m]) > 0` | 1m | +| Upgrade Recommended | `xrpld_peer_quality{metric="peers_higher_version_pct"} > 60` | 1m | +| TX Rate Drop | Transaction rate dropped > 50% in 5m window | 5m | +| Stale Ledger | `xrpld_ledger_economy{metric="ledger_age_seconds"} > 30` | 1m | **Notification channels**: Template configs for Email/SMTP, Discord, Slack, PagerDuty. 
**Files**: + - `docker/telemetry/grafana/alerting/alert-rules.yaml` (new or extend existing) - `docker/telemetry/grafana/alerting/contact-points.yaml` - `docker/telemetry/grafana/alerting/notification-policies.yaml` diff --git a/docs/telemetry-runbook.md b/docs/telemetry-runbook.md index d159d44a2f..7429a6cf13 100644 --- a/docs/telemetry-runbook.md +++ b/docs/telemetry-runbook.md @@ -288,7 +288,7 @@ Add to `xrpld.cfg`: [insight] server=statsd address=127.0.0.1:8125 -prefix=rippled +prefix=xrpld ``` The OTel Collector receives these via a `statsd` receiver on UDP port 8125 and exports them to Prometheus alongside spanmetrics. @@ -297,38 +297,38 @@ The OTel Collector receives these via a `statsd` receiver on UDP port 8125 and e #### Gauges -| Prometheus Metric | Source | Description | -| --------------------------------------------- | ------------------------- | -------------------------------------------------------------------------- | -| `rippled_LedgerMaster_Validated_Ledger_Age` | LedgerMaster.h:373 | Age of validated ledger (seconds) | -| `rippled_LedgerMaster_Published_Ledger_Age` | LedgerMaster.h:374 | Age of published ledger (seconds) | -| `rippled_State_Accounting_{Mode}_duration` | NetworkOPs.cpp:774 | Time in each operating mode (Disconnected/Connected/Syncing/Tracking/Full) | -| `rippled_State_Accounting_{Mode}_transitions` | NetworkOPs.cpp:780 | Transition count per mode | -| `rippled_Peer_Finder_Active_Inbound_Peers` | PeerfinderManager.cpp:214 | Active inbound peer connections | -| `rippled_Peer_Finder_Active_Outbound_Peers` | PeerfinderManager.cpp:215 | Active outbound peer connections | -| `rippled_Overlay_Peer_Disconnects` | OverlayImpl.h:557 | Peer disconnect count | -| `rippled_job_count` | JobQueue.cpp:26 | Current job queue depth | -| `rippled_{category}_Bytes_In/Out` | OverlayImpl.h:535 | Overlay traffic bytes per category (57 categories) | -| `rippled_{category}_Messages_In/Out` | OverlayImpl.h:535 | Overlay traffic messages per category | +| 
Prometheus Metric | Source | Description | +| ------------------------------------------- | ------------------------- | -------------------------------------------------------------------------- | +| `xrpld_LedgerMaster_Validated_Ledger_Age` | LedgerMaster.h:373 | Age of validated ledger (seconds) | +| `xrpld_LedgerMaster_Published_Ledger_Age` | LedgerMaster.h:374 | Age of published ledger (seconds) | +| `xrpld_State_Accounting_{Mode}_duration` | NetworkOPs.cpp:774 | Time in each operating mode (Disconnected/Connected/Syncing/Tracking/Full) | +| `xrpld_State_Accounting_{Mode}_transitions` | NetworkOPs.cpp:780 | Transition count per mode | +| `xrpld_Peer_Finder_Active_Inbound_Peers` | PeerfinderManager.cpp:214 | Active inbound peer connections | +| `xrpld_Peer_Finder_Active_Outbound_Peers` | PeerfinderManager.cpp:215 | Active outbound peer connections | +| `xrpld_Overlay_Peer_Disconnects` | OverlayImpl.h:557 | Peer disconnect count | +| `xrpld_job_count` | JobQueue.cpp:26 | Current job queue depth | +| `xrpld_{category}_Bytes_In/Out` | OverlayImpl.h:535 | Overlay traffic bytes per category (57 categories) | +| `xrpld_{category}_Messages_In/Out` | OverlayImpl.h:535 | Overlay traffic messages per category | #### Counters -| Prometheus Metric | Source | Description | -| --------------------------------- | --------------------- | ------------------------------ | -| `rippled_rpc_requests` | ServerHandler.cpp:108 | Total RPC request count | -| `rippled_ledger_fetches` | InboundLedgers.cpp:44 | Ledger fetch request count | -| `rippled_ledger_history_mismatch` | LedgerHistory.cpp:16 | Ledger hash mismatch count | -| `rippled_warn` | Logic.h:33 | Resource manager warning count | -| `rippled_drop` | Logic.h:34 | Resource manager drop count | +| Prometheus Metric | Source | Description | +| ------------------------------- | --------------------- | ------------------------------ | +| `xrpld_rpc_requests` | ServerHandler.cpp:108 | Total RPC request count | +| 
`xrpld_ledger_fetches` | InboundLedgers.cpp:44 | Ledger fetch request count | +| `xrpld_ledger_history_mismatch` | LedgerHistory.cpp:16 | Ledger hash mismatch count | +| `xrpld_warn` | Logic.h:33 | Resource manager warning count | +| `xrpld_drop` | Logic.h:34 | Resource manager drop count | #### Histograms (from StatsD timers) -| Prometheus Metric | Source | Description | -| ----------------------- | --------------------- | ------------------------------ | -| `rippled_rpc_time` | ServerHandler.cpp:110 | RPC response time (ms) | -| `rippled_rpc_size` | ServerHandler.cpp:109 | RPC response size (bytes) | -| `rippled_ios_latency` | Application.cpp:438 | I/O service loop latency (ms) | -| `rippled_pathfind_fast` | PathRequests.h:23 | Fast pathfinding duration (ms) | -| `rippled_pathfind_full` | PathRequests.h:24 | Full pathfinding duration (ms) | +| Prometheus Metric | Source | Description | +| --------------------- | --------------------- | ------------------------------ | +| `xrpld_rpc_time` | ServerHandler.cpp:110 | RPC response time (ms) | +| `xrpld_rpc_size` | ServerHandler.cpp:109 | RPC response size (bytes) | +| `xrpld_ios_latency` | Application.cpp:438 | I/O service loop latency (ms) | +| `xrpld_pathfind_fast` | PathRequests.h:23 | Fast pathfinding duration (ms) | +| `xrpld_pathfind_full` | PathRequests.h:24 | Full pathfinding duration (ms) | ## Grafana Dashboards @@ -401,52 +401,52 @@ Requires `trace_peer=1` in the `[telemetry]` config section. 
### Node Health -- StatsD (`xrpld-statsd-node-health`) -| Panel | Type | PromQL | Labels Used | -| -------------------------------------- | ---------- | ----------------------------------------------------------------- | ----------- | -| Validated Ledger Age | stat | `rippled_LedgerMaster_Validated_Ledger_Age` | — | -| Published Ledger Age | stat | `rippled_LedgerMaster_Published_Ledger_Age` | — | -| Operating Mode Duration | timeseries | `rippled_State_Accounting_*_duration` | — | -| Operating Mode Transitions | timeseries | `rippled_State_Accounting_*_transitions` | — | -| I/O Latency | timeseries | `histogram_quantile(0.95, rippled_ios_latency_bucket)` | — | -| Job Queue Depth | timeseries | `rippled_job_count` | — | -| Ledger Fetch Rate | stat | `rate(rippled_ledger_fetches[5m])` | — | -| Ledger History Mismatches | stat | `rate(rippled_ledger_history_mismatch[5m])` | — | -| Key Jobs Execution Time | timeseries | `rippled_acceptLedger{quantile="$quantile"}` (+ 10 more key jobs) | `quantile` | -| Key Jobs Dequeue Wait Time | timeseries | `rippled_acceptLedger_q{quantile="$quantile"}` (+ 10 more) | `quantile` | -| FullBelowCache Size | timeseries | `rippled_Node_family_full_below_cache_size` | — | -| FullBelowCache Hit Rate | gauge | `rippled_Node_family_full_below_cache_hit_rate` | — | -| Ledger Publish Gap | stat | `Published_Ledger_Age - Validated_Ledger_Age` | — | -| State Duration Rate (Full vs Tracking) | timeseries | `rate(rippled_State_Accounting_Full_duration[5m]) / 1000000` | — | -| All Jobs Execution Time (Detail) | timeseries | `{__name__=~"rippled_", quantile="$quantile"}` | `quantile` | -| All Jobs Dequeue Wait (Detail) | timeseries | `{__name__=~"rippled__q", quantile="$quantile"}` | `quantile` | +| Panel | Type | PromQL | Labels Used | +| -------------------------------------- | ---------- | --------------------------------------------------------------- | ----------- | +| Validated Ledger Age | stat | `xrpld_LedgerMaster_Validated_Ledger_Age` | — 
| +| Published Ledger Age | stat | `xrpld_LedgerMaster_Published_Ledger_Age` | — | +| Operating Mode Duration | timeseries | `xrpld_State_Accounting_*_duration` | — | +| Operating Mode Transitions | timeseries | `xrpld_State_Accounting_*_transitions` | — | +| I/O Latency | timeseries | `histogram_quantile(0.95, xrpld_ios_latency_bucket)` | — | +| Job Queue Depth | timeseries | `xrpld_job_count` | — | +| Ledger Fetch Rate | stat | `rate(xrpld_ledger_fetches[5m])` | — | +| Ledger History Mismatches | stat | `rate(xrpld_ledger_history_mismatch[5m])` | — | +| Key Jobs Execution Time | timeseries | `xrpld_acceptLedger{quantile="$quantile"}` (+ 10 more key jobs) | `quantile` | +| Key Jobs Dequeue Wait Time | timeseries | `xrpld_acceptLedger_q{quantile="$quantile"}` (+ 10 more) | `quantile` | +| FullBelowCache Size | timeseries | `xrpld_Node_family_full_below_cache_size` | — | +| FullBelowCache Hit Rate | gauge | `xrpld_Node_family_full_below_cache_hit_rate` | — | +| Ledger Publish Gap | stat | `Published_Ledger_Age - Validated_Ledger_Age` | — | +| State Duration Rate (Full vs Tracking) | timeseries | `rate(xrpld_State_Accounting_Full_duration[5m]) / 1000000` | — | +| All Jobs Execution Time (Detail) | timeseries | `{__name__=~"xrpld_", quantile="$quantile"}` | `quantile` | +| All Jobs Dequeue Wait (Detail) | timeseries | `{__name__=~"xrpld__q", quantile="$quantile"}` | `quantile` | ### Network Traffic -- StatsD (`xrpld-statsd-network`) -| Panel | Type | PromQL | Labels Used | -| ------------------------------------ | ---------- | -------------------------------------------- | ----------- | -| Active Peers | timeseries | `rippled_Peer_Finder_Active_*_Peers` | — | -| Peer Disconnects | timeseries | `rippled_Overlay_Peer_Disconnects` | — | -| Total Network Bytes | timeseries | `rate(rippled_total_Bytes_In/Out[5m])` | — | -| Total Network Messages | timeseries | `rippled_total_Messages_In/Out` | — | -| Transaction Traffic | timeseries | `rippled_transactions_Messages_In/Out` 
| — | -| Proposal Traffic | timeseries | `rippled_proposals_Messages_In/Out` | — | -| Validation Traffic | timeseries | `rippled_validations_Messages_In/Out` | — | -| Traffic by Category | bargauge | `topk(10, rippled_*_Bytes_In)` | — | -| Duplicate Traffic (Wasted Bandwidth) | timeseries | `rate(rippled_*_duplicate_Bytes_In/Out[5m])` | — | -| All Traffic Categories (Detail) | timeseries | `topk(15, rate(rippled_*_Bytes_In[5m]))` | — | +| Panel | Type | PromQL | Labels Used | +| ------------------------------------ | ---------- | ------------------------------------------ | ----------- | +| Active Peers | timeseries | `xrpld_Peer_Finder_Active_*_Peers` | — | +| Peer Disconnects | timeseries | `xrpld_Overlay_Peer_Disconnects` | — | +| Total Network Bytes | timeseries | `rate(xrpld_total_Bytes_In/Out[5m])` | — | +| Total Network Messages | timeseries | `xrpld_total_Messages_In/Out` | — | +| Transaction Traffic | timeseries | `xrpld_transactions_Messages_In/Out` | — | +| Proposal Traffic | timeseries | `xrpld_proposals_Messages_In/Out` | — | +| Validation Traffic | timeseries | `xrpld_validations_Messages_In/Out` | — | +| Traffic by Category | bargauge | `topk(10, xrpld_*_Bytes_In)` | — | +| Duplicate Traffic (Wasted Bandwidth) | timeseries | `rate(xrpld_*_duplicate_Bytes_In/Out[5m])` | — | +| All Traffic Categories (Detail) | timeseries | `topk(15, rate(xrpld_*_Bytes_In[5m]))` | — | ### RPC & Pathfinding -- StatsD (`xrpld-statsd-rpc`) -| Panel | Type | PromQL | Labels Used | -| ------------------------- | ---------- | -------------------------------------------------------- | ----------- | -| RPC Request Rate | stat | `rate(rippled_rpc_requests[5m])` | — | -| RPC Response Time | timeseries | `histogram_quantile(0.95, rippled_rpc_time_bucket)` | — | -| RPC Response Size | timeseries | `histogram_quantile(0.95, rippled_rpc_size_bucket)` | — | -| RPC Response Time Heatmap | heatmap | `rippled_rpc_time_bucket` | — | -| Pathfinding Fast Duration | timeseries | 
`histogram_quantile(0.95, rippled_pathfind_fast_bucket)` | — | -| Pathfinding Full Duration | timeseries | `histogram_quantile(0.95, rippled_pathfind_full_bucket)` | — | -| Resource Warnings Rate | stat | `rate(rippled_warn[5m])` | — | -| Resource Drops Rate | stat | `rate(rippled_drop[5m])` | — | +| Panel | Type | PromQL | Labels Used | +| ------------------------- | ---------- | ------------------------------------------------------ | ----------- | +| RPC Request Rate | stat | `rate(xrpld_rpc_requests[5m])` | — | +| RPC Response Time | timeseries | `histogram_quantile(0.95, xrpld_rpc_time_bucket)` | — | +| RPC Response Size | timeseries | `histogram_quantile(0.95, xrpld_rpc_size_bucket)` | — | +| RPC Response Time Heatmap | heatmap | `xrpld_rpc_time_bucket` | — | +| Pathfinding Fast Duration | timeseries | `histogram_quantile(0.95, xrpld_pathfind_fast_bucket)` | — | +| Pathfinding Full Duration | timeseries | `histogram_quantile(0.95, xrpld_pathfind_full_bucket)` | — | +| Resource Warnings Rate | stat | `rate(xrpld_warn[5m])` | — | +| Resource Drops Rate | stat | `rate(xrpld_drop[5m])` | — | ### Span → Metric → Dashboard Summary diff --git a/include/xrpl/beast/insight/OTelCollector.h b/include/xrpl/beast/insight/OTelCollector.h index ee0dd2c1b0..8c982a8856 100644 --- a/include/xrpl/beast/insight/OTelCollector.h +++ b/include/xrpl/beast/insight/OTelCollector.h @@ -73,7 +73,7 @@ public: * @param endpoint OTLP/HTTP metrics endpoint URL * (e.g. "http://localhost:4318/v1/metrics"). * @param prefix Prefix prepended to all metric names - * (e.g. "rippled"). + * (e.g. "xrpld"). * @param instanceId Unique identifier for this node instance, * emitted as the `service.instance.id` OTel * resource attribute. 
Defaults to empty string diff --git a/src/libxrpl/beast/insight/OTelCollector.cpp b/src/libxrpl/beast/insight/OTelCollector.cpp index b4c684510b..3b5fdc09a1 100644 --- a/src/libxrpl/beast/insight/OTelCollector.cpp +++ b/src/libxrpl/beast/insight/OTelCollector.cpp @@ -30,13 +30,14 @@ #ifdef XRPL_ENABLE_TELEMETRY +#include + #include #include #include #include #include #include -#include #include #include @@ -357,10 +358,10 @@ private: * Example usage: * @code * auto collector = OTelCollector::New( - * "http://localhost:4318/v1/metrics", "rippled", journal); + * "http://localhost:4318/v1/metrics", "xrpld", journal); * auto counter = collector->make_counter("rpc.requests"); * counter.increment(1); - * // Metric "rippled_rpc_requests" exported via OTLP every 1s. + * // Metric "xrpld_rpc_requests" exported via OTLP every 1s. * @endcode */ class OTelCollectorImp : public OTelCollector, public std::enable_shared_from_this<OTelCollectorImp> @@ -460,8 +461,8 @@ public: * @brief Format a metric name with the configured prefix. * * Replaces dots with underscores to match StatsD->Prometheus naming. - * Example: prefix="rippled", name="LedgerMaster.Validated_Ledger_Age" - * -> "rippled_LedgerMaster_Validated_Ledger_Age" + * Example: prefix="xrpld", name="LedgerMaster.Validated_Ledger_Age" + * -> "xrpld_LedgerMaster_Validated_Ledger_Age" * * @param name Raw metric name from beast::insight callers. * @return Fully-qualified metric name. @@ -473,7 +474,7 @@ private: /** Journal for log output. */ Journal m_journal; - /** Prefix for all metric names (e.g., "rippled"). */ + /** Prefix for all metric names (e.g., "xrpld"). */ std::string m_prefix; /** OTel SDK MeterProvider owning the export pipeline. RAII lifecycle. */ @@ -678,7 +679,7 @@ OTelCollectorImp::OTelCollectorImp( // Include service.instance.id when provided so Prometheus // exported_instance labels distinguish multi-node deployments.
resource::ResourceAttributes attrs; - attrs[resource::SemanticConventions::kServiceName] = "rippled"; + attrs[resource::SemanticConventions::kServiceName] = "xrpld"; if (!instanceId.empty()) attrs[resource::SemanticConventions::kServiceInstanceId] = instanceId; auto resourceAttrs = resource::Resource::Create(attrs); @@ -692,7 +693,7 @@ OTelCollectorImp::OTelCollectorImp( // These match the SpanMetrics connector buckets for consistency. auto histogramSelector = metrics_sdk::InstrumentSelectorFactory::Create( metrics_sdk::InstrumentType::kHistogram, "*", "ms"); - auto meterSelector = metrics_sdk::MeterSelectorFactory::Create("rippled_metrics", "", ""); + auto meterSelector = metrics_sdk::MeterSelectorFactory::Create("xrpld_metrics", "", ""); auto histogramConfig = std::make_shared<metrics_sdk::HistogramAggregationConfig>(); histogramConfig->boundaries_ = std::vector<double>{1.0, 5.0, 10.0, 25.0, 50.0, 100.0, 250.0, 500.0, 1000.0, 5000.0}; @@ -707,7 +708,7 @@ OTelCollectorImp::OTelCollectorImp( std::move(histogramSelector), std::move(meterSelector), std::move(histogramView)); // Create the OTel Meter for creating instruments. - m_otelMeter = m_provider->GetMeter("rippled_metrics", "1.0.0"); + m_otelMeter = m_provider->GetMeter("xrpld_metrics", "1.0.0"); if (m_journal.info()) m_journal.info() << "OTelCollector started successfully"; @@ -820,8 +821,8 @@ OTelCollectorImp::formatName(std::string const& name) const // converts dots to underscores for Prometheus. We replicate this // to preserve metric name compatibility. // - // Example: prefix="rippled", name="LedgerMaster.Validated_Ledger_Age" - // -> "rippled_LedgerMaster_Validated_Ledger_Age" + // Example: prefix="xrpld", name="LedgerMaster.Validated_Ledger_Age" + // -> "xrpld_LedgerMaster_Validated_Ledger_Age" std::string result; if (!m_prefix.empty()) {