From cbbd6ebee258702e2de34e3d3009e2c05bf1ae13 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Mon, 27 Apr 2026 11:43:56 +0100 Subject: [PATCH] feat(telemetry): add Phase 6 StatsD metrics, ledger/peer spans, and expanded dashboards MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Integrate the existing StatsD metrics pipeline (beast::insight) into the OpenTelemetry observability stack and add new trace spans for ledger build/store/validate and peer proposal/validation receive. Phase 5b — Ledger, peer, and transaction spans: - Add ledger.build span with close time attributes in BuildLedger.cpp - Add tx.apply span with tx_count/tx_failed in BuildLedger.cpp - Add ledger.store and ledger.validate spans in LedgerMaster.cpp - Add peer.proposal.receive span with trusted attribute in PeerImp.cpp - Add peer.validation.receive span with ledger_hash, full, trusted attributes in PeerImp.cpp - Add ledger-operations and peer-network Grafana dashboards Phase 6 — StatsD metrics integration: - Add StatsD UDP receiver (port 8125) to OTel Collector - Add 5 StatsD Grafana dashboards: node health, network traffic, overlay traffic detail, ledger data sync, RPC pathfinding - Add 09-data-collection-reference.md cataloging all metrics/spans - Update existing dashboards with new span panels - Expand telemetry runbook and integration test script - Add codecov exclusions for telemetry modules Co-Authored-By: Claude Opus 4.6 (1M context) --- .codecov.yml | 5 + OpenTelemetryPlan/06-implementation-phases.md | 91 ++- OpenTelemetryPlan/08-appendix.md | 27 +- .../09-data-collection-reference.md | 549 ++++++++++++++ OpenTelemetryPlan/OpenTelemetryPlan.md | 36 +- docker/telemetry/TESTING.md | 48 +- docker/telemetry/docker-compose.yml | 9 +- .../grafana/dashboards/consensus-health.json | 252 ++++++- .../grafana/dashboards/ledger-operations.json | 353 +++++++++ .../grafana/dashboards/peer-network.json | 227 
++++++ .../grafana/dashboards/rpc-performance.json | 205 +++++- .../dashboards/statsd-ledger-data-sync.json | 506 +++++++++++++ .../dashboards/statsd-network-traffic.json | 671 ++++++++++++++++++ .../dashboards/statsd-node-health.json | 415 +++++++++++ .../statsd-overlay-traffic-detail.json | 566 +++++++++++++++ .../dashboards/statsd-rpc-pathfinding.json | 396 +++++++++++ .../dashboards/transaction-overview.json | 246 ++++++- docker/telemetry/integration-test.sh | 57 +- docker/telemetry/otel-collector-config.yaml | 2 + docs/telemetry-runbook.md | 218 +++++- src/xrpld/app/ledger/detail/BuildLedger.cpp | 19 + src/xrpld/app/ledger/detail/LedgerMaster.cpp | 11 + src/xrpld/app/ledger/detail/LedgerSpanNames.h | 54 ++ src/xrpld/overlay/detail/PeerImp.cpp | 15 + src/xrpld/overlay/detail/PeerSpanNames.h | 50 ++ 25 files changed, 4890 insertions(+), 138 deletions(-) create mode 100644 OpenTelemetryPlan/09-data-collection-reference.md create mode 100644 docker/telemetry/grafana/dashboards/ledger-operations.json create mode 100644 docker/telemetry/grafana/dashboards/peer-network.json create mode 100644 docker/telemetry/grafana/dashboards/statsd-ledger-data-sync.json create mode 100644 docker/telemetry/grafana/dashboards/statsd-network-traffic.json create mode 100644 docker/telemetry/grafana/dashboards/statsd-node-health.json create mode 100644 docker/telemetry/grafana/dashboards/statsd-overlay-traffic-detail.json create mode 100644 docker/telemetry/grafana/dashboards/statsd-rpc-pathfinding.json create mode 100644 src/xrpld/app/ledger/detail/LedgerSpanNames.h create mode 100644 src/xrpld/overlay/detail/PeerSpanNames.h diff --git a/.codecov.yml b/.codecov.yml index cd52e2604d..3d9d2734e8 100644 --- a/.codecov.yml +++ b/.codecov.yml @@ -36,3 +36,8 @@ ignore: - "src/tests/" - "include/xrpl/beast/test/" - "include/xrpl/beast/unit_test/" + # Telemetry modules — conditionally compiled behind XRPL_ENABLE_TELEMETRY, + # which is not enabled in coverage builds. 
+ - "src/xrpld/telemetry/" + - "src/libxrpl/beast/insight/OTelCollector.cpp" + - "include/xrpl/beast/insight/OTelCollector.h" diff --git a/OpenTelemetryPlan/06-implementation-phases.md b/OpenTelemetryPlan/06-implementation-phases.md index c12fb8c211..4811fd1b66 100644 --- a/OpenTelemetryPlan/06-implementation-phases.md +++ b/OpenTelemetryPlan/06-implementation-phases.md @@ -300,7 +300,78 @@ See [Phase4_taskList.md § Phase 4b](./Phase4_taskList.md) for full design. --- -## 6.7 Risk Assessment +## 6.7 Phase 6: StatsD Metrics Integration (Week 10) + +**Objective**: Bridge rippled's existing `beast::insight` StatsD metrics into the OpenTelemetry collection pipeline, exposing 300+ pre-existing metrics alongside span-derived RED metrics in Prometheus/Grafana. + +### Background + +rippled has a mature metrics framework (`beast::insight`) that emits StatsD-format metrics over UDP. These metrics cover node health, peer networking, RPC performance, job queue, and overlay traffic — data that **does not** overlap with the span-based instrumentation from Phases 1-5. By adding a StatsD receiver to the OTel Collector, both metric sources converge in Prometheus. 
+ +### Metric Inventory + +| Category | Group | Type | Count | Key Metrics | +| --------------- | ------------------ | ------------- | ---------- | ------------------------------------------------------ | +| Node State | `State_Accounting` | Gauge | 10 | `*_duration`, `*_transitions` per operating mode | +| Ledger | `LedgerMaster` | Gauge | 2 | `Validated_Ledger_Age`, `Published_Ledger_Age` | +| Ledger Fetch | — | Counter | 1 | `ledger_fetches` | +| Ledger History | `ledger.history` | Counter | 1 | `mismatch` | +| RPC | `rpc` | Counter+Event | 3 | `requests`, `time` (histogram), `size` (histogram) | +| Job Queue | — | Gauge+Event | 1 + 2×N | `job_count`, per-job `{name}` and `{name}_q` | +| Peer Finder | `Peer_Finder` | Gauge | 2 | `Active_Inbound_Peers`, `Active_Outbound_Peers` | +| Overlay | `Overlay` | Gauge | 1 | `Peer_Disconnects` | +| Overlay Traffic | per-category | Gauge | 4×57 = 228 | `Bytes_In/Out`, `Messages_In/Out` per traffic category | +| Pathfinding | — | Event | 2 | `pathfind_fast`, `pathfind_full` (histograms) | +| I/O | — | Event | 1 | `ios_latency` (histogram) | +| Resource Mgr | — | Meter | 2 | `warn`, `drop` (rate counters) | +| Caches | per-cache | Gauge | 2×N | `{cache}.size`, `{cache}.hit_rate` | + +**Total**: ~255+ unique metrics (plus dynamic job-type and cache metrics) + +### Tasks + +| Task | Description | +| ---- | --------------------------------------------------------------------------------------------------------------- | +| 6.1 | **DEFERRED** Fix Meter wire format (`\|m` → `\|c`) in StatsDCollector.cpp — breaking change, tracked separately | +| 6.2 | Add `statsd` receiver to OTel Collector config | +| 6.3 | Expose UDP port 8125 in docker-compose.yml | +| 6.4 | Add `[insight]` config to integration test node configs | +| 6.5 | Create "Node Health" Grafana dashboard (8 panels) | +| 6.6 | Create "Network Traffic" Grafana dashboard (8 panels) | +| 6.7 | Create "RPC & Pathfinding (StatsD)" Grafana dashboard (8 panels) | +| 6.8 | Update 
integration test to verify StatsD metrics in Prometheus | +| 6.9 | Update TESTING.md and telemetry-runbook.md | + +### Wire Format Fix (Task 6.1) — DEFERRED + +The `StatsDMeterImpl` in `StatsDCollector.cpp:706` sends metrics with `|m` suffix, which is non-standard StatsD. The OTel StatsD receiver silently drops these. Fix: change `|m` to `|c` (counter), which is semantically correct since meters are increment-only counters. Only 2 metrics are affected (`warn`, `drop` in Resource Manager). + +**Status**: Deferred as a separate change — this is a breaking change for any StatsD backend that previously consumed the custom `|m` type. The Resource Warnings and Resource Drops dashboard panels will show no data until this fix is applied. + +### New Grafana Dashboards + +**Node Health** (`statsd-node-health.json`, uid: `rippled-statsd-node-health`): + +- Validated/Published Ledger Age, Operating Mode Duration/Transitions, I/O Latency, Job Queue Depth, Ledger Fetch Rate, Ledger History Mismatches + +**Network Traffic** (`statsd-network-traffic.json`, uid: `rippled-statsd-network`): + +- Active Inbound/Outbound Peers, Peer Disconnects, Total Bytes/Messages In/Out, Transaction/Proposal/Validation Traffic, Top Traffic Categories + +**RPC & Pathfinding (StatsD)** (`statsd-rpc-pathfinding.json`, uid: `rippled-statsd-rpc`): + +- RPC Request Rate, Response Time p95/p50, Response Size p95/p50, Pathfinding Fast/Full Duration, Resource Warnings/Drops, Response Time Heatmap + +### Exit Criteria + +- [ ] StatsD metrics visible in Prometheus (`curl localhost:9090/api/v1/query?query=rippled_LedgerMaster_Validated_Ledger_Age`) +- [ ] All 3 new Grafana dashboards load without errors +- [ ] Integration test verifies at least core StatsD metrics (ledger age, peer counts, RPC requests) +- [ ] ~~Meter metrics (`warn`, `drop`) flow correctly after `|m` → `|c` fix~~ — DEFERRED (breaking change, tracked separately) + +--- + +## 6.9 Risk Assessment ```mermaid quadrantChart @@ -331,7 +402,7 @@ 
quadrantChart --- -## 6.8 Success Metrics +## 6.10 Success Metrics | Metric | Target | Measurement | | ------------------------ | -------------------------------------------------------------- | --------------------- | @@ -497,13 +568,13 @@ quadrantChart --- -## 6.10 Definition of Done +## 6.13 Definition of Done > **TxQ** = Transaction Queue | **HA** = High Availability Clear, measurable criteria for each phase. -### 6.10.1 Phase 1: Core Infrastructure +### 6.13.1 Phase 1: Core Infrastructure | Criterion | Measurement | Target | | --------------- | ---------------------------------------------------------- | ---------------------------- | @@ -515,7 +586,7 @@ Clear, measurable criteria for each phase. **Definition of Done**: All criteria met, PR merged, no regressions in CI. -### 6.10.2 Phase 2: RPC Tracing +### 6.13.2 Phase 2: RPC Tracing | Criterion | Measurement | Target | | ------------------ | ---------------------------------- | -------------------------- | @@ -527,7 +598,7 @@ Clear, measurable criteria for each phase. **Definition of Done**: RPC traces visible in Tempo for all commands, dashboard shows latency distribution. -### 6.10.3 Phase 3: Transaction Tracing +### 6.13.3 Phase 3: Transaction Tracing | Criterion | Measurement | Target | | --------------------- | ------------------------------------------------- | -------------------------------------------------------- | @@ -542,7 +613,7 @@ Clear, measurable criteria for each phase. **Definition of Done**: Transaction traces span 3+ nodes in test network with deterministic trace_id correlation, parent-child ordering via protobuf propagation, and performance within bounds. -### 6.10.4 Phase 4: Consensus Tracing +### 6.13.4 Phase 4: Consensus Tracing | Criterion | Measurement | Target | | -------------------- | ----------------------------- | ------------------------- | @@ -554,7 +625,7 @@ Clear, measurable criteria for each phase. 
**Definition of Done**: Consensus rounds fully traceable, no impact on consensus timing. -### 6.10.5 Phase 5: Production Deployment +### 6.13.5 Phase 5: Production Deployment | Criterion | Measurement | Target | | ------------ | ---------------------------- | -------------------------- | @@ -567,7 +638,7 @@ Clear, measurable criteria for each phase. **Definition of Done**: Telemetry running in production, operators trained, alerts active. -### 6.10.6 Success Metrics Summary +### 6.13.6 Success Metrics Summary | Phase | Primary Metric | Secondary Metric | Deadline | | ------- | ---------------------- | --------------------------- | ------------- | @@ -579,7 +650,7 @@ Clear, measurable criteria for each phase. --- -## 6.12 Recommended Implementation Order +## 6.14 Recommended Implementation Order Based on ROI analysis, implement in this exact order: diff --git a/OpenTelemetryPlan/08-appendix.md b/OpenTelemetryPlan/08-appendix.md index ffe3df303d..fea9694b77 100644 --- a/OpenTelemetryPlan/08-appendix.md +++ b/OpenTelemetryPlan/08-appendix.md @@ -170,19 +170,20 @@ flowchart TB ### Plan Documents -| Document | Description | -| ---------------------------------------------------------------- | -------------------------------------------- | -| [OpenTelemetryPlan.md](./OpenTelemetryPlan.md) | Master overview and executive summary | -| [00-tracing-fundamentals.md](./00-tracing-fundamentals.md) | Distributed tracing concepts and OTel primer | -| [01-architecture-analysis.md](./01-architecture-analysis.md) | xrpld architecture and trace points | -| [02-design-decisions.md](./02-design-decisions.md) | SDK selection, exporters, span conventions | -| [03-implementation-strategy.md](./03-implementation-strategy.md) | Directory structure, performance analysis | -| [04-code-samples.md](./04-code-samples.md) | C++ code examples for all components | -| [05-configuration-reference.md](./05-configuration-reference.md) | xrpld config, CMake, Collector configs | -| 
[06-implementation-phases.md](./06-implementation-phases.md) | Timeline, tasks, risks, success metrics | -| [07-observability-backends.md](./07-observability-backends.md) | Backend selection and architecture | -| [08-appendix.md](./08-appendix.md) | Glossary, references, version history | -| [presentation.md](./presentation.md) | Slide deck for OTel plan overview | +| Document | Description | +| -------------------------------------------------------------------- | -------------------------------------------- | +| [OpenTelemetryPlan.md](./OpenTelemetryPlan.md) | Master overview and executive summary | +| [00-tracing-fundamentals.md](./00-tracing-fundamentals.md) | Distributed tracing concepts and OTel primer | +| [01-architecture-analysis.md](./01-architecture-analysis.md) | xrpld architecture and trace points | +| [02-design-decisions.md](./02-design-decisions.md) | SDK selection, exporters, span conventions | +| [03-implementation-strategy.md](./03-implementation-strategy.md) | Directory structure, performance analysis | +| [04-code-samples.md](./04-code-samples.md) | C++ code examples for all components | +| [05-configuration-reference.md](./05-configuration-reference.md) | xrpld config, CMake, Collector configs | +| [06-implementation-phases.md](./06-implementation-phases.md) | Timeline, tasks, risks, success metrics | +| [07-observability-backends.md](./07-observability-backends.md) | Backend selection and architecture | +| [08-appendix.md](./08-appendix.md) | Glossary, references, version history | +| [09-data-collection-reference.md](./09-data-collection-reference.md) | Span/metric/dashboard inventory | +| [presentation.md](./presentation.md) | Slide deck for OTel plan overview | ### Task Lists diff --git a/OpenTelemetryPlan/09-data-collection-reference.md b/OpenTelemetryPlan/09-data-collection-reference.md new file mode 100644 index 0000000000..475257b60a --- /dev/null +++ b/OpenTelemetryPlan/09-data-collection-reference.md @@ -0,0 +1,549 @@ +# 
Observability Data Collection Reference + +> **Audience**: Developers and operators. This is the single source of truth for all telemetry data collected by rippled's observability stack. +> +> **Related docs**: [docs/telemetry-runbook.md](../docs/telemetry-runbook.md) (operator runbook with alerting and troubleshooting) | [03-implementation-strategy.md](./03-implementation-strategy.md) (code structure and performance optimization) | [04-code-samples.md](./04-code-samples.md) (C++ instrumentation examples) + +## Data Flow Overview + +```mermaid +graph LR + subgraph rippledNode["rippled Node"] + A["Trace Macros<br/>XRPL_TRACE_SPAN<br/>(OTLP/HTTP exporter)"] + B["beast::insight<br/>StatsD metrics<br/>(UDP sender)"] + end + + subgraph collector["OTel Collector :4317 / :4318 / :8125"] + direction TB + R1["OTLP Receiver<br/>:4317 gRPC | :4318 HTTP"] + R2["StatsD Receiver<br/>:8125 UDP"] + BP["Batch Processor<br/>timeout 1s, batch 100"] + SM["SpanMetrics Connector<br/>derives RED metrics<br/>from trace spans"] + + R1 --> BP + BP --> SM + end + + subgraph backends["Trace Backend"] + D["Grafana Tempo :3200<br/>TraceQL search &<br/>S3/GCS long-term storage"] + end + + subgraph metrics["Metrics Stack"] + E["Prometheus :9090<br/>scrapes :8889<br/>span-derived + StatsD metrics"] + end + + subgraph viz["Visualization"] + F["Grafana :3000<br/>10 dashboards"] + end + + A -->|"OTLP/HTTP :4318<br/>(traces + attributes)"| R1 + B -->|"UDP :8125<br/>(gauges, counters, timers)"| R2 + + BP -->|"OTLP/gRPC :4317"| D + + SM -->|"span_calls_total<br/>span_duration_ms<br/>(6 dimension labels)"| E + R2 -->|"rippled_* gauges<br/>rippled_* counters<br/>rippled_* summaries"| E + + E -->|"Prometheus<br/>data source"| F + D -->|"Tempo<br/>data source"| F + + style A fill:#4a90d9,color:#fff,stroke:#2a6db5 + style B fill:#d9534f,color:#fff,stroke:#b52d2d + style R1 fill:#5cb85c,color:#fff,stroke:#3d8b3d + style R2 fill:#5cb85c,color:#fff,stroke:#3d8b3d + style BP fill:#449d44,color:#fff,stroke:#2d6e2d + style SM fill:#449d44,color:#fff,stroke:#2d6e2d + style D fill:#f0ad4e,color:#000,stroke:#c78c2e + style E fill:#f0ad4e,color:#000,stroke:#c78c2e + style F fill:#5bc0de,color:#000,stroke:#3aa8c1 + style rippledNode fill:#1a2633,color:#ccc,stroke:#4a90d9 + style collector fill:#1a3320,color:#ccc,stroke:#5cb85c + style backends fill:#332a1a,color:#ccc,stroke:#f0ad4e + style metrics fill:#332a1a,color:#ccc,stroke:#f0ad4e + style viz fill:#1a2d33,color:#ccc,stroke:#5bc0de +``` + +There are two independent telemetry pipelines entering a single **OTel Collector**: + +1. **OpenTelemetry Traces** — Distributed spans with attributes, exported via OTLP/HTTP (:4318) to the collector's **OTLP Receiver**. The **Batch Processor** groups spans (1s timeout, batch size 100) before forwarding to trace backends. The **SpanMetrics Connector** derives RED metrics (rate, errors, duration) from every span and feeds them into the metrics pipeline. +2. **beast::insight StatsD** — System-level gauges, counters, and timers emitted as StatsD UDP packets to port :8125, ingested by the collector's **StatsD Receiver**, and exported alongside span-derived metrics to Prometheus. + +**Trace backend** — The collector exports traces via OTLP/gRPC to: + +- **Grafana Tempo** — Preferred trace backend. Supports TraceQL queries at `:3200`, S3/GCS object storage for cost-effective long-term trace retention, and integrates natively with Grafana. + +> **Further reading**: [00-tracing-fundamentals.md](./00-tracing-fundamentals.md) for core OpenTelemetry concepts (traces, spans, context propagation, sampling). 
[07-observability-backends.md](./07-observability-backends.md) for production backend selection, collector placement, and sampling strategies. + +--- + +## 1. OpenTelemetry Spans + +### 1.1 Complete Span Inventory (17 spans) + +> **See also**: [02-design-decisions.md §2.3](./02-design-decisions.md#23-span-naming-conventions) for naming conventions and the full span catalog with rationale. [04-code-samples.md §4.6](./04-code-samples.md#46-span-flow-visualization) for span flow diagrams. + +#### RPC Spans + +Controlled by `trace_rpc=1` in `[telemetry]` config. + +| Span Name | Parent | Source File | Description | +| -------------------- | ------------- | ----------------- | ------------------------------------------------------------------------ | +| `rpc.request` | — | ServerHandler.cpp | Top-level HTTP RPC request entry point | +| `rpc.process` | `rpc.request` | ServerHandler.cpp | RPC processing pipeline | +| `rpc.ws_message` | — | ServerHandler.cpp | WebSocket message handling | +| `rpc.command.<name>` | `rpc.process` | RPCHandler.cpp | Per-command span (e.g., `rpc.command.server_info`, `rpc.command.ledger`) | + +**Where to find**: Tempo → TraceQL: `{resource.service.name="rippled" && name=~"rpc.request|rpc.command.*"}` + +**Grafana dashboard**: _RPC Performance_ (`rippled-rpc-perf`) + +#### Transaction Spans + +Controlled by `trace_transactions=1` in `[telemetry]` config. 
+ +| Span Name | Parent | Source File | Description | +| ------------ | -------------- | --------------- | ----------------------------------------------------------------- | +| `tx.process` | — | NetworkOPs.cpp | Transaction submission entry point (local or peer-relayed) | +| `tx.receive` | — | PeerImp.cpp | Raw transaction received from peer overlay (before deduplication) | +| `tx.apply` | `ledger.build` | BuildLedger.cpp | Transaction set applied to new ledger during consensus | + +**Where to find**: Tempo → TraceQL: `{resource.service.name="rippled" && name=~"tx.process|tx.receive"}` + +**Grafana dashboard**: _Transaction Overview_ (`rippled-transactions`) + +#### Consensus Spans + +Controlled by `trace_consensus=1` in `[telemetry]` config. + +| Span Name | Parent | Source File | Description | +| --------------------------- | ------ | ---------------- | --------------------------------------------- | +| `consensus.proposal.send` | — | RCLConsensus.cpp | Node broadcasts its transaction set proposal | +| `consensus.ledger_close` | — | RCLConsensus.cpp | Ledger close event triggered by consensus | +| `consensus.accept` | — | RCLConsensus.cpp | Consensus accepts a ledger (round complete) | +| `consensus.validation.send` | — | RCLConsensus.cpp | Validation message sent after ledger accepted | +| `consensus.accept.apply` | — | RCLConsensus.cpp | Ledger application with close time details | + +**Where to find**: Tempo → TraceQL: `{resource.service.name="rippled" && name=~"consensus.*"}` + +**Grafana dashboard**: _Consensus Health_ (`rippled-consensus`) + +#### Ledger Spans + +Controlled by `trace_ledger=1` in `[telemetry]` config. 
+ +| Span Name | Parent | Source File | Description | +| ----------------- | ------ | ---------------- | ---------------------------------------------- | +| `ledger.build` | — | BuildLedger.cpp | Build new ledger from accepted transaction set | +| `ledger.validate` | — | LedgerMaster.cpp | Ledger promoted to validated status | +| `ledger.store` | — | LedgerMaster.cpp | Ledger stored to database/history | + +**Where to find**: Tempo → TraceQL: `{resource.service.name="rippled" && name=~"ledger.*"}` + +**Grafana dashboard**: _Ledger Operations_ (`rippled-ledger-ops`) + +#### Peer Spans + +Controlled by `trace_peer=1` in `[telemetry]` config. **Disabled by default** (high volume). + +| Span Name | Parent | Source File | Description | +| ------------------------- | ------ | ----------- | ------------------------------------- | +| `peer.proposal.receive` | — | PeerImp.cpp | Consensus proposal received from peer | +| `peer.validation.receive` | — | PeerImp.cpp | Validation message received from peer | + +**Where to find**: Tempo → TraceQL: `{resource.service.name="rippled" && name=~"peer.*"}` + +**Grafana dashboard**: _Peer Network_ (`rippled-peer-net`) + +--- + +### 1.2 Complete Attribute Inventory (28 attributes) + +> **See also**: [02-design-decisions.md §2.4.2](./02-design-decisions.md#242-span-attributes-by-category) for attribute design rationale and privacy considerations. + +Every span can carry key-value attributes that provide context for filtering and aggregation. 
+ +#### RPC Attributes + +| Attribute | Type | Set On | Description | +| ------------------------ | ------ | --------------- | ------------------------------------------------ | +| `xrpl.rpc.command` | string | `rpc.command.*` | RPC command name (e.g., `server_info`, `ledger`) | +| `xrpl.rpc.version` | int64 | `rpc.command.*` | API version number | +| `xrpl.rpc.role` | string | `rpc.command.*` | Caller role: `"admin"` or `"user"` | +| `xrpl.rpc.status` | string | `rpc.command.*` | Result: `"success"` or `"error"` | +| `xrpl.rpc.duration_ms` | int64 | `rpc.command.*` | Command execution time in milliseconds | +| `xrpl.rpc.error_message` | string | `rpc.command.*` | Error details (only set on failure) | + +**Tempo query**: `{span.xrpl.rpc.command="server_info"}` to find all `server_info` calls. + +**Prometheus label**: `xrpl_rpc_command` (dots converted to underscores by SpanMetrics). + +#### Transaction Attributes + +| Attribute | Type | Set On | Description | +| -------------------- | ------- | -------------------------- | ---------------------------------------------------- | +| `xrpl.tx.hash` | string | `tx.process`, `tx.receive` | Transaction hash (hex-encoded) | +| `xrpl.tx.local` | boolean | `tx.process` | `true` if locally submitted, `false` if peer-relayed | +| `xrpl.tx.path` | string | `tx.process` | Submission path: `"sync"` or `"async"` | +| `xrpl.tx.suppressed` | boolean | `tx.receive` | `true` if transaction was suppressed (duplicate) | +| `xrpl.tx.status` | string | `tx.receive` | Transaction status (e.g., `"known_bad"`) | + +**Tempo query**: `{span.xrpl.tx.hash="<hash>"}` to trace a specific transaction across nodes. + +**Prometheus label**: `xrpl_tx_local` (used as SpanMetrics dimension). 
+ +#### Consensus Attributes + +| Attribute | Type | Set On | Description | +| ------------------------------------ | ------- | --------------------------------------------------------------------------------------------------- | ------------------------------------------------------------- | +| `xrpl.consensus.round` | int64 | `consensus.proposal.send` | Consensus round number | +| `xrpl.consensus.mode` | string | `consensus.proposal.send`, `consensus.ledger_close` | Node mode: `"syncing"`, `"tracking"`, `"full"`, `"proposing"` | +| `xrpl.consensus.proposers` | int64 | `consensus.proposal.send`, `consensus.accept` | Number of proposers in the round | +| `xrpl.consensus.proposing` | boolean | `consensus.validation.send` | Whether this node was a proposer | +| `xrpl.consensus.ledger.seq` | int64 | `consensus.ledger_close`, `consensus.accept`, `consensus.validation.send`, `consensus.accept.apply` | Ledger sequence number | +| `xrpl.consensus.close_time` | int64 | `consensus.accept.apply` | Agreed-upon ledger close time (epoch seconds) | +| `xrpl.consensus.close_time_correct` | boolean | `consensus.accept.apply` | Whether validators reached agreement on close time | +| `xrpl.consensus.close_resolution_ms` | int64 | `consensus.accept.apply` | Close time rounding granularity in milliseconds | +| `xrpl.consensus.state` | string | `consensus.accept.apply` | Consensus outcome: `"finished"` or `"moved_on"` | +| `xrpl.consensus.round_time_ms` | int64 | `consensus.accept.apply` | Total consensus round duration in milliseconds | + +**Tempo query**: `{span.xrpl.consensus.mode="proposing"}` to find rounds where node was proposing. + +**Prometheus label**: `xrpl_consensus_mode` (used as SpanMetrics dimension). 
+ +#### Ledger Attributes + +| Attribute | Type | Set On | Description | +| ------------------------- | ----- | ------------------------------------------------------------- | ---------------------------------------------- | +| `xrpl.ledger.seq` | int64 | `ledger.build`, `ledger.validate`, `ledger.store`, `tx.apply` | Ledger sequence number | +| `xrpl.ledger.validations` | int64 | `ledger.validate` | Number of validations received for this ledger | +| `xrpl.ledger.tx_count` | int64 | `ledger.build`, `tx.apply` | Transactions in the ledger | +| `xrpl.ledger.tx_failed` | int64 | `ledger.build`, `tx.apply` | Failed transactions in the ledger | + +**Tempo query**: `{span.xrpl.ledger.seq=12345}` to find all spans for a specific ledger. + +#### Peer Attributes + +| Attribute | Type | Set On | Description | +| ------------------------------ | ------- | ---------------------------------------------------------------- | ---------------------------------------------------- | +| `xrpl.peer.id` | int64 | `tx.receive`, `peer.proposal.receive`, `peer.validation.receive` | Peer identifier | +| `xrpl.peer.proposal.trusted` | boolean | `peer.proposal.receive` | Whether the proposal came from a trusted validator | +| `xrpl.peer.validation.trusted` | boolean | `peer.validation.receive` | Whether the validation came from a trusted validator | + +**Prometheus labels**: `xrpl_peer_proposal_trusted`, `xrpl_peer_validation_trusted` (SpanMetrics dimensions). + +--- + +### 1.3 SpanMetrics — Derived Prometheus Metrics + +> **See also**: [01-architecture-analysis.md](./01-architecture-analysis.md) §1.8.2 for how span-derived metrics map to operational insights. + +The OTel Collector's SpanMetrics connector automatically generates RED (Rate, Errors, Duration) metrics from every span. No custom metrics code in rippled is needed. 
+ +| Prometheus Metric | Type | Description | +| -------------------------------------------------- | --------- | ------------------------------------------------------------------------------ | +| `traces_span_metrics_calls_total` | Counter | Total span invocations | +| `traces_span_metrics_duration_milliseconds_bucket` | Histogram | Latency distribution (buckets: 1, 5, 10, 25, 50, 100, 250, 500, 1000, 5000 ms) | +| `traces_span_metrics_duration_milliseconds_count` | Histogram | Observation count | +| `traces_span_metrics_duration_milliseconds_sum` | Histogram | Cumulative latency | + +**Standard labels on every metric**: `span_name`, `status_code`, `service_name`, `span_kind` + +**Additional dimension labels** (configured in `otel-collector-config.yaml`): + +| Span Attribute | Prometheus Label | Applies To | +| ------------------------------ | ------------------------------ | ------------------------- | +| `xrpl.rpc.command` | `xrpl_rpc_command` | `rpc.command.*` | +| `xrpl.rpc.status` | `xrpl_rpc_status` | `rpc.command.*` | +| `xrpl.consensus.mode` | `xrpl_consensus_mode` | `consensus.ledger_close` | +| `xrpl.tx.local` | `xrpl_tx_local` | `tx.process` | +| `xrpl.peer.proposal.trusted` | `xrpl_peer_proposal_trusted` | `peer.proposal.receive` | +| `xrpl.peer.validation.trusted` | `xrpl_peer_validation_trusted` | `peer.validation.receive` | + +**Where to query**: Prometheus → `traces_span_metrics_calls_total{span_name="rpc.command.server_info"}` + +--- + +## 2. StatsD Metrics (beast::insight) + +> **See also**: [02-design-decisions.md](./02-design-decisions.md) for the beast::insight coexistence design. [06-implementation-phases.md](./06-implementation-phases.md) for the Phase 6 metric inventory. + +These are system-level metrics emitted by rippled's `beast::insight` framework via StatsD UDP. They cover operational data that doesn't map to individual trace spans. 
+ +### Configuration + +```ini +[insight] +server=statsd +address=127.0.0.1:8125 +prefix=rippled +``` + +### 2.1 Gauges + +| Prometheus Metric | Source File | Description | Typical Range | +| --------------------------------------------------- | --------------------- | ----------------------------------------- | ------------------------------- | +| `rippled_LedgerMaster_Validated_Ledger_Age` | LedgerMaster.h | Seconds since last validated ledger | 0–10 (healthy), >30 (stale) | +| `rippled_LedgerMaster_Published_Ledger_Age` | LedgerMaster.h | Seconds since last published ledger | 0–10 (healthy) | +| `rippled_State_Accounting_Disconnected_duration` | NetworkOPs.cpp | Cumulative seconds in Disconnected state | Monotonic | +| `rippled_State_Accounting_Connected_duration` | NetworkOPs.cpp | Cumulative seconds in Connected state | Monotonic | +| `rippled_State_Accounting_Syncing_duration` | NetworkOPs.cpp | Cumulative seconds in Syncing state | Monotonic | +| `rippled_State_Accounting_Tracking_duration` | NetworkOPs.cpp | Cumulative seconds in Tracking state | Monotonic | +| `rippled_State_Accounting_Full_duration` | NetworkOPs.cpp | Cumulative seconds in Full state | Monotonic (should dominate) | +| `rippled_State_Accounting_Disconnected_transitions` | NetworkOPs.cpp | Count of transitions to Disconnected | Low | +| `rippled_State_Accounting_Connected_transitions` | NetworkOPs.cpp | Count of transitions to Connected | Low | +| `rippled_State_Accounting_Syncing_transitions` | NetworkOPs.cpp | Count of transitions to Syncing | Low | +| `rippled_State_Accounting_Tracking_transitions` | NetworkOPs.cpp | Count of transitions to Tracking | Low | +| `rippled_State_Accounting_Full_transitions` | NetworkOPs.cpp | Count of transitions to Full | Low (should be 1 after startup) | +| `rippled_Peer_Finder_Active_Inbound_Peers` | PeerfinderManager.cpp | Active inbound peer connections | 0–85 | +| `rippled_Peer_Finder_Active_Outbound_Peers` | PeerfinderManager.cpp | Active outbound 
peer connections | 10–21 | +| `rippled_Overlay_Peer_Disconnects` | OverlayImpl.cpp | Cumulative peer disconnection count | Low growth | +| `rippled_Overlay_Peer_Disconnects_Charges` | OverlayImpl.cpp | Disconnects due to resource limit charges | Low growth (subset of above) | +| `rippled_job_count` | JobQueue.cpp | Current job queue depth | 0–100 (healthy) | + +**Grafana dashboard**: _Node Health (StatsD)_ (`rippled-statsd-node-health`) + +### 2.2 Counters + +| Prometheus Metric | Source File | Description | +| --------------------------------- | ------------------ | --------------------------------------------- | +| `rippled_rpc_requests` | ServerHandler.cpp | Total RPC requests received | +| `rippled_ledger_fetches` | InboundLedgers.cpp | Inbound ledger fetch attempts | +| `rippled_ledger_history_mismatch` | LedgerHistory.cpp | Ledger hash mismatches detected | +| `rippled_warn` | Logic.h | Resource manager warnings issued | +| `rippled_drop` | Logic.h | Resource manager drops (connections rejected) | + +**Note**: `rippled_warn` and `rippled_drop` use non-standard StatsD meter type (`|m`). The OTel StatsD receiver only recognizes `|c`, `|g`, `|ms`, `|h`, `|s` — these metrics may be silently dropped. See Known Issues below. + +**Grafana dashboard**: _RPC & Pathfinding (StatsD)_ (`rippled-statsd-rpc`) + +### 2.3 Histograms (from StatsD timers) + +| Prometheus Metric | Source File | Unit | Description | +| ----------------------- | ----------------- | ----- | ------------------------------ | +| `rippled_rpc_time` | ServerHandler.cpp | ms | RPC response time distribution | +| `rippled_rpc_size` | ServerHandler.cpp | bytes | RPC response size distribution | +| `rippled_ios_latency` | Application.cpp | ms | I/O service loop latency | +| `rippled_pathfind_fast` | PathRequests.h | ms | Fast pathfinding duration | +| `rippled_pathfind_full` | PathRequests.h | ms | Full pathfinding duration | + +Quantiles collected: 0th, 50th, 90th, 95th, 99th, 100th percentile. 
+ +**Grafana dashboards**: _Node Health_ (`ios_latency`), _RPC & Pathfinding_ (`rpc_time`, `rpc_size`, `pathfind_*`) + +### 2.4 Overlay Traffic Metrics + +For each of the 45+ overlay traffic categories (defined in `TrafficCount.h`), four gauges are emitted: + +- `rippled_{category}_Bytes_In` +- `rippled_{category}_Bytes_Out` +- `rippled_{category}_Messages_In` +- `rippled_{category}_Messages_Out` + +**Key categories**: + +| Category | Description | +| ----------------------------------------------------------------- | -------------------------- | +| `total` | All traffic aggregated | +| `overhead` / `overhead_overlay` | Protocol overhead | +| `transactions` / `transactions_duplicate` | Transaction relay | +| `proposals` / `proposals_untrusted` / `proposals_duplicate` | Consensus proposals | +| `validations` / `validations_untrusted` / `validations_duplicate` | Consensus validations | +| `ledger_data_get` / `ledger_data_share` | Ledger data exchange | +| `ledger_data_Transaction_Node_get/share` | Transaction node data | +| `ledger_data_Account_State_Node_get/share` | Account state node data | +| `ledger_data_Transaction_Set_candidate_get/share` | Transaction set candidates | +| `getObject` / `haveTxSet` / `ledgerData` | Object requests | +| `ping` / `status` | Keepalive and status | +| `set_get` | Set requests | + +**Grafana dashboards**: _Network Traffic_ (`rippled-statsd-network`), _Overlay Traffic Detail_ (`rippled-statsd-overlay-detail`), _Ledger Data & Sync_ (`rippled-statsd-ledger-sync`) + +--- + +## 3. Grafana Dashboard Reference + +> **See also**: [05-configuration-reference.md](./05-configuration-reference.md) §5.8 for Grafana data source provisioning (Tempo, Prometheus) and TraceQL query examples. 
+ +### 3.1 Span-Derived Dashboards (5) + +| Dashboard | UID | Data Source | Key Panels | +| -------------------- | ---------------------- | ------------------------ | ---------------------------------------------------------------------------------- | +| RPC Performance | `rippled-rpc-perf` | Prometheus (SpanMetrics) | Request rate by command, p95 latency by command, error rate, heatmap, top commands | +| Transaction Overview | `rippled-transactions` | Prometheus (SpanMetrics) | Processing rate, latency p95/p50, local vs relay split, apply duration, heatmap | +| Consensus Health | `rippled-consensus` | Prometheus (SpanMetrics) | Round duration p95/p50, proposals rate, close duration, mode timeline, heatmap | +| Ledger Operations | `rippled-ledger-ops` | Prometheus (SpanMetrics) | Build rate, build duration, validation rate, store rate, build vs close comparison | +| Peer Network | `rippled-peer-net` | Prometheus (SpanMetrics) | Proposal receive rate, validation receive rate, trusted vs untrusted breakdown | + +### 3.2 StatsD Dashboards (5) + +| Dashboard | UID | Data Source | Key Panels | +| ---------------------- | ------------------------------- | ------------------- | --------------------------------------------------------------------------------- | +| Node Health | `rippled-statsd-node-health` | Prometheus (StatsD) | Ledger age, operating mode, I/O latency, job queue, fetch rate | +| Network Traffic | `rippled-statsd-network` | Prometheus (StatsD) | Active peers, disconnects, bytes in/out, messages in/out, traffic by category | +| RPC & Pathfinding | `rippled-statsd-rpc` | Prometheus (StatsD) | RPC rate, response time/size, pathfinding duration, resource warnings/drops | +| Overlay Traffic Detail | `rippled-statsd-overlay-detail` | Prometheus (StatsD) | Squelch, overhead, validator lists, set get/share, have/requested tx, proof paths | +| Ledger Data & Sync | `rippled-statsd-ledger-sync` | Prometheus (StatsD) | Ledger data exchange, legacy ledger share/get, 
getobject by type, traffic heatmap | + +### 3.3 Accessing the Dashboards + +1. Open Grafana at **http://localhost:3000** +2. Navigate to **Dashboards → rippled** folder +3. All 10 dashboards are auto-provisioned from `docker/telemetry/grafana/dashboards/` + +--- + +## 4. Tempo Trace Search Guide + +> **See also**: [08-appendix.md](./08-appendix.md) §8.2 for span hierarchy visualizations. [05-configuration-reference.md](./05-configuration-reference.md) §5.8.5 for TraceQL query examples. + +### Finding Traces by Type + +| What to Find | Tempo TraceQL Query | +| ------------------------ | -------------------------------------------------------------------------------- | +| All RPC calls | `{resource.service.name="rippled" && name="rpc.request"}` | +| Specific RPC command | `{resource.service.name="rippled" && name="rpc.command.server_info"}` | +| Slow RPC calls | `{resource.service.name="rippled" && name=~"rpc.command.*" && duration > 100ms}` | +| Failed RPC calls | `{span.xrpl.rpc.status="error"}` | +| Specific transaction | `{span.xrpl.tx.hash="<tx_hash>"}` | +| Local transactions only | `{span.xrpl.tx.local=true}` | +| Consensus rounds | `{resource.service.name="rippled" && name="consensus.accept"}` | +| Rounds by mode | `{span.xrpl.consensus.mode="proposing"}` | +| Specific ledger | `{span.xrpl.ledger.seq=12345}` | +| Peer proposals (trusted) | `{span.xrpl.peer.proposal.trusted=true}` | + +### Trace Structure + +A typical RPC trace shows the span hierarchy: + +``` +rpc.request (ServerHandler) + └── rpc.process (ServerHandler) + └── rpc.command.server_info (RPCHandler) +``` + +A consensus round produces mostly independent spans (only `tx.apply` is nested, as a child of `ledger.build`): + +``` +consensus.ledger_close (close event) +consensus.proposal.send (broadcast proposal) +ledger.build (build new ledger) + └── tx.apply (apply transaction set) +consensus.accept (accept result) +consensus.validation.send (send validation) +ledger.validate (promote to validated) +ledger.store (persist to DB) +``` + +--- + +## 5. 
Prometheus Query Examples + +> **See also**: [05-configuration-reference.md](./05-configuration-reference.md) §5.8.7 for correlating Prometheus StatsD metrics with trace-derived metrics. + +### Span-Derived Metrics + +```promql +# RPC request rate by command (last 5 minutes) +sum by (xrpl_rpc_command) (rate(traces_span_metrics_calls_total{span_name=~"rpc.command.*"}[5m])) + +# RPC p95 latency by command +histogram_quantile(0.95, sum by (le, xrpl_rpc_command) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=~"rpc.command.*"}[5m]))) + +# Consensus round duration p95 +histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name="consensus.accept"}[5m]))) + +# Transaction processing rate (local vs relay) +sum by (xrpl_tx_local) (rate(traces_span_metrics_calls_total{span_name="tx.process"}[5m])) + +# Trusted vs untrusted proposal rate +sum by (xrpl_peer_proposal_trusted) (rate(traces_span_metrics_calls_total{span_name="peer.proposal.receive"}[5m])) +``` + +### StatsD Metrics + +```promql +# Validated ledger age (should be < 10s) +rippled_LedgerMaster_Validated_Ledger_Age + +# Active peer count +rippled_Peer_Finder_Active_Inbound_Peers + rippled_Peer_Finder_Active_Outbound_Peers + +# RPC response time p95 +histogram_quantile(0.95, rippled_rpc_time_bucket) + +# Total network bytes in (rate) +rate(rippled_total_Bytes_In[5m]) + +# Cumulative seconds spent in the "Full" operating mode (should keep growing once the node is synced) +rippled_State_Accounting_Full_duration +``` + +--- + +## 6. 
Known Issues + +| Issue | Impact | Status | +| ------------------------------------------------------------------ | ------------------------------------------------ | -------------------------------------------------------------------- | +| `warn` and `drop` metrics use non-standard StatsD `\|m` meter type | Metrics silently dropped by OTel StatsD receiver | Phase 6 Task 6.1 — needs `\|m` → `\|c` change in StatsDCollector.cpp | +| `rippled_job_count` may not emit in standalone mode | Missing from Prometheus in some test configs | Requires active job queue activity | +| `rippled_rpc_requests` depends on `[insight]` config | Zero series if StatsD not configured | Requires `[insight] server=statsd` in xrpld.cfg | +| Peer tracing disabled by default | No `peer.*` spans unless `trace_peer=1` | Intentional — high volume on mainnet | + +--- + +## 7. Privacy and Data Collection + +The telemetry system is designed with privacy in mind: + +- **No private keys** are ever included in spans or metrics +- **No account balances** or financial data is traced +- **Transaction hashes** are included (public on-ledger data) but not transaction contents +- **Peer IDs** are internal identifiers, not IP addresses +- **All telemetry is opt-in** — disabled by default at build time (`-Dtelemetry=OFF`) +- **Sampling** reduces data volume — `sampling_ratio=0.01` recommended for production +- **Data stays local** — the default stack sends data to `localhost` only + +--- + +## 8. Configuration Quick Reference + +> **Full reference**: [05-configuration-reference.md](./05-configuration-reference.md) §5.1 for all `[telemetry]` options with defaults, the config parser implementation, and collector YAML configurations (dev and production). 
+ +### Minimal Setup (development) + +```ini +[telemetry] +enabled=1 + +[insight] +server=statsd +address=127.0.0.1:8125 +prefix=rippled +``` + +### Production Setup + +```ini +[telemetry] +enabled=1 +endpoint=http://otel-collector:4318/v1/traces +sampling_ratio=0.01 +trace_peer=0 +batch_size=1024 +max_queue_size=4096 + +[insight] +server=statsd +address=otel-collector:8125 +prefix=rippled +``` + +### Trace Category Toggle + +| Config Key | Default | Controls | +| -------------------- | ------- | ---------------------------- | +| `trace_rpc` | `1` | `rpc.*` spans | +| `trace_transactions` | `1` | `tx.*` spans | +| `trace_consensus` | `1` | `consensus.*` spans | +| `trace_ledger` | `1` | `ledger.*` spans | +| `trace_peer` | `0` | `peer.*` spans (high volume) | diff --git a/OpenTelemetryPlan/OpenTelemetryPlan.md b/OpenTelemetryPlan/OpenTelemetryPlan.md index 1161b99015..2bd6f07868 100644 --- a/OpenTelemetryPlan/OpenTelemetryPlan.md +++ b/OpenTelemetryPlan/OpenTelemetryPlan.md @@ -55,6 +55,7 @@ flowchart TB backends["07-observability-backends.md"] appendix["08-appendix.md"] poc["POC_taskList.md"] + dataref["09-data-collection-reference.md"] end overview --> fundamentals @@ -71,6 +72,7 @@ flowchart TB phases --> backends backends --> appendix phases --> poc + appendix --> dataref style overview fill:#1b5e20,stroke:#0d3d14,color:#fff,stroke-width:2px style fundamentals fill:#00695c,stroke:#004d40,color:#fff @@ -87,6 +89,7 @@ flowchart TB style backends fill:#4a148c,stroke:#2e0d57,color:#fff style appendix fill:#4a148c,stroke:#2e0d57,color:#fff style poc fill:#4a148c,stroke:#2e0d57,color:#fff + style dataref fill:#4a148c,stroke:#2e0d57,color:#fff ``` @@ -95,18 +98,19 @@ flowchart TB ## Table of Contents -| Section | Document | Description | -| ------- | ---------------------------------------------------------- | ---------------------------------------------------------------------- | -| **0** | [Tracing Fundamentals](./00-tracing-fundamentals.md) | Distributed tracing 
concepts, span relationships, context propagation | -| **1** | [Architecture Analysis](./01-architecture-analysis.md) | xrpld component analysis, trace points, instrumentation priorities | -| **2** | [Design Decisions](./02-design-decisions.md) | SDK selection, exporters, span naming, attributes, context propagation | -| **3** | [Implementation Strategy](./03-implementation-strategy.md) | Directory structure, key principles, performance optimization | -| **4** | [Code Samples](./04-code-samples.md) | C++ implementation examples for core infrastructure and key modules | -| **5** | [Configuration Reference](./05-configuration-reference.md) | xrpld config, CMake integration, Collector configurations | -| **6** | [Implementation Phases](./06-implementation-phases.md) | 5-phase timeline, tasks, risks, success metrics | -| **7** | [Observability Backends](./07-observability-backends.md) | Backend selection guide and production architecture | -| **8** | [Appendix](./08-appendix.md) | Glossary, references, version history | -| **POC** | [POC Task List](./POC_taskList.md) | Proof of concept tasks for RPC tracing end-to-end demo | +| Section | Document | Description | +| ------- | -------------------------------------------------------------- | ---------------------------------------------------------------------- | +| **0** | [Tracing Fundamentals](./00-tracing-fundamentals.md) | Distributed tracing concepts, span relationships, context propagation | +| **1** | [Architecture Analysis](./01-architecture-analysis.md) | xrpld component analysis, trace points, instrumentation priorities | +| **2** | [Design Decisions](./02-design-decisions.md) | SDK selection, exporters, span naming, attributes, context propagation | +| **3** | [Implementation Strategy](./03-implementation-strategy.md) | Directory structure, key principles, performance optimization | +| **4** | [Code Samples](./04-code-samples.md) | C++ implementation examples for core infrastructure and key modules | +| **5** 
| [Configuration Reference](./05-configuration-reference.md) | xrpld config, CMake integration, Collector configurations | +| **6** | [Implementation Phases](./06-implementation-phases.md) | 5-phase timeline, tasks, risks, success metrics | +| **7** | [Observability Backends](./07-observability-backends.md) | Backend selection guide and production architecture | +| **8** | [Appendix](./08-appendix.md) | Glossary, references, version history | +| **9** | [Data Collection Reference](./09-data-collection-reference.md) | Complete inventory of spans, attributes, metrics, and dashboards | +| **POC** | [POC Task List](./POC_taskList.md) | Proof of concept tasks for RPC tracing end-to-end demo | --- @@ -220,6 +224,14 @@ The appendix contains a glossary of OpenTelemetry and xrpld-specific terms, refe --- +## 9. Data Collection Reference + +A single-source-of-truth reference documenting every piece of telemetry data collected by rippled. Covers all 16 OpenTelemetry spans with their 22 attributes, all StatsD metrics (gauges, counters, histograms, overlay traffic), SpanMetrics-derived Prometheus metrics, and all 10 Grafana dashboards. Includes Tempo trace search guides and Prometheus query examples. + +➡️ **[View Data Collection Reference](./09-data-collection-reference.md)** + +--- + ## POC Task List A step-by-step task list for building a minimal end-to-end proof of concept that demonstrates distributed tracing in xrpld. The POC scope is limited to RPC tracing — showing request traces flowing from xrpld through an OpenTelemetry Collector into Tempo, viewable in Grafana. diff --git a/docker/telemetry/TESTING.md b/docker/telemetry/TESTING.md index 874c7b40c3..45a2541c0d 100644 --- a/docker/telemetry/TESTING.md +++ b/docker/telemetry/TESTING.md @@ -374,21 +374,27 @@ See the "Verification Queries" section below. 
## Expected Span Catalog -All 12 production span names instrumented across Phases 2-4: +All 16 production span names instrumented across Phases 2-5: -| Span Name | Source File | Phase | Key Attributes | How to Trigger | -| --------------------------- | --------------------- | ----- | --------------------------------------------------------------------------------- | ------------------------- | -| `rpc.request` | ServerHandler.cpp:271 | 2 | -- | Any HTTP RPC call | -| `rpc.process` | ServerHandler.cpp:573 | 2 | -- | Any HTTP RPC call | -| `rpc.ws_message` | ServerHandler.cpp:384 | 2 | -- | WebSocket RPC message | -| `rpc.command.` | RPCHandler.cpp:161 | 2 | `xrpl.rpc.command`, `xrpl.rpc.version`, `xrpl.rpc.role` | Any RPC command | -| `tx.process` | NetworkOPs.cpp:1227 | 3 | `xrpl.tx.hash`, `xrpl.tx.local`, `xrpl.tx.path` | Submit transaction | -| `tx.receive` | PeerImp.cpp:1273 | 3 | `xrpl.peer.id` | Peer relays transaction | -| `consensus.proposal.send` | RCLConsensus.cpp:177 | 4 | `xrpl.consensus.round` | Consensus proposing phase | -| `consensus.ledger_close` | RCLConsensus.cpp:282 | 4 | `xrpl.consensus.ledger.seq`, `xrpl.consensus.mode` | Ledger close event | -| `consensus.accept` | RCLConsensus.cpp:395 | 4 | `xrpl.consensus.proposers`, `xrpl.consensus.round_time_ms` | Ledger accepted | -| `consensus.validation.send` | RCLConsensus.cpp:753 | 4 | `xrpl.consensus.ledger.seq`, `xrpl.consensus.proposing` | Validation sent | -| `consensus.accept.apply` | RCLConsensus.cpp:453 | 4 | `xrpl.consensus.close_time`, `close_time_correct`, `close_resolution_ms`, `state` | Ledger apply + close time | +| Span Name | Source File | Phase | Key Attributes | How to Trigger | +| --------------------------- | --------------------- | ----- | ---------------------------------------------------------------------------------------- | ------------------------- | +| `rpc.request` | ServerHandler.cpp:271 | 2 | -- | Any HTTP RPC call | +| `rpc.process` | ServerHandler.cpp:573 | 2 | -- | Any 
HTTP RPC call | +| `rpc.ws_message` | ServerHandler.cpp:384 | 2 | -- | WebSocket RPC message | +| `rpc.command.` | RPCHandler.cpp:161 | 2 | `xrpl.rpc.command`, `xrpl.rpc.version`, `xrpl.rpc.role` | Any RPC command | +| `tx.process` | NetworkOPs.cpp:1227 | 3 | `xrpl.tx.hash`, `xrpl.tx.local`, `xrpl.tx.path` | Submit transaction | +| `tx.receive` | PeerImp.cpp:1273 | 3 | `xrpl.peer.id` | Peer relays transaction | +| `consensus.proposal.send` | RCLConsensus.cpp:177 | 4 | `xrpl.consensus.round` | Consensus proposing phase | +| `consensus.ledger_close` | RCLConsensus.cpp:282 | 4 | `xrpl.consensus.ledger.seq`, `xrpl.consensus.mode` | Ledger close event | +| `consensus.accept` | RCLConsensus.cpp:395 | 4 | `xrpl.consensus.proposers`, `xrpl.consensus.round_time_ms` | Ledger accepted | +| `consensus.validation.send` | RCLConsensus.cpp:753 | 4 | `xrpl.consensus.ledger.seq`, `xrpl.consensus.proposing` | Validation sent | +| `consensus.accept.apply` | RCLConsensus.cpp:453 | 4 | `xrpl.consensus.close_time`, `close_time_correct`, `close_resolution_ms`, `state` | Ledger apply + close time | +| `tx.apply` | BuildLedger.cpp:88 | 5 | `xrpl.ledger.tx_count`, `xrpl.ledger.tx_failed` | Ledger close (tx set) | +| `ledger.build` | BuildLedger.cpp:31 | 5 | `xrpl.ledger.seq`, `xrpl.ledger.close_time`, `close_time_correct`, `close_resolution_ms` | Ledger build | +| `ledger.validate` | LedgerMaster.cpp:915 | 5 | `xrpl.ledger.seq`, `xrpl.ledger.validations` | Ledger validated | +| `ledger.store` | LedgerMaster.cpp:409 | 5 | `xrpl.ledger.seq` | Ledger stored | +| `peer.proposal.receive` | PeerImp.cpp:1667 | 5 | `xrpl.peer.id`, `xrpl.peer.proposal.trusted` | Peer sends proposal | +| `peer.validation.receive` | PeerImp.cpp:2264 | 5 | `xrpl.peer.id`, `xrpl.peer.validation.trusted` | Peer sends validation | --- @@ -407,10 +413,12 @@ curl -s "$TEMPO/api/v2/search/tag/resource.service.name/values" | jq '.tagValues # Query traces by operation for op in "rpc.request" "rpc.process" \ 
"rpc.command.server_info" "rpc.command.server_state" "rpc.command.ledger" \ - "tx.process" "tx.receive" \ + "tx.process" "tx.receive" "tx.apply" \ "consensus.proposal.send" "consensus.ledger_close" \ "consensus.accept" "consensus.accept.apply" \ - "consensus.validation.send"; do + "consensus.validation.send" \ + "ledger.build" "ledger.validate" "ledger.store" \ + "peer.proposal.receive" "peer.validation.receive"; do count=$(curl -s "$TEMPO/api/search" \ --data-urlencode "q={resource.service.name=\"xrpld\" && name=\"$op\"}" \ --data-urlencode "limit=5" \ @@ -445,9 +453,11 @@ Open http://localhost:3000 (anonymous admin access enabled). Pre-configured dashboards: -- **RPC Performance**: Request rates, latency percentiles by command -- **Transaction Overview**: Transaction processing rates and paths -- **Consensus Health**: Consensus round duration and proposer counts +- **RPC Performance**: Request rates, latency percentiles by command, top commands, WebSocket rate +- **Transaction Overview**: Transaction processing rates, apply duration, peer relay, failed tx rate +- **Consensus Health**: Consensus round duration, proposer counts, mode tracking, accept heatmap +- **Ledger Operations**: Build/validate/store rates and durations, TX apply metrics +- **Peer Network**: Proposal/validation receive rates, trusted vs untrusted breakdown (requires `trace_peer=1`) Pre-configured datasources: diff --git a/docker/telemetry/docker-compose.yml b/docker/telemetry/docker-compose.yml index caf84b9767..30cf81b849 100644 --- a/docker/telemetry/docker-compose.yml +++ b/docker/telemetry/docker-compose.yml @@ -24,10 +24,11 @@ services: image: otel/opentelemetry-collector-contrib:0.121.0 command: ["--config=/etc/otel-collector-config.yaml"] ports: - - "4317:4317" # OTLP gRPC receiver - - "4318:4318" # OTLP HTTP receiver (xrpld sends traces here) - - "8889:8889" # Prometheus metrics (spanmetrics) - - "13133:13133" # Health check endpoint + - "4317:4317" # OTLP gRPC + - "4318:4318" # OTLP 
HTTP + - "8125:8125/udp" # StatsD UDP (beast::insight metrics) + - "8889:8889" # Prometheus metrics (spanmetrics + statsd) + - "13133:13133" # Health check volumes: # Mount collector pipeline config (receivers → processors → exporters) - ./otel-collector-config.yaml:/etc/otel-collector-config.yaml:ro diff --git a/docker/telemetry/grafana/dashboards/consensus-health.json b/docker/telemetry/grafana/dashboards/consensus-health.json index ef202e7353..8b3719dd34 100644 --- a/docker/telemetry/grafana/dashboards/consensus-health.json +++ b/docker/telemetry/grafana/dashboards/consensus-health.json @@ -10,6 +10,7 @@ "panels": [ { "title": "Consensus Round Duration", + "description": "p95 and p50 duration of consensus accept rounds. The consensus.accept span (RCLConsensus.cpp:395) measures the time to process an accepted ledger including transaction application and state finalization. The span carries xrpl.consensus.proposers and xrpl.consensus.round_time_ms attributes. Normal range is 3-6 seconds on mainnet.", "type": "timeseries", "gridPos": { "h": 8, @@ -17,31 +18,45 @@ "x": 0, "y": 0 }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, "targets": [ { "datasource": { "type": "prometheus" }, - "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.accept\"}[5m])))", - "legendFormat": "P95 Round Duration" + "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.accept\"}[5m])))", + "legendFormat": "P95 Round Duration [{{exported_instance}}]" }, { "datasource": { "type": "prometheus" }, - "expr": "histogram_quantile(0.50, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.accept\"}[5m])))", - "legendFormat": "P50 Round Duration" + "expr": "histogram_quantile(0.50, sum by 
(le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.accept\"}[5m])))", + "legendFormat": "P50 Round Duration [{{exported_instance}}]" } ], "fieldConfig": { "defaults": { - "unit": "ms" + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } }, "overrides": [] } }, { "title": "Consensus Proposals Sent Rate", + "description": "Rate at which this node sends consensus proposals to the network. Sourced from the consensus.proposal.send span (RCLConsensus.cpp:177) which fires each time the node proposes a transaction set. The span carries xrpl.consensus.round identifying the consensus round number. A healthy proposing node should show steady proposal output.", "type": "timeseries", "gridPos": { "h": 8, @@ -49,24 +64,38 @@ "x": 12, "y": 0 }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, "targets": [ { "datasource": { "type": "prometheus" }, - "expr": "sum(rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.proposal.send\"}[5m]))", - "legendFormat": "Proposals / Sec" + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.proposal.send\"}[5m]))", + "legendFormat": "Proposals / Sec [{{exported_instance}}]" } ], "fieldConfig": { "defaults": { - "unit": "ops" + "unit": "ops", + "custom": { + "axisLabel": "Proposals / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } }, "overrides": [] } }, { "title": "Ledger Close Duration", + "description": "p95 duration of the ledger close event. The consensus.ledger_close span (RCLConsensus.cpp:282) measures the time from when consensus triggers a ledger close to completion. Carries xrpl.consensus.ledger.seq and xrpl.consensus.mode attributes. 
Compare with Consensus Round Duration to understand how close timing relates to overall round time.", "type": "timeseries", "gridPos": { "h": 8, @@ -74,24 +103,38 @@ "x": 0, "y": 8 }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, "targets": [ { "datasource": { "type": "prometheus" }, - "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.ledger_close\"}[5m])))", - "legendFormat": "P95 Close Duration" + "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.ledger_close\"}[5m])))", + "legendFormat": "P95 Close Duration [{{exported_instance}}]" } ], "fieldConfig": { "defaults": { - "unit": "ms" + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } }, "overrides": [] } }, { "title": "Validation Send Rate", + "description": "Rate at which this node sends ledger validations to the network. Sourced from the consensus.validation.send span (RCLConsensus.cpp:753). Each validation confirms the node has fully validated a ledger. The span carries xrpl.consensus.ledger.seq and xrpl.consensus.proposing. 
Should closely track the ledger close rate when the node is healthy.", "type": "stat", "gridPos": { "h": 8, @@ -99,13 +142,19 @@ "x": 12, "y": 8 }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, "targets": [ { "datasource": { "type": "prometheus" }, - "expr": "sum(rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.validation.send\"}[5m]))", - "legendFormat": "Validations / Sec" + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.validation.send\"}[5m]))", + "legendFormat": "Validations / Sec [{{exported_instance}}]" } ], "fieldConfig": { @@ -130,15 +179,15 @@ "datasource": { "type": "prometheus" }, - "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.accept.apply\"}[5m])))", - "legendFormat": "P95 Apply Duration" + "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.accept.apply\"}[5m])))", + "legendFormat": "P95 Apply Duration [{{exported_instance}}]" }, { "datasource": { "type": "prometheus" }, - "expr": "histogram_quantile(0.50, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.accept.apply\"}[5m])))", - "legendFormat": "P50 Apply Duration" + "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.accept.apply\"}[5m])))", + "legendFormat": "P50 Apply Duration [{{exported_instance}}]" } ], "fieldConfig": { @@ -170,8 +219,8 @@ "datasource": { "type": "prometheus" }, - "expr": "sum(rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.accept.apply\"}[5m]))", - "legendFormat": "Total Rounds / 
Sec" + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.accept.apply\"}[5m]))", + "legendFormat": "Total Rounds / Sec [{{exported_instance}}]" } ], "fieldConfig": { @@ -187,6 +236,167 @@ }, "overrides": [] } + }, + { + "title": "Consensus Mode Over Time", + "description": "Breakdown of consensus ledger close events by the node's consensus mode (Proposing, Observing, Wrong Ledger, Switched Ledger). Grouped by the xrpl.consensus.mode span attribute from consensus.ledger_close. A healthy validator should be predominantly in Proposing mode. Frequent Wrong Ledger or Switched Ledger indicates sync issues.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (xrpl_consensus_mode, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_consensus_mode=~\"$consensus_mode\", span_name=\"consensus.ledger_close\"}[5m]))", + "legendFormat": "{{xrpl_consensus_mode}} [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Events / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Accept vs Close Rate", + "description": "Compares the rate of consensus.accept (ledger accepted after consensus) vs consensus.ledger_close (ledger close initiated). These should track closely in a healthy network. 
A divergence means some close events are not completing the accept phase, potentially indicating consensus failures or timeouts.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.accept\"}[5m]))", + "legendFormat": "Accepts / Sec [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.ledger_close\"}[5m]))", + "legendFormat": "Closes / Sec [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Events / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Validation vs Close Rate", + "description": "Compares the rate of consensus.validation.send vs consensus.ledger_close. Each validated ledger should produce one validation message. 
If validations lag behind closes, the node may be falling behind on validation or experiencing issues with the validation pipeline.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 32 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.validation.send\"}[5m]))", + "legendFormat": "Validations / Sec [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.ledger_close\"}[5m]))", + "legendFormat": "Closes / Sec [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Events / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Consensus Accept Duration Heatmap", + "description": "Heatmap showing the distribution of consensus.accept span durations across histogram buckets over time. Each cell represents how many accept events fell into that duration bucket in a 5m window. 
Useful for detecting outlier consensus rounds that take abnormally long.", + "type": "heatmap", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "yAxis": { + "axisLabel": "Duration (ms)" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum(increase(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.accept\"}[5m])) by (le)", + "legendFormat": "{{le}}", + "format": "heatmap" + } + ] } ], "schemaVersion": 39, @@ -196,7 +406,7 @@ { "name": "node", "label": "Node", - "description": "Filter by rippled node (service.instance.id \u2014 e.g. Node-1)", + "description": "Filter by rippled node (service.instance.id — e.g. Node-1)", "type": "query", "query": "label_values(traces_span_metrics_calls_total, exported_instance)", "datasource": { @@ -239,6 +449,6 @@ "from": "now-1h", "to": "now" }, - "title": "rippled Consensus Health", + "title": "Consensus Health", "uid": "rippled-consensus" } diff --git a/docker/telemetry/grafana/dashboards/ledger-operations.json b/docker/telemetry/grafana/dashboards/ledger-operations.json new file mode 100644 index 0000000000..67711e4fa8 --- /dev/null +++ b/docker/telemetry/grafana/dashboards/ledger-operations.json @@ -0,0 +1,353 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "title": "Ledger Build Rate", + "description": "Rate at which new ledgers are being built. The ledger.build span (BuildLedger.cpp:31) wraps the entire buildLedgerImpl() function which creates a new ledger from a parent, applies transactions, flushes SHAMap nodes, and sets the accepted state. 
Should match the consensus close rate (~0.25/sec on mainnet with ~4s rounds).", + "type": "stat", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"ledger.build\"}[5m]))", + "legendFormat": "Builds / Sec [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + } + }, + { + "title": "Ledger Build Duration", + "description": "p95 and p50 duration of ledger builds. Measures the full buildLedgerImpl() call including transaction application, SHAMap flushing, and ledger acceptance. The span records xrpl.ledger.seq as an attribute. Long build times indicate expensive transaction sets or I/O pressure from SHAMap flushes.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"ledger.build\"}[5m])))", + "legendFormat": "P95 Build Duration [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"ledger.build\"}[5m])))", + "legendFormat": "P50 Build Duration [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Ledger Validation Rate", + 
"description": "Rate at which ledgers pass the validation threshold and are accepted as fully validated. The ledger.validate span (LedgerMaster.cpp:915) fires in checkAccept() only after the ledger receives sufficient trusted validations (>= quorum). Records xrpl.ledger.seq and xrpl.ledger.validations (the number of validations received).", + "type": "stat", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"ledger.validate\"}[5m]))", + "legendFormat": "Validations / Sec [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + } + }, + { + "title": "Ledger Build Duration Heatmap", + "description": "Heatmap showing the distribution of ledger.build durations across histogram buckets over time. Each cell represents the count of ledger builds that fell into that duration bucket in a 5m window. Useful for spotting occasional slow ledger builds that may not appear in percentile charts.", + "type": "heatmap", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "yAxis": { + "axisLabel": "Duration (ms)" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum(increase(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"ledger.build\"}[5m])) by (le)", + "legendFormat": "{{le}}", + "format": "heatmap" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms" + }, + "overrides": [] + } + }, + { + "title": "Transaction Apply Duration", + "description": "p95 and p50 duration of applying the consensus transaction set during ledger building. 
The tx.apply span (BuildLedger.cpp:88) wraps applyTransactions() which iterates through the CanonicalTXSet with multiple retry passes. Records xrpl.ledger.tx_count (successful) and xrpl.ledger.tx_failed (failed) as attributes.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"tx.apply\"}[5m])))", + "legendFormat": "P95 tx.apply [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"tx.apply\"}[5m])))", + "legendFormat": "P50 tx.apply [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Transaction Apply Rate", + "description": "Rate of tx.apply span invocations, reflecting how frequently the transaction application phase runs during ledger building. Each ledger build triggers one tx.apply call. 
Should closely match the ledger build rate.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"tx.apply\"}[5m]))", + "legendFormat": "tx.apply / Sec [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Operations / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Ledger Store Rate", + "description": "Rate at which ledgers are stored into the ledger history. The ledger.store span (LedgerMaster.cpp:409) wraps storeLedger() which inserts the ledger into the LedgerHistory cache. Records xrpl.ledger.seq. Should match the ledger build rate under normal operation.", + "type": "stat", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"ledger.store\"}[5m]))", + "legendFormat": "Stores / Sec [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + } + }, + { + "title": "Build vs Close Duration", + "description": "Compares p95 durations of ledger.build (the actual ledger construction in BuildLedger.cpp) vs consensus.ledger_close (the consensus close event in RCLConsensus.cpp). Build time is a subset of close time. 
A large gap between them indicates overhead in the consensus pipeline outside of ledger construction itself.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"ledger.build\"}[5m])))", + "legendFormat": "P95 ledger.build [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.ledger_close\"}[5m])))", + "legendFormat": "P95 consensus.ledger_close [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + } + ], + "schemaVersion": 39, + "tags": ["rippled", "ledger", "telemetry"], + "templating": { + "list": [ + { + "name": "node", + "label": "Node", + "description": "Filter by rippled node (service.instance.id \u2014 e.g. 
Node-1)", + "type": "query", + "query": "label_values(traces_span_metrics_calls_total, exported_instance)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "includeAll": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "multi": true, + "refresh": 2, + "sort": 1 + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "title": "Ledger Operations", + "uid": "rippled-ledger-ops" +} diff --git a/docker/telemetry/grafana/dashboards/peer-network.json b/docker/telemetry/grafana/dashboards/peer-network.json new file mode 100644 index 0000000000..9740b04366 --- /dev/null +++ b/docker/telemetry/grafana/dashboards/peer-network.json @@ -0,0 +1,227 @@ +{ + "annotations": { + "list": [] + }, + "description": "Requires trace_peer=1 in the [telemetry] config section.", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "title": "Peer Proposal Receive Rate", + "description": "Rate of consensus proposals received from network peers. The peer.proposal.receive span (PeerImp.cpp:1667) fires in onMessage(TMProposeSet) for each incoming proposal. Records xrpl.peer.id (sending peer) and xrpl.peer.proposal.trusted (whether the proposer is in our UNL). 
Requires trace_peer=1 in the telemetry config.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"peer.proposal.receive\"}[5m]))", + "legendFormat": "Proposals Received / Sec [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Proposals / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Peer Validation Receive Rate", + "description": "Rate of ledger validations received from network peers. The peer.validation.receive span (PeerImp.cpp:2264) fires in onMessage(TMValidation) for each incoming validation message. Records xrpl.peer.id (sending peer) and xrpl.peer.validation.trusted (whether the validator is trusted). Requires trace_peer=1 in the telemetry config.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"peer.validation.receive\"}[5m]))", + "legendFormat": "Validations Received / Sec [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Validations / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Proposals Trusted vs Untrusted", + "description": "Pie chart showing the ratio of proposals received from trusted validators (in our UNL) vs untrusted validators. 
Grouped by the xrpl.peer.proposal.trusted span attribute (true/false). A healthy node connected to a well-configured UNL should see a significant portion of trusted proposals. Note: proposals that fail early validation may not have the trusted attribute set.", + "type": "piechart", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (xrpl_peer_proposal_trusted, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_peer_proposal_trusted=~\"$proposal_trusted\", span_name=\"peer.proposal.receive\"}[5m]))", + "legendFormat": "Trusted = {{xrpl_peer_proposal_trusted}} [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + } + }, + { + "title": "Validations Trusted vs Untrusted", + "description": "Pie chart showing the ratio of validations received from trusted validators (in our UNL) vs untrusted validators. Grouped by the xrpl.peer.validation.trusted span attribute (true/false). Monitoring this helps detect if the node is receiving validations from the expected set of trusted validators. 
Note: validations that fail early checks may not have the trusted attribute set.", + "type": "piechart", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (xrpl_peer_validation_trusted, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_peer_validation_trusted=~\"$validation_trusted\", span_name=\"peer.validation.receive\"}[5m]))", + "legendFormat": "Trusted = {{xrpl_peer_validation_trusted}} [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + } + } + ], + "schemaVersion": 39, + "tags": ["rippled", "peer", "telemetry"], + "templating": { + "list": [ + { + "name": "node", + "label": "Node", + "description": "Filter by rippled node (service.instance.id \u2014 e.g. Node-1)", + "type": "query", + "query": "label_values(traces_span_metrics_calls_total, exported_instance)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "includeAll": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "multi": true, + "refresh": 2, + "sort": 1 + }, + { + "name": "proposal_trusted", + "label": "Proposal Trusted", + "description": "Filter by proposal trust status (true = from trusted validator)", + "type": "query", + "query": "label_values(traces_span_metrics_calls_total{span_name=\"peer.proposal.receive\"}, xrpl_peer_proposal_trusted)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "includeAll": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "multi": true, + "refresh": 2, + "sort": 1 + }, + { + "name": "validation_trusted", + "label": "Validation Trusted", + "description": "Filter by validation trust status (true = from trusted validator)", + "type": "query", + "query": 
"label_values(traces_span_metrics_calls_total{span_name=\"peer.validation.receive\"}, xrpl_peer_validation_trusted)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "includeAll": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "multi": true, + "refresh": 2, + "sort": 1 + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "title": "Peer Network", + "uid": "rippled-peer-net" +} diff --git a/docker/telemetry/grafana/dashboards/rpc-performance.json b/docker/telemetry/grafana/dashboards/rpc-performance.json index 99cfe82699..dec11c506d 100644 --- a/docker/telemetry/grafana/dashboards/rpc-performance.json +++ b/docker/telemetry/grafana/dashboards/rpc-performance.json @@ -10,6 +10,7 @@ "panels": [ { "title": "RPC Request Rate by Command", + "description": "Per-second rate of RPC command executions, broken down by command name (e.g. server_info, submit). Calculated as rate(traces_span_metrics_calls_total{span_name=~\"rpc.command.*\"}) over a 5m window, grouped by the xrpl.rpc.command span attribute.", "type": "timeseries", "gridPos": { "h": 8, @@ -17,13 +18,19 @@ "x": 0, "y": 0 }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, "targets": [ { "datasource": { "type": "prometheus" }, - "expr": "sum by (xrpl_rpc_command) (rate(traces_span_metrics_calls_total{xrpl_rpc_command=~\"$command\", exported_instance=~\"$node\", span_name=~\"rpc.command.*\"}[5m]))", - "legendFormat": "{{xrpl_rpc_command}}" + "expr": "sum by (xrpl_rpc_command, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_rpc_command=~\"$command\", span_name=~\"rpc.command.*\"}[5m]))", + "legendFormat": "{{xrpl_rpc_command}} [{{exported_instance}}]" } ], "fieldConfig": { @@ -42,6 +49,7 @@ }, { "title": "RPC Latency P95 by Command", + "description": "95th percentile response time for each RPC command. 
Computed from the spanmetrics duration histogram using histogram_quantile(0.95) over rpc.command.* spans, grouped by xrpl.rpc.command. High values indicate slow commands that may need optimization.", "type": "timeseries", "gridPos": { "h": 8, @@ -49,13 +57,19 @@ "x": 12, "y": 0 }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, "targets": [ { "datasource": { "type": "prometheus" }, - "expr": "histogram_quantile(0.95, sum by (le, xrpl_rpc_command) (rate(traces_span_metrics_duration_milliseconds_bucket{xrpl_rpc_command=~\"$command\", exported_instance=~\"$node\", span_name=~\"rpc.command.*\"}[5m])))", - "legendFormat": "P95 {{xrpl_rpc_command}}" + "expr": "histogram_quantile(0.95, sum by (le, xrpl_rpc_command, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", xrpl_rpc_command=~\"$command\", span_name=~\"rpc.command.*\"}[5m])))", + "legendFormat": "P95 {{xrpl_rpc_command}} [{{exported_instance}}]" } ], "fieldConfig": { @@ -74,6 +88,7 @@ }, { "title": "RPC Error Rate", + "description": "Percentage of RPC commands that completed with an error status, per command. Calculated as (error calls / total calls) * 100, where errors have status_code=STATUS_CODE_ERROR. 
Thresholds: green < 1%, yellow 1-5%, red > 5%.", "type": "bargauge", "gridPos": { "h": 8, @@ -81,13 +96,19 @@ "x": 0, "y": 8 }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, "targets": [ { "datasource": { "type": "prometheus" }, - "expr": "sum by (xrpl_rpc_command) (rate(traces_span_metrics_calls_total{xrpl_rpc_command=~\"$command\", exported_instance=~\"$node\", span_name=~\"rpc.command.*\", status_code=\"STATUS_CODE_ERROR\"}[5m])) / sum by (xrpl_rpc_command) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_rpc_command=~\"$command\", span_name=~\"rpc.command.*\"}[5m])) * 100", - "legendFormat": "{{xrpl_rpc_command}}" + "expr": "sum by (xrpl_rpc_command, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_rpc_command=~\"$command\", span_name=~\"rpc.command.*\", status_code=\"STATUS_CODE_ERROR\"}[5m])) / sum by (xrpl_rpc_command, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_rpc_command=~\"$command\", span_name=~\"rpc.command.*\"}[5m])) * 100", + "legendFormat": "{{xrpl_rpc_command}} [{{exported_instance}}]" } ], "fieldConfig": { @@ -115,6 +136,7 @@ }, { "title": "RPC Latency Heatmap", + "description": "Distribution of RPC command response times across histogram buckets. Shows the density of requests at each latency level over time. Each cell represents the count of requests that fell into that duration bucket in a 5m window. 
Useful for spotting bimodal latency patterns.", "type": "heatmap", "gridPos": { "h": 8, @@ -122,16 +144,181 @@ "x": 12, "y": 8 }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "yAxis": { + "axisLabel": "Duration (ms)" + } + }, "targets": [ { "datasource": { "type": "prometheus" }, - "expr": "sum(increase(traces_span_metrics_duration_milliseconds_bucket{xrpl_rpc_command=~\"$command\", exported_instance=~\"$node\", span_name=~\"rpc.command.*\"}[5m])) by (le)", + "expr": "sum(increase(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", xrpl_rpc_command=~\"$command\", span_name=~\"rpc.command.*\"}[5m])) by (le)", "legendFormat": "{{le}}", "format": "heatmap" } ] + }, + { + "title": "Overall RPC Throughput", + "description": "Aggregate RPC throughput showing two layers of the request pipeline. rpc.request is the outer HTTP handler (ServerHandler.cpp:271) that accepts incoming connections. rpc.process is the inner processing layer (ServerHandler.cpp:573) that parses and dispatches. 
A gap between the two indicates requests being queued or rejected before processing.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_rpc_command=~\"$command\", span_name=\"rpc.request\"}[5m]))", + "legendFormat": "rpc.request / Sec [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_rpc_command=~\"$command\", span_name=\"rpc.process\"}[5m]))", + "legendFormat": "rpc.process / Sec [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "reqps", + "custom": { + "axisLabel": "Requests / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "RPC Success vs Error", + "description": "Aggregate rate of successful vs failed RPC commands across all command types. Success = status_code UNSET (OpenTelemetry default for OK spans). Error = status_code STATUS_CODE_ERROR. 
A sustained error rate warrants investigation via per-command breakdown above.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_rpc_command=~\"$command\", span_name=~\"rpc.command.*\", status_code=\"STATUS_CODE_UNSET\"}[5m]))", + "legendFormat": "Success [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_rpc_command=~\"$command\", span_name=~\"rpc.command.*\", status_code=\"STATUS_CODE_ERROR\"}[5m]))", + "legendFormat": "Error [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Commands / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Top Commands by Volume", + "description": "Top 10 most frequently called RPC commands by total invocation count over the last 5 minutes. Uses topk(10, increase(calls_total)) to rank commands. 
Helps identify the hottest API endpoints driving load on the node.", + "type": "bargauge", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "topk(10, sum by (xrpl_rpc_command, exported_instance) (increase(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_rpc_command=~\"$command\", span_name=~\"rpc.command.*\"}[5m])))", + "legendFormat": "{{xrpl_rpc_command}} [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + } + }, + { + "title": "WebSocket Message Rate", + "description": "Rate of incoming WebSocket RPC messages processed by the server. Sourced from the rpc.ws_message span (ServerHandler.cpp:384). Only active when clients connect via WebSocket instead of HTTP. Zero is normal if only HTTP RPC is in use.", + "type": "stat", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_rpc_command=~\"$command\", span_name=\"rpc.ws_message\"}[5m]))", + "legendFormat": "WS Messages / Sec [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + } } ], "schemaVersion": 39, @@ -141,7 +328,7 @@ { "name": "node", "label": "Node", - "description": "Filter by rippled node (service.instance.id \u2014 e.g. Node-1)", + "description": "Filter by rippled node (service.instance.id — e.g. 
Node-1)", "type": "query", "query": "label_values(traces_span_metrics_calls_total, exported_instance)", "datasource": { @@ -184,6 +371,6 @@ "from": "now-1h", "to": "now" }, - "title": "rippled RPC Performance", + "title": "RPC Performance", "uid": "rippled-rpc-perf" } diff --git a/docker/telemetry/grafana/dashboards/statsd-ledger-data-sync.json b/docker/telemetry/grafana/dashboards/statsd-ledger-data-sync.json new file mode 100644 index 0000000000..502d78e7aa --- /dev/null +++ b/docker/telemetry/grafana/dashboards/statsd-ledger-data-sync.json @@ -0,0 +1,506 @@ +{ + "annotations": { + "list": [] + }, + "description": "Ledger data exchange and object fetch traffic from beast::insight StatsD. Covers ledger sync, node data retrieval, and transaction set exchange. Requires [insight] server=statsd in rippled config.", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "title": "Ledger Data Exchange (Bytes In)", + "description": "Inbound bytes for ledger data sub-categories. 'ledger_data' = aggregated ledger data, sub-types include Transaction_Set_candidate (proposed tx sets), Transaction_Node (tx tree nodes), and Account_State_Node (state tree nodes). High Account_State_Node traffic indicates state sync; high Transaction_Set_candidate indicates consensus catch-up. 
Sourced from TrafficCount.h ledger_data_* categories.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_ledger_data_get_Bytes_In", + "legendFormat": "Ledger Data Get" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_ledger_data_share_Bytes_In", + "legendFormat": "Ledger Data Share" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_ledger_data_Transaction_Set_candidate_get_Bytes_In", + "legendFormat": "TX Set Candidate Get" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_ledger_data_Transaction_Set_candidate_share_Bytes_In", + "legendFormat": "TX Set Candidate Share" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_ledger_data_Transaction_Node_get_Bytes_In", + "legendFormat": "TX Node Get" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_ledger_data_Transaction_Node_share_Bytes_In", + "legendFormat": "TX Node Share" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_ledger_data_Account_State_Node_get_Bytes_In", + "legendFormat": "Account State Node Get" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_ledger_data_Account_State_Node_share_Bytes_In", + "legendFormat": "Account State Node Share" + } + ], + "fieldConfig": { + "defaults": { + "unit": "decbytes", + "custom": { + "axisLabel": "Bytes In", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Ledger Share/Get Traffic (Bytes)", + "description": "Legacy ledger share and get traffic by sub-type. These are the older ledger fetch protocol categories (as opposed to ledger_data_* which is the newer protocol). 
Sub-types: Transaction_Set_candidate, Transaction_node, Account_State_node, plus aggregate ledger_share and ledger_get. Sourced from TrafficCount.h ledger_* categories.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_ledger_share_Bytes_In", + "legendFormat": "Ledger Share In" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_ledger_get_Bytes_In", + "legendFormat": "Ledger Get In" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_ledger_Transaction_Set_candidate_share_Bytes_In", + "legendFormat": "TX Set Candidate Share" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_ledger_Transaction_Set_candidate_get_Bytes_In", + "legendFormat": "TX Set Candidate Get" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_ledger_Transaction_node_share_Bytes_In", + "legendFormat": "TX Node Share" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_ledger_Transaction_node_get_Bytes_In", + "legendFormat": "TX Node Get" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_ledger_Account_State_node_share_Bytes_In", + "legendFormat": "Account State Share" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_ledger_Account_State_node_get_Bytes_In", + "legendFormat": "Account State Get" + } + ], + "fieldConfig": { + "defaults": { + "unit": "decbytes", + "custom": { + "axisLabel": "Bytes In", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "GetObject Traffic by Type (Bytes In)", + "description": "Object fetch traffic by object type. GetObject is the protocol for fetching specific SHAMap nodes. 
Types: Ledger (full ledger headers), Transaction (individual txs), Transaction_node (tx tree nodes), Account_State_node (state tree nodes), CAS (Content Addressable Storage objects), Fetch_Pack (batch fetch during catch-up), Transactions (bulk tx fetch). High Fetch_Pack traffic indicates a node is catching up. Sourced from TrafficCount.h getobject_* categories.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_getobject_Ledger_get_Bytes_In", + "legendFormat": "Ledger Get" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_getobject_Ledger_share_Bytes_In", + "legendFormat": "Ledger Share" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_getobject_Transaction_get_Bytes_In", + "legendFormat": "Transaction Get" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_getobject_Transaction_share_Bytes_In", + "legendFormat": "Transaction Share" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_getobject_Transaction_node_get_Bytes_In", + "legendFormat": "TX Node Get" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_getobject_Transaction_node_share_Bytes_In", + "legendFormat": "TX Node Share" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_getobject_Account_State_node_get_Bytes_In", + "legendFormat": "Account State Get" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_getobject_Account_State_node_share_Bytes_In", + "legendFormat": "Account State Share" + } + ], + "fieldConfig": { + "defaults": { + "unit": "decbytes", + "custom": { + "axisLabel": "Bytes In", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "GetObject Aggregate & 
Special Types (Bytes In)", + "description": "Aggregate getobject traffic plus special categories: CAS (Content Addressable Storage) for SHAMap node fetch, Fetch_Pack for bulk batch downloads during catch-up, Transactions for bulk tx fetch, and the aggregate getobject_get/getobject_share totals. Sourced from TrafficCount.h getobject_* categories.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_getobject_CAS_get_Bytes_In", + "legendFormat": "CAS Get" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_getobject_CAS_share_Bytes_In", + "legendFormat": "CAS Share" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_getobject_Fetch_Pack_share_Bytes_In", + "legendFormat": "Fetch Pack Share" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_getobject_Fetch_Pack_get_Bytes_In", + "legendFormat": "Fetch Pack Get" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_getobject_Transactions_get_Bytes_In", + "legendFormat": "Transactions Get" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_getobject_get_Bytes_In", + "legendFormat": "Aggregate Get" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_getobject_share_Bytes_In", + "legendFormat": "Aggregate Share" + } + ], + "fieldConfig": { + "defaults": { + "unit": "decbytes", + "custom": { + "axisLabel": "Bytes In", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "GetObject Messages by Type", + "description": "Message counts for object fetch operations. Shows how many individual fetch requests and responses are exchanged per type. 
High message counts with low byte counts indicate small object fetches; the inverse indicates large batch transfers. Sourced from TrafficCount.h getobject_* categories.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_getobject_Ledger_get_Messages_In", + "legendFormat": "Ledger Get" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_getobject_Transaction_get_Messages_In", + "legendFormat": "Transaction Get" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_getobject_Transaction_node_get_Messages_In", + "legendFormat": "TX Node Get" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_getobject_Account_State_node_get_Messages_In", + "legendFormat": "Account State Get" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_getobject_CAS_get_Messages_In", + "legendFormat": "CAS Get" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_getobject_Fetch_Pack_get_Messages_In", + "legendFormat": "Fetch Pack Get" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_getobject_Transactions_get_Messages_In", + "legendFormat": "Transactions Get" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "axisLabel": "Messages In", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Overlay Traffic Heatmap (All Categories, Bytes In)", + "description": "Bar gauge showing the top 20 overlay traffic categories ranked by inbound bytes. Provides an at-a-glance view of which protocol message types consume the most bandwidth across all 57+ traffic categories. 
Sourced from all TrafficCount.h categories via wildcard match.", + "type": "bargauge", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "topk(20, {__name__=~\"rippled_.*_Bytes_In\", __name__!~\"rippled_total_.*\"})", + "legendFormat": "{{__name__}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "decbytes", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1048576 + }, + { + "color": "red", + "value": 104857600 + } + ] + } + }, + "overrides": [] + } + } + ], + "schemaVersion": 39, + "tags": ["rippled", "statsd", "ledger", "sync", "telemetry"], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "title": "Ledger Data & Sync (StatsD)", + "uid": "rippled-statsd-ledger-sync" +} diff --git a/docker/telemetry/grafana/dashboards/statsd-network-traffic.json b/docker/telemetry/grafana/dashboards/statsd-network-traffic.json new file mode 100644 index 0000000000..8dc072ba23 --- /dev/null +++ b/docker/telemetry/grafana/dashboards/statsd-network-traffic.json @@ -0,0 +1,671 @@ +{ + "annotations": { + "list": [] + }, + "description": "Network traffic and peer metrics from beast::insight StatsD. Requires [insight] server=statsd in rippled config.", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "title": "Active Peers", + "description": "Number of active inbound and outbound peer connections. Sourced from Peer_Finder.Active_Inbound_Peers and Peer_Finder.Active_Outbound_Peers gauges (PeerfinderManager.cpp:214-215). 
A healthy mainnet node typically has 10-21 outbound and 0-85 inbound peers depending on configuration.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_Peer_Finder_Active_Inbound_Peers", + "legendFormat": "Inbound Peers" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_Peer_Finder_Active_Outbound_Peers", + "legendFormat": "Outbound Peers" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "axisLabel": "Peers", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Peer Disconnects", + "description": "Cumulative count of peer disconnections. Sourced from the Overlay.Peer_Disconnects gauge (OverlayImpl.h:557). A rising trend indicates network instability, aggressive peer management, or resource exhaustion causing connection drops.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_Overlay_Peer_Disconnects", + "legendFormat": "Disconnects" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "axisLabel": "Disconnects", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Total Network Bytes", + "description": "Total bytes sent and received across all peer connections. Sourced from the total.Bytes_In and total.Bytes_Out traffic category gauges (OverlayImpl.h:535-548). 
Provides a high-level view of network bandwidth consumption.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_total_Bytes_In", + "legendFormat": "Bytes In" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_total_Bytes_Out", + "legendFormat": "Bytes Out" + } + ], + "fieldConfig": { + "defaults": { + "unit": "decbytes", + "custom": { + "axisLabel": "Bytes", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Total Network Messages", + "description": "Total messages sent and received across all peer connections. Sourced from the total.Messages_In and total.Messages_Out traffic category gauges (OverlayImpl.h:535-548). Shows the overall message throughput of the overlay network.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_total_Messages_In", + "legendFormat": "Messages In" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_total_Messages_Out", + "legendFormat": "Messages Out" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "axisLabel": "Messages", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Transaction Traffic", + "description": "Bytes and messages for transaction-related overlay traffic. Includes the transactions traffic category (OverlayImpl/TrafficCount.h). 
Spikes indicate high transaction volume on the network or transaction flooding.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_transactions_Messages_In", + "legendFormat": "TX Messages In" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_transactions_Messages_Out", + "legendFormat": "TX Messages Out" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_transactions_duplicate_Messages_In", + "legendFormat": "TX Duplicate In" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "axisLabel": "Messages", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Proposal Traffic", + "description": "Messages for consensus proposal overlay traffic. Includes proposals, proposals_untrusted, and proposals_duplicate categories (TrafficCount.h). 
High untrusted or duplicate counts may indicate UNL misconfiguration or network spam.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_proposals_Messages_In", + "legendFormat": "Proposals In" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_proposals_Messages_Out", + "legendFormat": "Proposals Out" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_proposals_untrusted_Messages_In", + "legendFormat": "Untrusted In" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_proposals_duplicate_Messages_In", + "legendFormat": "Duplicate In" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "axisLabel": "Messages", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Validation Traffic", + "description": "Messages for validation overlay traffic. Includes validations, validations_untrusted, and validations_duplicate categories (TrafficCount.h). 
Monitoring trusted vs untrusted validation traffic helps detect UNL health issues.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_validations_Messages_In", + "legendFormat": "Validations In" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_validations_Messages_Out", + "legendFormat": "Validations Out" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_validations_untrusted_Messages_In", + "legendFormat": "Untrusted In" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_validations_duplicate_Messages_In", + "legendFormat": "Duplicate In" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "axisLabel": "Messages", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Overlay Traffic by Category (Bytes In)", + "description": "Top 10 traffic categories by inbound bytes, selected from the 57 overlay traffic categories in TrafficCount.h. Shows which protocol message types consume the most bandwidth. 
Categories include transactions, proposals, validations, ledger data, getobject, and overlay overhead.", + "type": "bargauge", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "topk(10, {__name__=~\"rippled_.*_Bytes_In\", __name__!~\"rippled_total_.*\"})", + "legendFormat": "{{__name__}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "decbytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "rippled_transactions_Bytes_In" + }, + "properties": [ + { + "id": "displayName", + "value": "Transactions" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "rippled_proposals_Bytes_In" + }, + "properties": [ + { + "id": "displayName", + "value": "Proposals" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "rippled_validations_Bytes_In" + }, + "properties": [ + { + "id": "displayName", + "value": "Validations" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "rippled_overhead_Bytes_In" + }, + "properties": [ + { + "id": "displayName", + "value": "Overhead" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "rippled_overhead_overlay_Bytes_In" + }, + "properties": [ + { + "id": "displayName", + "value": "Overhead Overlay" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "rippled_ping_Bytes_In" + }, + "properties": [ + { + "id": "displayName", + "value": "Ping" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "rippled_status_Bytes_In" + }, + "properties": [ + { + "id": "displayName", + "value": "Status" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "rippled_getObject_Bytes_In" + }, + "properties": [ + { + "id": "displayName", + "value": "Get Object" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "rippled_haveTxSet_Bytes_In" + }, + "properties": [ + { + "id": "displayName", + 
"value": "Have Tx Set" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "rippled_ledgerData_Bytes_In" + }, + "properties": [ + { + "id": "displayName", + "value": "Ledger Data" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "rippled_ledger_share_Bytes_In" + }, + "properties": [ + { + "id": "displayName", + "value": "Ledger Share" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "rippled_ledger_data_get_Bytes_In" + }, + "properties": [ + { + "id": "displayName", + "value": "Ledger Data Get" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "rippled_ledger_data_share_Bytes_In" + }, + "properties": [ + { + "id": "displayName", + "value": "Ledger Data Share" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "rippled_ledger_data_Account_State_Node_get_Bytes_In" + }, + "properties": [ + { + "id": "displayName", + "value": "Account State Node Get" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "rippled_ledger_data_Account_State_Node_share_Bytes_In" + }, + "properties": [ + { + "id": "displayName", + "value": "Account State Node Share" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "rippled_ledger_data_Transaction_Node_get_Bytes_In" + }, + "properties": [ + { + "id": "displayName", + "value": "Transaction Node Get" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "rippled_ledger_data_Transaction_Node_share_Bytes_In" + }, + "properties": [ + { + "id": "displayName", + "value": "Transaction Node Share" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "rippled_ledger_data_Transaction_Set_candidate_get_Bytes_In" + }, + "properties": [ + { + "id": "displayName", + "value": "Tx Set Candidate Get" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "rippled_ledger_Account_State_node_share_Bytes_In" + }, + "properties": [ + { + "id": "displayName", + "value": "Account State Node Share (Legacy)" + } + ] + }, + { + "matcher": { + "id": 
"byName", + "options": "rippled_ledger_Transaction_Set_candidate_share_Bytes_In" + }, + "properties": [ + { + "id": "displayName", + "value": "Tx Set Candidate Share" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "rippled_ledger_Transaction_node_share_Bytes_In" + }, + "properties": [ + { + "id": "displayName", + "value": "Transaction Node Share (Legacy)" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "rippled_set_get_Bytes_In" + }, + "properties": [ + { + "id": "displayName", + "value": "Set Get" + } + ] + } + ] + } + } + ], + "schemaVersion": 39, + "tags": ["rippled", "statsd", "network", "telemetry"], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "title": "Network Traffic (StatsD)", + "uid": "rippled-statsd-network" +} diff --git a/docker/telemetry/grafana/dashboards/statsd-node-health.json b/docker/telemetry/grafana/dashboards/statsd-node-health.json new file mode 100644 index 0000000000..215187f382 --- /dev/null +++ b/docker/telemetry/grafana/dashboards/statsd-node-health.json @@ -0,0 +1,415 @@ +{ + "annotations": { + "list": [] + }, + "description": "Node health metrics from beast::insight StatsD. Requires [insight] server=statsd in rippled config.", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "title": "Validated Ledger Age", + "description": "Age of the most recently validated ledger in seconds. Sourced from the LedgerMaster.Validated_Ledger_Age gauge (LedgerMaster.h:373) which is updated every collection interval via the insight hook. 
Values above 20s indicate the node is falling behind the network.", + "type": "stat", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_LedgerMaster_Validated_Ledger_Age", + "legendFormat": "Validated Age" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 10 + }, + { + "color": "red", + "value": 20 + } + ] + } + }, + "overrides": [] + } + }, + { + "title": "Published Ledger Age", + "description": "Age of the most recently published ledger in seconds. Sourced from the LedgerMaster.Published_Ledger_Age gauge (LedgerMaster.h:374). Published ledger age should track close to validated ledger age. A growing gap indicates publish pipeline backlog.", + "type": "stat", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_LedgerMaster_Published_Ledger_Age", + "legendFormat": "Published Age" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 10 + }, + { + "color": "red", + "value": 20 + } + ] + } + }, + "overrides": [] + } + }, + { + "title": "Operating Mode Duration", + "description": "Cumulative time spent in each operating mode (Disconnected, Connected, Syncing, Tracking, Full). Sourced from State_Accounting.*_duration gauges (NetworkOPs.cpp:774-778). 
A healthy node should spend the vast majority of time in Full mode.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_State_Accounting_Full_duration", + "legendFormat": "Full" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_State_Accounting_Tracking_duration", + "legendFormat": "Tracking" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_State_Accounting_Syncing_duration", + "legendFormat": "Syncing" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_State_Accounting_Connected_duration", + "legendFormat": "Connected" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_State_Accounting_Disconnected_duration", + "legendFormat": "Disconnected" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "custom": { + "axisLabel": "Duration (Sec)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Operating Mode Transitions", + "description": "Count of transitions into each operating mode. Sourced from State_Accounting.*_transitions gauges (NetworkOPs.cpp:780-786). Frequent transitions out of Full mode indicate instability. 
Transitions to Disconnected or Syncing warrant investigation.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_State_Accounting_Full_transitions", + "legendFormat": "Full" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_State_Accounting_Tracking_transitions", + "legendFormat": "Tracking" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_State_Accounting_Syncing_transitions", + "legendFormat": "Syncing" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_State_Accounting_Connected_transitions", + "legendFormat": "Connected" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_State_Accounting_Disconnected_transitions", + "legendFormat": "Disconnected" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "axisLabel": "Transitions", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "I/O Latency", + "description": "P95 and P50 of the I/O service loop latency in milliseconds. Sourced from the ios_latency event (Application.cpp:438) which measures how long it takes for the io_context to process a timer callback. Values above 10ms are logged; above 500ms trigger warnings. 
High values indicate thread pool saturation or blocking operations.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_ios_latency{quantile=\"0.95\"}", + "legendFormat": "P95 I/O Latency" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_ios_latency{quantile=\"0.5\"}", + "legendFormat": "P50 I/O Latency" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Latency (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Job Queue Depth", + "description": "Current number of jobs waiting in the job queue. Sourced from the job_count gauge (JobQueue.cpp:26). A sustained high value indicates the node cannot process work fast enough \u2014 common during ledger replay or heavy RPC load.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_job_count", + "legendFormat": "Job Queue Depth" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "axisLabel": "Jobs", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Ledger Fetch Rate", + "description": "Rate of ledger fetch requests initiated by the node. Sourced from the ledger_fetches counter (InboundLedgers.cpp:44) which increments each time the node requests a ledger from a peer. 
High rates indicate the node is catching up or missing ledgers.", + "type": "stat", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rate(rippled_ledger_fetches_total[5m])", + "legendFormat": "Fetches / Sec" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + } + }, + { + "title": "Ledger History Mismatches", + "description": "Rate of ledger history hash mismatches. Sourced from the ledger.history.mismatch counter (LedgerHistory.cpp:16) which increments when a built ledger hash does not match the expected validated hash. Non-zero values indicate consensus divergence or database corruption.", + "type": "stat", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rate(rippled_ledger_history_mismatch_total[5m])", + "legendFormat": "Mismatches / Sec" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0.01 + } + ] + } + }, + "overrides": [] + } + } + ], + "schemaVersion": 39, + "tags": ["rippled", "statsd", "node-health", "telemetry"], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "title": "Node Health (StatsD)", + "uid": "rippled-statsd-node-health" +} diff --git a/docker/telemetry/grafana/dashboards/statsd-overlay-traffic-detail.json b/docker/telemetry/grafana/dashboards/statsd-overlay-traffic-detail.json new file mode 100644 index 0000000000..a09a2b5d17 --- /dev/null +++ b/docker/telemetry/grafana/dashboards/statsd-overlay-traffic-detail.json @@ -0,0 +1,566 @@ +{ + "annotations": { + "list": [] + }, + "description": "Detailed overlay traffic breakdown for 
categories not covered by the main Network Traffic dashboard. Includes squelch, overhead, validator lists, object fetch, ledger sync, and protocol negotiation traffic. Requires [insight] server=statsd in rippled config.", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "title": "Squelch Traffic (Messages)", + "description": "Squelch-related overlay messages. Squelch is the peer traffic management protocol that suppresses redundant message forwarding. 'squelch' = squelch control messages, 'squelch_suppressed' = messages suppressed by squelch, 'squelch_ignored' = squelch directives that were ignored. High suppressed counts indicate effective bandwidth savings; high ignored counts may indicate misconfigured peers. Sourced from TrafficCount.h squelch categories.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_squelch_Messages_In", + "legendFormat": "Squelch In" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_squelch_Messages_Out", + "legendFormat": "Squelch Out" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_squelch_suppressed_Messages_In", + "legendFormat": "Suppressed In" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_squelch_suppressed_Messages_Out", + "legendFormat": "Suppressed Out" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_squelch_ignored_Messages_In", + "legendFormat": "Ignored In" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_squelch_ignored_Messages_Out", + "legendFormat": "Ignored Out" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "axisLabel": "Messages", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + 
"pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Overhead Traffic Breakdown (Bytes)", + "description": "Overlay protocol overhead by sub-category. 'overhead' = base protocol overhead (ping, status, etc.), 'overhead_cluster' = intra-cluster communication overhead, 'overhead_manifest' = validator manifest distribution overhead. High cluster overhead may indicate frequent cluster state syncs; high manifest overhead occurs during UNL changes. Sourced from TrafficCount.h overhead categories.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_overhead_Bytes_In", + "legendFormat": "Base Overhead In" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_overhead_Bytes_Out", + "legendFormat": "Base Overhead Out" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_overhead_cluster_Bytes_In", + "legendFormat": "Cluster In" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_overhead_cluster_Bytes_Out", + "legendFormat": "Cluster Out" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_overhead_manifest_Bytes_In", + "legendFormat": "Manifest In" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_overhead_manifest_Bytes_Out", + "legendFormat": "Manifest Out" + } + ], + "fieldConfig": { + "defaults": { + "unit": "decbytes", + "custom": { + "axisLabel": "Bytes", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Validator List Traffic", + "description": "Validator list (UNL) distribution traffic. Validator lists are exchanged when peers share their trusted validator configurations. Spikes occur during UNL updates or when new peers connect. 
Sourced from TrafficCount.h validator_lists category.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_validator_lists_Bytes_In", + "legendFormat": "Bytes In" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_validator_lists_Bytes_Out", + "legendFormat": "Bytes Out" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_validator_lists_Messages_In", + "legendFormat": "Messages In" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_validator_lists_Messages_Out", + "legendFormat": "Messages Out" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "axisLabel": "Count", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/Bytes/" + }, + "properties": [ + { + "id": "custom.axisPlacement", + "value": "right" + }, + { + "id": "unit", + "value": "decbytes" + } + ] + } + ] + } + }, + { + "title": "Set Get/Share Traffic (Bytes)", + "description": "Transaction set get and share traffic. 'set_get' = requests to fetch transaction sets (sent during ledger close), 'set_share' = responses sharing transaction sets. High set_get traffic indicates peers frequently requesting missing transaction sets, which may signal sync delays. 
Sourced from TrafficCount.h set_get/set_share categories.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_set_get_Bytes_In", + "legendFormat": "Set Get In" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_set_get_Bytes_Out", + "legendFormat": "Set Get Out" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_set_share_Bytes_In", + "legendFormat": "Set Share In" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_set_share_Bytes_Out", + "legendFormat": "Set Share Out" + } + ], + "fieldConfig": { + "defaults": { + "unit": "decbytes", + "custom": { + "axisLabel": "Bytes", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Have/Requested Transactions (Messages)", + "description": "Transaction availability protocol messages. 'have_transactions' = advertisements that a peer has specific transactions available, 'requested_transactions' = explicit requests for transaction data. A high ratio of requested to have may indicate peers are behind on transaction propagation. 
Sourced from TrafficCount.h have_transactions/requested_transactions categories.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_have_transactions_Messages_In", + "legendFormat": "Have TX In" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_have_transactions_Messages_Out", + "legendFormat": "Have TX Out" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_requested_transactions_Messages_In", + "legendFormat": "Requested TX In" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_requested_transactions_Messages_Out", + "legendFormat": "Requested TX Out" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "axisLabel": "Messages", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Unknown / Unclassified Traffic", + "description": "Traffic that does not match any known overlay message category. Non-zero values may indicate protocol version mismatches, corrupted messages, or new message types not yet classified. 
Sourced from TrafficCount.h unknown category.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_unknown_Bytes_In", + "legendFormat": "Unknown Bytes In" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_unknown_Bytes_Out", + "legendFormat": "Unknown Bytes Out" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_unknown_Messages_In", + "legendFormat": "Unknown Messages In" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_unknown_Messages_Out", + "legendFormat": "Unknown Messages Out" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "axisLabel": "Count", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/Bytes/" + }, + "properties": [ + { + "id": "custom.axisPlacement", + "value": "right" + }, + { + "id": "unit", + "value": "decbytes" + } + ] + } + ] + } + }, + { + "title": "Proof Path Traffic", + "description": "Proof path request/response traffic for ledger state proof exchange. Used by peers to verify specific ledger entries without downloading the full ledger. High request volume may indicate peers validating state during catch-up. 
Sourced from TrafficCount.h proof_path_request/proof_path_response categories.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_proof_path_request_Bytes_In", + "legendFormat": "Request Bytes In" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_proof_path_request_Bytes_Out", + "legendFormat": "Request Bytes Out" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_proof_path_response_Bytes_In", + "legendFormat": "Response Bytes In" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_proof_path_response_Bytes_Out", + "legendFormat": "Response Bytes Out" + } + ], + "fieldConfig": { + "defaults": { + "unit": "decbytes", + "custom": { + "axisLabel": "Bytes", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Replay Delta Traffic", + "description": "Replay delta request/response traffic for ledger replay protocol. Used during catch-up to efficiently replay ledger state changes. 
Sourced from TrafficCount.h replay_delta_request/replay_delta_response categories.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_replay_delta_request_Bytes_In", + "legendFormat": "Request Bytes In" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_replay_delta_request_Bytes_Out", + "legendFormat": "Request Bytes Out" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_replay_delta_response_Bytes_In", + "legendFormat": "Response Bytes In" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_replay_delta_response_Bytes_Out", + "legendFormat": "Response Bytes Out" + } + ], + "fieldConfig": { + "defaults": { + "unit": "decbytes", + "custom": { + "axisLabel": "Bytes", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + } + ], + "schemaVersion": 39, + "tags": ["rippled", "statsd", "overlay", "network", "telemetry"], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "title": "Overlay Traffic Detail (StatsD)", + "uid": "rippled-statsd-overlay-detail" +} diff --git a/docker/telemetry/grafana/dashboards/statsd-rpc-pathfinding.json b/docker/telemetry/grafana/dashboards/statsd-rpc-pathfinding.json new file mode 100644 index 0000000000..10bf1575e3 --- /dev/null +++ b/docker/telemetry/grafana/dashboards/statsd-rpc-pathfinding.json @@ -0,0 +1,396 @@ +{ + "annotations": { + "list": [] + }, + "description": "RPC and pathfinding metrics from beast::insight StatsD. 
Requires [insight] server=statsd in rippled config.", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "title": "RPC Request Rate (StatsD)", + "description": "Rate of RPC requests as counted by the beast::insight counter. Sourced from rpc.requests (ServerHandler.cpp:108) which increments on every HTTP and WebSocket RPC request. Compare with the span-based rpc.request rate in the RPC Performance dashboard for cross-validation.", + "type": "stat", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rate(rippled_rpc_requests_total[5m])", + "legendFormat": "Requests / Sec" + } + ], + "fieldConfig": { + "defaults": { + "unit": "reqps" + }, + "overrides": [] + } + }, + { + "title": "RPC Response Time (StatsD)", + "description": "P95 and P50 of RPC response time from the beast::insight timer. Sourced from the rpc.time event (ServerHandler.cpp:110) which records elapsed milliseconds for each RPC response. This measures the full HTTP handler time, not just command execution. 
Compare with span-based rpc.request duration.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_rpc_time{quantile=\"0.95\"}", + "legendFormat": "P95 Response Time" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_rpc_time{quantile=\"0.5\"}", + "legendFormat": "P50 Response Time" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Latency (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "RPC Response Size", + "description": "P95 and P50 of RPC response payload size in bytes. Sourced from the rpc.size event (ServerHandler.cpp:109) which records the byte length of each RPC JSON response. Large responses may indicate expensive queries (e.g. account_tx with many results) or API misuse.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_rpc_size{quantile=\"0.95\"}", + "legendFormat": "P95 Response Size" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_rpc_size{quantile=\"0.5\"}", + "legendFormat": "P50 Response Size" + } + ], + "fieldConfig": { + "defaults": { + "unit": "decbytes", + "custom": { + "axisLabel": "Size (Bytes)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "RPC Response Time Distribution", + "description": "Distribution of RPC response times from the beast::insight timer showing P50, P90, P95, and P99 quantiles. Sourced from the rpc.time event (ServerHandler.cpp:110). 
Useful for detecting bimodal latency or long-tail requests.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_rpc_time{quantile=\"0.5\"}", + "legendFormat": "P50" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_rpc_time{quantile=\"0.9\"}", + "legendFormat": "P90" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_rpc_time{quantile=\"0.95\"}", + "legendFormat": "P95" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_rpc_time{quantile=\"0.99\"}", + "legendFormat": "P99" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Latency (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Pathfinding Fast Duration", + "description": "P95 and P50 of fast pathfinding execution time. Sourced from the pathfind_fast event (PathRequests.h:23) which records the duration of the fast pathfinding algorithm. 
Fast pathfinding uses a simplified search that trades accuracy for speed.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_pathfind_fast{quantile=\"0.95\"}", + "legendFormat": "P95 Fast Pathfind" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_pathfind_fast{quantile=\"0.5\"}", + "legendFormat": "P50 Fast Pathfind" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Pathfinding Full Duration", + "description": "P95 and P50 of full pathfinding execution time. Sourced from the pathfind_full event (PathRequests.h:24) which records the duration of the exhaustive pathfinding search. Full pathfinding is more expensive and can take significantly longer than fast mode.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_pathfind_full{quantile=\"0.95\"}", + "legendFormat": "P95 Full Pathfind" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_pathfind_full{quantile=\"0.5\"}", + "legendFormat": "P50 Full Pathfind" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Resource Warnings Rate", + "description": "Rate of resource warning events from the Resource Manager. 
Sourced from the warn meter (Logic.h:33) which increments when a consumer (peer or RPC client) exceeds the warning threshold for resource usage. A rising rate indicates aggressive clients that may need throttling. NOTE: This panel will show no data until the |m -> |c fix is applied in StatsDCollector.cpp:706 (Phase 6 Task 6.1).", + "type": "stat", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rate(rippled_warn_total[5m])", + "legendFormat": "Warnings / Sec" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.1 + }, + { + "color": "red", + "value": 1 + } + ] + } + }, + "overrides": [] + } + }, + { + "title": "Resource Drops Rate", + "description": "Rate of resource drop events from the Resource Manager. Sourced from the drop meter (Logic.h:34) which increments when a consumer is disconnected or blocked due to excessive resource usage. Non-zero values mean the node is actively rejecting abusive connections. 
NOTE: This panel will show no data until the |m -> |c fix is applied in StatsDCollector.cpp:706 (Phase 6 Task 6.1).", + "type": "stat", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rate(rippled_drop_total[5m])", + "legendFormat": "Drops / Sec" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.01 + }, + { + "color": "red", + "value": 0.1 + } + ] + } + }, + "overrides": [] + } + } + ], + "schemaVersion": 39, + "tags": ["rippled", "statsd", "rpc", "pathfinding", "telemetry"], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "title": "RPC & Pathfinding (StatsD)", + "uid": "rippled-statsd-rpc" +} diff --git a/docker/telemetry/grafana/dashboards/transaction-overview.json b/docker/telemetry/grafana/dashboards/transaction-overview.json index b5f008972f..d233110ce0 100644 --- a/docker/telemetry/grafana/dashboards/transaction-overview.json +++ b/docker/telemetry/grafana/dashboards/transaction-overview.json @@ -10,6 +10,7 @@ "panels": [ { "title": "Transaction Processing Rate", + "description": "Rate of transactions entering the processing pipeline. tx.process (NetworkOPs.cpp:1227) fires when a transaction is submitted locally or received from a peer and enters processTransaction(). 
tx.receive (PeerImp.cpp:1273) fires when a raw transaction message arrives from a peer before deduplication.", "type": "timeseries", "gridPos": { "h": 8, @@ -17,31 +18,45 @@ "x": 0, "y": 0 }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, "targets": [ { "datasource": { "type": "prometheus" }, - "expr": "sum(rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"tx.process\"}[5m]))", - "legendFormat": "tx.process/sec" + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"tx.process\"}[5m]))", + "legendFormat": "tx.process / Sec [{{exported_instance}}]" }, { "datasource": { "type": "prometheus" }, - "expr": "sum(rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"tx.receive\"}[5m]))", - "legendFormat": "tx.receive/sec" + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"tx.receive\"}[5m]))", + "legendFormat": "tx.receive / Sec [{{exported_instance}}]" } ], "fieldConfig": { "defaults": { - "unit": "ops" + "unit": "ops", + "custom": { + "axisLabel": "Transactions / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } }, "overrides": [] } }, { "title": "Transaction Processing Latency", + "description": "p95 and p50 latency of transaction processing (tx.process span). Measures the time from when a transaction enters processTransaction() to completion. 
Computed via histogram_quantile() over the spanmetrics duration histogram with a 5m rate window.", "type": "timeseries", "gridPos": { "h": 8, @@ -49,31 +64,45 @@ "x": 12, "y": 0 }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, "targets": [ { "datasource": { "type": "prometheus" }, - "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"tx.process\"}[5m])))", - "legendFormat": "p95" + "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"tx.process\"}[5m])))", + "legendFormat": "P95 [{{exported_instance}}]" }, { "datasource": { "type": "prometheus" }, - "expr": "histogram_quantile(0.50, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"tx.process\"}[5m])))", - "legendFormat": "p50" + "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"tx.process\"}[5m])))", + "legendFormat": "P50 [{{exported_instance}}]" } ], "fieldConfig": { "defaults": { - "unit": "ms" + "unit": "ms", + "custom": { + "axisLabel": "Latency (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } }, "overrides": [] } }, { "title": "Transaction Path Distribution", + "description": "Breakdown of transactions by origin path. The xrpl.tx.local attribute indicates whether the transaction was submitted locally (true) or received from a peer (false). 
Helps understand the ratio of locally-originated vs relayed transactions.", "type": "piechart", "gridPos": { "h": 8, @@ -81,18 +110,25 @@ "x": 0, "y": 8 }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, "targets": [ { "datasource": { "type": "prometheus" }, - "expr": "sum by (xrpl_tx_local) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_tx_local=~\"$tx_origin\", span_name=\"tx.process\"}[5m]))", - "legendFormat": "local={{xrpl_tx_local}}" + "expr": "sum by (xrpl_tx_local, exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_tx_local=~\"$tx_origin\", span_name=\"tx.process\"}[5m]))", + "legendFormat": "Local = {{xrpl_tx_local}} [{{exported_instance}}]" } ] }, { "title": "Transaction Receive vs Suppressed", + "description": "Total rate of raw transaction messages received from peers (tx.receive span from PeerImp.cpp:1273). This fires before deduplication via the HashRouter, so the difference between tx.receive and tx.process reflects suppressed duplicate transactions.", "type": "timeseries", "gridPos": { "h": 8, @@ -100,18 +136,194 @@ "x": 12, "y": 8 }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, "targets": [ { "datasource": { "type": "prometheus" }, - "expr": "sum(rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"tx.receive\"}[5m]))", - "legendFormat": "total received" + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"tx.receive\"}[5m]))", + "legendFormat": "Total Received [{{exported_instance}}]" } ], "fieldConfig": { "defaults": { - "unit": "ops" + "unit": "ops", + "custom": { + "axisLabel": "Transactions / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Transaction Processing Duration Heatmap", + "description": "Heatmap showing the distribution of 
tx.process span durations across histogram buckets over time. Each cell represents the count of transactions that completed within that latency bucket in a 5m window. Reveals whether processing times are consistent or exhibit multi-modal patterns.", + "type": "heatmap", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "yAxis": { + "axisLabel": "Duration (ms)" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum(increase(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"tx.process\"}[5m])) by (le)", + "legendFormat": "{{le}}", + "format": "heatmap" + } + ] + }, + { + "title": "Transaction Apply Duration per Ledger", + "description": "p95 and p50 latency of applying the consensus transaction set to a new ledger. The tx.apply span (BuildLedger.cpp:88) wraps the applyTransactions() function that iterates through the CanonicalTXSet and applies each transaction to the OpenView. 
Long durations indicate heavy transaction sets or expensive transaction processing.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"tx.apply\"}[5m])))", + "legendFormat": "P95 tx.apply [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"tx.apply\"}[5m])))", + "legendFormat": "P50 tx.apply [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Latency (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Peer Transaction Receive Rate", + "description": "Rate of transaction messages received from network peers. Sourced from the tx.receive span (PeerImp.cpp:1273) which fires in the onMessage(TMTransaction) handler. 
High rates may indicate network-wide transaction volume spikes or peer flooding.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"tx.receive\"}[5m]))", + "legendFormat": "tx.receive / Sec [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Transactions / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Transaction Apply Failed Rate", + "description": "Rate of tx.apply spans completing with error status, indicating transaction application failures during ledger building. The span records xrpl.ledger.tx_failed as an attribute. Thresholds: green < 0.1/sec, yellow 0.1-1/sec, red > 1/sec. Some failures are normal (e.g. conflicting offers) but sustained high rates may indicate issues.", + "type": "stat", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"tx.apply\", status_code=\"STATUS_CODE_ERROR\"}[5m]))", + "legendFormat": "Failed / Sec [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.1 + }, + { + "color": "red", + "value": 1 + } + ] + } }, "overrides": [] } @@ -124,7 +336,7 @@ { "name": "node", "label": "Node", - "description": "Filter by rippled node (service.instance.id \u2014 e.g. 
Node-1)", + "description": "Filter by rippled node (service.instance.id — e.g. Node-1)", "type": "query", "query": "label_values(traces_span_metrics_calls_total, exported_instance)", "datasource": { @@ -167,6 +379,6 @@ "from": "now-1h", "to": "now" }, - "title": "rippled Transaction Overview", + "title": "Transaction Overview", "uid": "rippled-transactions" } diff --git a/docker/telemetry/integration-test.sh b/docker/telemetry/integration-test.sh index 1a48aa324a..047b7920fc 100755 --- a/docker/telemetry/integration-test.sh +++ b/docker/telemetry/integration-test.sh @@ -310,9 +310,14 @@ max_queue_size=2048 trace_rpc=1 trace_transactions=1 trace_consensus=1 -trace_peer=0 +trace_peer=1 trace_ledger=1 +[insight] +server=statsd +address=127.0.0.1:8125 +prefix=rippled + [rpc_startup] { "command": "log_level", "severity": "warning" } @@ -485,6 +490,7 @@ log "" log "--- Phase 3: Transaction Spans ---" check_span "tx.process" check_span "tx.receive" +check_span "tx.apply" log "" log "--- Phase 4: Consensus Spans ---" @@ -493,6 +499,17 @@ check_span "consensus.ledger_close" check_span "consensus.accept" check_span "consensus.validation.send" +log "" +log "--- Phase 5: Ledger Spans ---" +check_span "ledger.build" +check_span "ledger.validate" +check_span "ledger.store" + +log "" +log "--- Phase 5: Peer Spans (trace_peer=1) ---" +check_span "peer.proposal.receive" +check_span "peer.validation.receive" + # --------------------------------------------------------------------------- # Step 10: Verify Prometheus spanmetrics # --------------------------------------------------------------------------- @@ -524,6 +541,44 @@ else fail "Grafana: not reachable at localhost:3000" fi +# --------------------------------------------------------------------------- +# Step 10b: Verify StatsD metrics in Prometheus +# --------------------------------------------------------------------------- +log "" +log "--- Phase 6: StatsD Metrics (beast::insight) ---" +log "Waiting 20s for StatsD 
aggregation + Prometheus scrape..." +sleep 20 + +check_statsd_metric() { # Pass when Prometheus returns at least one series for metric "$1"; reports via ok/fail. + local metric_name="$1" + local result + result=$(curl -sf "$PROM/api/v1/query?query=$metric_name" \ + | jq '.data.result | length' 2>/dev/null || echo 0) + if [ "${result:-0}" -gt 0 ]; then # result may be empty if curl fails (jq emits nothing on empty input); treat as 0 + ok "StatsD: $metric_name ($result series)" + else + fail "StatsD: $metric_name (0 series)" + fi +} + +# Node health gauges +check_statsd_metric "rippled_LedgerMaster_Validated_Ledger_Age" +check_statsd_metric "rippled_LedgerMaster_Published_Ledger_Age" +check_statsd_metric "rippled_job_count" + +# State accounting +check_statsd_metric "rippled_State_Accounting_Full_duration" + +# Peer finder +check_statsd_metric "rippled_Peer_Finder_Active_Inbound_Peers" +check_statsd_metric "rippled_Peer_Finder_Active_Outbound_Peers" + +# RPC counters (only if RPC was exercised — should be true from Steps 5-8) +check_statsd_metric "rippled_rpc_requests" + +# Overlay traffic +check_statsd_metric "rippled_total_Bytes_In" + # --------------------------------------------------------------------------- # Step 11: Summary # --------------------------------------------------------------------------- diff --git a/docker/telemetry/otel-collector-config.yaml b/docker/telemetry/otel-collector-config.yaml index d3b97ae00c..92636688d4 100644 --- a/docker/telemetry/otel-collector-config.yaml +++ b/docker/telemetry/otel-collector-config.yaml @@ -35,6 +35,8 @@ connectors: - name: xrpl.rpc.status - name: xrpl.consensus.mode - name: xrpl.tx.local + - name: xrpl.peer.proposal.trusted + - name: xrpl.peer.validation.trusted exporters: debug: diff --git a/docs/telemetry-runbook.md b/docs/telemetry-runbook.md index 532c3a4d5a..506431b59a 100644 --- a/docs/telemetry-runbook.md +++ b/docs/telemetry-runbook.md @@ -62,19 +62,20 @@ All spans instrumented in xrpld, grouped by subsystem: ### RPC Spans (Phase 2) -| Span Name | Source File | Attributes | Description | -| -------------------- | --------------------- |
------------------------------------------------------- | -------------------------------------------------- | -| `rpc.request` | ServerHandler.cpp:271 | — | Top-level HTTP RPC request | -| `rpc.process` | ServerHandler.cpp:573 | — | RPC processing (child of rpc.request) | -| `rpc.ws_message` | ServerHandler.cpp:384 | — | WebSocket RPC message | -| `rpc.command.<name>` | RPCHandler.cpp:161 | `xrpl.rpc.command`, `xrpl.rpc.version`, `xrpl.rpc.role` | Per-command span (e.g., `rpc.command.server_info`) | +| Span Name | Source File | Attributes | Description | +| -------------------- | --------------------- | ---------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------- | +| `rpc.request` | ServerHandler.cpp:271 | — | Top-level HTTP RPC request | +| `rpc.process` | ServerHandler.cpp:573 | — | RPC processing (child of rpc.request) | +| `rpc.ws_message` | ServerHandler.cpp:384 | — | WebSocket RPC message | +| `rpc.command.<name>` | RPCHandler.cpp:161 | `xrpl.rpc.command`, `xrpl.rpc.version`, `xrpl.rpc.role`, `xrpl.rpc.status`, `xrpl.rpc.duration_ms`, `xrpl.rpc.error_message` | Per-command span (e.g., `rpc.command.server_info`) | ### Transaction Spans (Phase 3) -| Span Name | Source File | Attributes | Description | -| ------------ | ------------------- | ----------------------------------------------- | ------------------------------------- | -| `tx.process` | NetworkOPs.cpp:1227 | `xrpl.tx.hash`, `xrpl.tx.local`, `xrpl.tx.path` | Transaction submission and processing | -| `tx.receive` | PeerImp.cpp:1273 | `xrpl.peer.id` | Transaction received from peer relay | +| Span Name | Source File | Attributes | Description | +| ------------ | ------------------- | ---------------------------------------------------------------------- | ------------------------------------- | +| `tx.process` | NetworkOPs.cpp:1227 | `xrpl.tx.hash`, `xrpl.tx.local`, `xrpl.tx.path` | Transaction
submission and processing | +| `tx.receive` | PeerImp.cpp:1273 | `xrpl.peer.id`, `xrpl.tx.hash`, `xrpl.tx.suppressed`, `xrpl.tx.status` | Transaction received from peer relay | +| `tx.apply` | BuildLedger.cpp:88 | `xrpl.ledger.seq`, `xrpl.ledger.tx_count`, `xrpl.ledger.tx_failed` | Transaction set applied per ledger | ### Consensus Spans (Phase 4) @@ -102,6 +103,21 @@ All spans instrumented in xrpld, grouped by subsystem: {name="consensus.accept.apply"} | xrpl.consensus.ledger.seq = 92345678 ``` +### Ledger Spans (Phase 5) + +| Span Name | Source File | Attributes | Description | +| ----------------- | -------------------- | ------------------------------------------------------------------ | ----------------------------- | +| `ledger.build` | BuildLedger.cpp:31 | `xrpl.ledger.seq`, `xrpl.ledger.tx_count`, `xrpl.ledger.tx_failed` | Ledger build during consensus | +| `ledger.validate` | LedgerMaster.cpp:915 | `xrpl.ledger.seq`, `xrpl.ledger.validations` | Ledger promoted to validated | +| `ledger.store` | LedgerMaster.cpp:409 | `xrpl.ledger.seq` | Ledger stored in history | + +### Peer Spans (Phase 5) + +| Span Name | Source File | Attributes | Description | +| ------------------------- | ---------------- | ---------------------------------------------- | ----------------------------- | +| `peer.proposal.receive` | PeerImp.cpp:1667 | `xrpl.peer.id`, `xrpl.peer.proposal.trusted` | Proposal received from peer | +| `peer.validation.receive` | PeerImp.cpp:2264 | `xrpl.peer.id`, `xrpl.peer.validation.trusted` | Validation received from peer | + ## Prometheus Metrics (Spanmetrics) The OTel Collector's spanmetrics connector automatically derives RED (Rate, Errors, Duration) metrics from every span. No custom metrics code is needed in xrpld. 
@@ -128,12 +144,14 @@ Every metric carries these standard labels: Additionally, span attributes configured as dimensions in the collector become metric labels (dots → underscores): -| Span Attribute | Metric Label | Applies To | -| --------------------- | --------------------- | ------------------------------ | -| `xrpl.rpc.command` | `xrpl_rpc_command` | `rpc.command.*` spans | -| `xrpl.rpc.status` | `xrpl_rpc_status` | `rpc.command.*` spans | -| `xrpl.consensus.mode` | `xrpl_consensus_mode` | `consensus.ledger_close` spans | -| `xrpl.tx.local` | `xrpl_tx_local` | `tx.process` spans | +| Span Attribute | Metric Label | Applies To | +| ------------------------------ | ------------------------------ | ------------------------------- | +| `xrpl.rpc.command` | `xrpl_rpc_command` | `rpc.command.*` spans | +| `xrpl.rpc.status` | `xrpl_rpc_status` | `rpc.command.*` spans | +| `xrpl.consensus.mode` | `xrpl_consensus_mode` | `consensus.ledger_close` spans | +| `xrpl.tx.local` | `xrpl_tx_local` | `tx.process` spans | +| `xrpl.peer.proposal.trusted` | `xrpl_peer_proposal_trusted` | `peer.proposal.receive` spans | +| `xrpl.peer.validation.trusted` | `xrpl_peer_validation_trusted` | `peer.validation.receive` spans | ### Histogram Buckets @@ -143,9 +161,63 @@ Configured in `otel-collector-config.yaml`: 1ms, 5ms, 10ms, 25ms, 50ms, 100ms, 250ms, 500ms, 1s, 5s ``` +## StatsD Metrics (beast::insight) + +rippled has a built-in metrics framework (`beast::insight`) that emits StatsD-format metrics over UDP. These complement the span-derived RED metrics by providing system-level gauges, counters, and timers that don't map to individual trace spans. + +### Configuration + +Add to `xrpld.cfg`: + +```ini +[insight] +server=statsd +address=127.0.0.1:8125 +prefix=rippled +``` + +The OTel Collector receives these via a `statsd` receiver on UDP port 8125 and exports them to Prometheus alongside spanmetrics. 
+ +### Metric Reference + +#### Gauges + +| Prometheus Metric | Source | Description | +| --------------------------------------------- | ------------------------- | -------------------------------------------------------------------------- | +| `rippled_LedgerMaster_Validated_Ledger_Age` | LedgerMaster.h:373 | Age of validated ledger (seconds) | +| `rippled_LedgerMaster_Published_Ledger_Age` | LedgerMaster.h:374 | Age of published ledger (seconds) | +| `rippled_State_Accounting_{Mode}_duration` | NetworkOPs.cpp:774 | Time in each operating mode (Disconnected/Connected/Syncing/Tracking/Full) | +| `rippled_State_Accounting_{Mode}_transitions` | NetworkOPs.cpp:780 | Transition count per mode | +| `rippled_Peer_Finder_Active_Inbound_Peers` | PeerfinderManager.cpp:214 | Active inbound peer connections | +| `rippled_Peer_Finder_Active_Outbound_Peers` | PeerfinderManager.cpp:215 | Active outbound peer connections | +| `rippled_Overlay_Peer_Disconnects` | OverlayImpl.h:557 | Peer disconnect count | +| `rippled_job_count` | JobQueue.cpp:26 | Current job queue depth | +| `rippled_{category}_Bytes_In/Out` | OverlayImpl.h:535 | Overlay traffic bytes per category (57 categories) | +| `rippled_{category}_Messages_In/Out` | OverlayImpl.h:535 | Overlay traffic messages per category | + +#### Counters + +| Prometheus Metric | Source | Description | +| --------------------------------- | --------------------- | ------------------------------ | +| `rippled_rpc_requests` | ServerHandler.cpp:108 | Total RPC request count | +| `rippled_ledger_fetches` | InboundLedgers.cpp:44 | Ledger fetch request count | +| `rippled_ledger_history_mismatch` | LedgerHistory.cpp:16 | Ledger hash mismatch count | +| `rippled_warn` | Logic.h:33 | Resource manager warning count | +| `rippled_drop` | Logic.h:34 | Resource manager drop count | + +#### Histograms (from StatsD timers) + +| Prometheus Metric | Source | Description | +| ----------------------- | --------------------- | 
------------------------------ | +| `rippled_rpc_time` | ServerHandler.cpp:110 | RPC response time (ms) | +| `rippled_rpc_size` | ServerHandler.cpp:109 | RPC response size (bytes) | +| `rippled_ios_latency` | Application.cpp:438 | I/O service loop latency (ms) | +| `rippled_pathfind_fast` | PathRequests.h:23 | Fast pathfinding duration (ms) | +| `rippled_pathfind_full` | PathRequests.h:24 | Full pathfinding duration (ms) | + ## Grafana Dashboards -Three dashboards are pre-provisioned in `docker/telemetry/grafana/dashboards/`: +Eight dashboards are pre-provisioned in `docker/telemetry/grafana/dashboards/`: ### RPC Performance (`xrpld-rpc-perf`) @@ -155,6 +227,10 @@ Three dashboards are pre-provisioned in `docker/telemetry/grafana/dashboards/`: | RPC Latency p95 by Command | timeseries | `histogram_quantile(0.95, sum by (le, xrpl_rpc_command) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=~"rpc.command.*"}[5m])))` | `xrpl_rpc_command` | | RPC Error Rate | bargauge | Error spans / total spans × 100, grouped by `xrpl_rpc_command` | `xrpl_rpc_command`, `status_code` | | RPC Latency Heatmap | heatmap | `sum(increase(traces_span_metrics_duration_milliseconds_bucket{span_name=~"rpc.command.*"}[5m])) by (le)` | `le` (bucket boundaries) | +| Overall RPC Throughput | timeseries | `rpc.request` + `rpc.process` rate | — | +| RPC Success vs Error | timeseries | by `status_code` (UNSET vs ERROR) | `status_code` | +| Top Commands by Volume | bargauge | `topk(10, ...)` by `xrpl_rpc_command` | `xrpl_rpc_command` | +| WebSocket Message Rate | stat | `rpc.ws_message` rate | — | ### Transaction Overview (`xrpld-transactions`) @@ -164,32 +240,110 @@ Three dashboards are pre-provisioned in `docker/telemetry/grafana/dashboards/`: | Transaction Processing Latency | timeseries | `histogram_quantile(0.95 / 0.50, ... 
{span_name="tx.process"})` | — | | Transaction Path Distribution | piechart | `sum by (xrpl_tx_local) (rate(traces_span_metrics_calls_total{span_name="tx.process"}[5m]))` | `xrpl_tx_local` | | Transaction Receive vs Suppressed | timeseries | `rate(traces_span_metrics_calls_total{span_name="tx.receive"}[5m])` | — | +| TX Processing Duration Heatmap | heatmap | `tx.process` histogram buckets | `le` | +| TX Apply Duration per Ledger | timeseries | p95/p50 of `tx.apply` | — | +| Peer TX Receive Rate | timeseries | `tx.receive` rate | — | +| TX Apply Failed Rate | stat | `tx.apply` with `STATUS_CODE_ERROR` | `status_code` | ### Consensus Health (`xrpld-consensus`) -| Panel | Type | PromQL | Labels Used | -| ----------------------------- | ---------- | ---------------------------------------------------------------------------------- | ----------- | -| Consensus Round Duration | timeseries | `histogram_quantile(0.95 / 0.50, ... {span_name="consensus.accept"})` | — | -| Consensus Proposals Sent Rate | timeseries | `rate(traces_span_metrics_calls_total{span_name="consensus.proposal.send"}[5m])` | — | -| Ledger Close Duration | timeseries | `histogram_quantile(0.95, ... {span_name="consensus.ledger_close"})` | — | -| Validation Send Rate | stat | `rate(traces_span_metrics_calls_total{span_name="consensus.validation.send"}[5m])` | — | -| Ledger Apply Duration | timeseries | `histogram_quantile(0.95 / 0.50, ... {span_name="consensus.accept.apply"})` | — | -| Close Time Agreement | timeseries | `rate(traces_span_metrics_calls_total{span_name="consensus.accept.apply"}[5m])` | — | +| Panel | Type | PromQL | Labels Used | +| ----------------------------- | ---------- | ---------------------------------------------------------------------------------- | --------------------- | +| Consensus Round Duration | timeseries | `histogram_quantile(0.95 / 0.50, ... 
{span_name="consensus.accept"})` | — | +| Consensus Proposals Sent Rate | timeseries | `rate(traces_span_metrics_calls_total{span_name="consensus.proposal.send"}[5m])` | — | +| Ledger Close Duration | timeseries | `histogram_quantile(0.95, ... {span_name="consensus.ledger_close"})` | — | +| Validation Send Rate | stat | `rate(traces_span_metrics_calls_total{span_name="consensus.validation.send"}[5m])` | — | +| Ledger Apply Duration | timeseries | `histogram_quantile(0.95 / 0.50, ... {span_name="consensus.accept.apply"})` | — | +| Close Time Agreement | timeseries | `rate(traces_span_metrics_calls_total{span_name="consensus.accept.apply"}[5m])` | — | +| Consensus Mode Over Time | timeseries | `consensus.ledger_close` by `xrpl_consensus_mode` | `xrpl_consensus_mode` | +| Accept vs Close Rate | timeseries | `consensus.accept` vs `consensus.ledger_close` rate | — | +| Validation vs Close Rate | timeseries | `consensus.validation.send` vs `consensus.ledger_close` | — | +| Accept Duration Heatmap | heatmap | `consensus.accept` histogram buckets | `le` | + +### Ledger Operations (`rippled-ledger-ops`) + +| Panel | Type | PromQL | Labels Used | +| ----------------------- | ---------- | ---------------------------------------------- | ----------- | +| Ledger Build Rate | stat | `ledger.build` call rate | — | +| Ledger Build Duration | timeseries | p95/p50 of `ledger.build` | — | +| Ledger Validation Rate | stat | `ledger.validate` call rate | — | +| Build Duration Heatmap | heatmap | `ledger.build` histogram buckets | `le` | +| TX Apply Duration | timeseries | p95/p50 of `tx.apply` | — | +| TX Apply Rate | timeseries | `tx.apply` call rate | — | +| Ledger Store Rate | stat | `ledger.store` call rate | — | +| Build vs Close Duration | timeseries | p95 `ledger.build` vs `consensus.ledger_close` | — | + +### Peer Network (`rippled-peer-net`) + +Requires `trace_peer=1` in the `[telemetry]` config section. 
+ +| Panel | Type | PromQL | Labels Used | +| -------------------------------- | ---------- | --------------------------------- | ------------------------------ | +| Proposal Receive Rate | timeseries | `peer.proposal.receive` rate | — | +| Validation Receive Rate | timeseries | `peer.validation.receive` rate | — | +| Proposals Trusted vs Untrusted | piechart | by `xrpl_peer_proposal_trusted` | `xrpl_peer_proposal_trusted` | +| Validations Trusted vs Untrusted | piechart | by `xrpl_peer_validation_trusted` | `xrpl_peer_validation_trusted` | + +### Node Health — StatsD (`rippled-statsd-node-health`) + +| Panel | Type | PromQL | Labels Used | +| -------------------------- | ---------- | ------------------------------------------------------ | ----------- | +| Validated Ledger Age | stat | `rippled_LedgerMaster_Validated_Ledger_Age` | — | +| Published Ledger Age | stat | `rippled_LedgerMaster_Published_Ledger_Age` | — | +| Operating Mode Duration | timeseries | `rippled_State_Accounting_*_duration` | — | +| Operating Mode Transitions | timeseries | `rippled_State_Accounting_*_transitions` | — | +| I/O Latency | timeseries | `histogram_quantile(0.95, rippled_ios_latency_bucket)` | — | +| Job Queue Depth | timeseries | `rippled_job_count` | — | +| Ledger Fetch Rate | stat | `rate(rippled_ledger_fetches[5m])` | — | +| Ledger History Mismatches | stat | `rate(rippled_ledger_history_mismatch[5m])` | — | + +### Network Traffic — StatsD (`rippled-statsd-network`) + +| Panel | Type | PromQL | Labels Used | +| ---------------------- | ---------- | -------------------------------------- | ----------- | +| Active Peers | timeseries | `rippled_Peer_Finder_Active_*_Peers` | — | +| Peer Disconnects | timeseries | `rippled_Overlay_Peer_Disconnects` | — | +| Total Network Bytes | timeseries | `rippled_total_Bytes_In/Out` | — | +| Total Network Messages | timeseries | `rippled_total_Messages_In/Out` | — | +| Transaction Traffic | timeseries | `rippled_transactions_Messages_In/Out` | — 
| +| Proposal Traffic | timeseries | `rippled_proposals_Messages_In/Out` | — | +| Validation Traffic | timeseries | `rippled_validations_Messages_In/Out` | — | +| Traffic by Category | bargauge | `topk(10, rippled_*_Bytes_In)` | — | + +### RPC & Pathfinding — StatsD (`rippled-statsd-rpc`) + +| Panel | Type | PromQL | Labels Used | +| ------------------------- | ---------- | -------------------------------------------------------- | ----------- | +| RPC Request Rate | stat | `rate(rippled_rpc_requests[5m])` | — | +| RPC Response Time | timeseries | `histogram_quantile(0.95, rippled_rpc_time_bucket)` | — | +| RPC Response Size | timeseries | `histogram_quantile(0.95, rippled_rpc_size_bucket)` | — | +| RPC Response Time Heatmap | heatmap | `rippled_rpc_time_bucket` | — | +| Pathfinding Fast Duration | timeseries | `histogram_quantile(0.95, rippled_pathfind_fast_bucket)` | — | +| Pathfinding Full Duration | timeseries | `histogram_quantile(0.95, rippled_pathfind_full_bucket)` | — | +| Resource Warnings Rate | stat | `rate(rippled_warn[5m])` | — | +| Resource Drops Rate | stat | `rate(rippled_drop[5m])` | — | ### Span → Metric → Dashboard Summary | Span Name | Prometheus Metric Filter | Grafana Dashboard | | --------------------------- | ----------------------------------------- | --------------------------------------------- | -| `rpc.request` | `{span_name="rpc.request"}` | — (available but not paneled) | -| `rpc.process` | `{span_name="rpc.process"}` | — (available but not paneled) | -| `rpc.command.*` | `{span_name=~"rpc.command.*"}` | RPC Performance (all 4 panels) | -| `tx.process` | `{span_name="tx.process"}` | Transaction Overview (3 panels) | -| `tx.receive` | `{span_name="tx.receive"}` | Transaction Overview (2 panels) | -| `consensus.accept` | `{span_name="consensus.accept"}` | Consensus Health (Round Duration) | +| `rpc.request` | `{span_name="rpc.request"}` | RPC Performance (Overall Throughput) | +| `rpc.process` | `{span_name="rpc.process"}` | RPC 
Performance (Overall Throughput) | +| `rpc.ws_message` | `{span_name="rpc.ws_message"}` | RPC Performance (WebSocket Rate) | +| `rpc.command.*` | `{span_name=~"rpc.command.*"}` | RPC Performance (Rate, Latency, Error, Top) | +| `tx.process` | `{span_name="tx.process"}` | Transaction Overview (Rate, Latency, Heatmap) | +| `tx.receive` | `{span_name="tx.receive"}` | Transaction Overview (Rate, Receive) | +| `tx.apply` | `{span_name="tx.apply"}` | Transaction Overview + Ledger Ops (Apply) | +| `consensus.accept` | `{span_name="consensus.accept"}` | Consensus Health (Duration, Rate, Heatmap) | | `consensus.proposal.send` | `{span_name="consensus.proposal.send"}` | Consensus Health (Proposals Rate) | -| `consensus.ledger_close` | `{span_name="consensus.ledger_close"}` | Consensus Health (Close Duration) | +| `consensus.ledger_close` | `{span_name="consensus.ledger_close"}` | Consensus Health (Close, Mode) | | `consensus.validation.send` | `{span_name="consensus.validation.send"}` | Consensus Health (Validation Rate) | | `consensus.accept.apply` | `{span_name="consensus.accept.apply"}` | Consensus Health (Apply Duration, Close Time) | +| `ledger.build` | `{span_name="ledger.build"}` | Ledger Ops (Build Rate, Duration, Heatmap) | +| `ledger.validate` | `{span_name="ledger.validate"}` | Ledger Ops (Validation Rate) | +| `ledger.store` | `{span_name="ledger.store"}` | Ledger Ops (Store Rate) | +| `peer.proposal.receive` | `{span_name="peer.proposal.receive"}` | Peer Network (Rate, Trusted/Untrusted) | +| `peer.validation.receive` | `{span_name="peer.validation.receive"}` | Peer Network (Rate, Trusted/Untrusted) | ## Troubleshooting diff --git a/src/xrpld/app/ledger/detail/BuildLedger.cpp b/src/xrpld/app/ledger/detail/BuildLedger.cpp index 8f5184336a..d7221e2c21 100644 --- a/src/xrpld/app/ledger/detail/BuildLedger.cpp +++ b/src/xrpld/app/ledger/detail/BuildLedger.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -13,8 +14,10 @@ #include #include #include 
+#include #include #include +#include #include #include @@ -41,6 +44,9 @@ buildLedgerImpl( beast::Journal j, ApplyTxs&& applyTxs) { + using namespace telemetry; + auto buildSpan = SpanGuard::span(TraceCategory::Ledger, seg::ledger, ledger_span::op::build); + auto built = std::make_shared(*parent, closeTime); if (built->isFlagLedger()) @@ -74,6 +80,14 @@ buildLedgerImpl( built->header().seq < XRP_LEDGER_EARLIEST_FEES || built->read(keylet::fees()), "xrpl::buildLedgerImpl : valid ledger fees"); built->setAccepted(closeTime, closeResolution, closeTimeCorrect); + buildSpan.setAttribute(ledger_span::attr::seq, static_cast(built->header().seq)); + buildSpan.setAttribute( + ledger_span::attr::closeTime, static_cast(closeTime.time_since_epoch().count())); + buildSpan.setAttribute(ledger_span::attr::closeTimeCorrect, closeTimeCorrect); + buildSpan.setAttribute( + ledger_span::attr::closeResolutionMs, + static_cast( + std::chrono::duration_cast(closeResolution).count())); return built; } @@ -97,6 +111,9 @@ applyTransactions( OpenView& view, beast::Journal j) { + using namespace telemetry; + auto applySpan = SpanGuard::span(TraceCategory::Transactions, seg::tx, ledger_span::op::apply); + bool certainRetry = true; std::size_t count = 0; @@ -163,6 +180,8 @@ applyTransactions( // If there are any transactions left, we must have // tried them in at least one final pass XRPL_ASSERT(txns.empty() || !certainRetry, "xrpl::applyTransactions : retry transactions"); + applySpan.setAttribute(ledger_span::attr::txCount, static_cast(count)); + applySpan.setAttribute(ledger_span::attr::txFailed, static_cast(failed.size())); return count; } diff --git a/src/xrpld/app/ledger/detail/LedgerMaster.cpp b/src/xrpld/app/ledger/detail/LedgerMaster.cpp index c53249fa07..df62dc36f1 100644 --- a/src/xrpld/app/ledger/detail/LedgerMaster.cpp +++ b/src/xrpld/app/ledger/detail/LedgerMaster.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -55,6 +56,7 @@ #include 
#include #include +#include #include @@ -449,6 +451,10 @@ LedgerMaster::fixIndex(LedgerIndex ledgerIndex, LedgerHash const& ledgerHash) bool LedgerMaster::storeLedger(std::shared_ptr ledger) { + using namespace telemetry; + auto span = SpanGuard::span(TraceCategory::Ledger, seg::ledger, ledger_span::op::store); + span.setAttribute(ledger_span::attr::seq, static_cast(ledger->header().seq)); + bool const validated = ledger->header().validated; // Returns true if we already had the ledger return mLedgerHistory.insert(ledger, validated); @@ -965,6 +971,11 @@ LedgerMaster::checkAccept(std::shared_ptr const& ledger) return; } + using namespace telemetry; + auto valSpan = SpanGuard::span(TraceCategory::Ledger, seg::ledger, ledger_span::op::validate); + valSpan.setAttribute(ledger_span::attr::seq, static_cast(ledger->header().seq)); + valSpan.setAttribute(ledger_span::attr::validations, static_cast(tvc)); + JLOG(m_journal.info()) << "Advancing accepted ledger to " << ledger->header().seq << " with >= " << minVal << " validations"; diff --git a/src/xrpld/app/ledger/detail/LedgerSpanNames.h b/src/xrpld/app/ledger/detail/LedgerSpanNames.h new file mode 100644 index 0000000000..f6b5af6c51 --- /dev/null +++ b/src/xrpld/app/ledger/detail/LedgerSpanNames.h @@ -0,0 +1,54 @@ +#pragma once + +/** Compile-time span name constants for ledger tracing. + * + * Used by BuildLedger and LedgerMaster for ledger lifecycle spans. + * Built on StaticStr/join() from SpanNames.h. 
+ * + * Span hierarchy: + * + * ledger.build (BuildLedger — ledger construction) + * ledger.store (LedgerMaster — ledger storage) + * ledger.validate (LedgerMaster — ledger validation acceptance) + * tx.apply (BuildLedger — transaction application) + */ + +#include + +namespace xrpl { +namespace telemetry { +namespace ledger_span { + +// ===== Span operation suffixes =============================================== + +namespace op { +inline constexpr auto build = makeStr("build"); +inline constexpr auto store = makeStr("store"); +inline constexpr auto validate = makeStr("validate"); +inline constexpr auto apply = makeStr("apply"); +} // namespace op + +// ===== Attribute keys ======================================================== + +namespace attr { +inline constexpr auto xrplLedger = join(seg::xrpl, seg::ledger); + +/// "xrpl.ledger.seq" +inline constexpr auto seq = join(xrplLedger, makeStr("seq")); +/// "xrpl.ledger.close_time" +inline constexpr auto closeTime = join(xrplLedger, makeStr("close_time")); +/// "xrpl.ledger.close_time_correct" +inline constexpr auto closeTimeCorrect = join(xrplLedger, makeStr("close_time_correct")); +/// "xrpl.ledger.close_resolution_ms" +inline constexpr auto closeResolutionMs = join(xrplLedger, makeStr("close_resolution_ms")); +/// "xrpl.ledger.tx_count" +inline constexpr auto txCount = join(xrplLedger, makeStr("tx_count")); +/// "xrpl.ledger.tx_failed" +inline constexpr auto txFailed = join(xrplLedger, makeStr("tx_failed")); +/// "xrpl.ledger.validations" +inline constexpr auto validations = join(xrplLedger, makeStr("validations")); +} // namespace attr + +} // namespace ledger_span +} // namespace telemetry +} // namespace xrpl diff --git a/src/xrpld/overlay/detail/PeerImp.cpp b/src/xrpld/overlay/detail/PeerImp.cpp index 16f8484243..b7ed681049 100644 --- a/src/xrpld/overlay/detail/PeerImp.cpp +++ b/src/xrpld/overlay/detail/PeerImp.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -1863,6 
+1864,10 @@ PeerImp::onMessage(std::shared_ptr const& m) void PeerImp::onMessage(std::shared_ptr const& m) { + using namespace telemetry; + auto span = SpanGuard::span(TraceCategory::Peer, seg::peer, peer_span::op::proposalReceive); + span.setAttribute(peer_span::attr::id, static_cast(id_)); + protocol::TMProposeSet const& set = *m; auto const sig = makeSlice(set.signature()); @@ -1889,6 +1894,7 @@ PeerImp::onMessage(std::shared_ptr const& m) // every time a spam packet is received PublicKey const publicKey{makeSlice(set.nodepubkey())}; auto const isTrusted = app_.getValidators().trusted(publicKey); + span.setAttribute(peer_span::attr::proposalTrusted, isTrusted); // If the operator has specified that untrusted proposals be dropped then // this happens here I.e. before further wasting CPU verifying the signature @@ -2459,6 +2465,11 @@ PeerImp::onMessage(std::shared_ptr const& m void PeerImp::onMessage(std::shared_ptr const& m) { + using namespace telemetry; + auto valSpan = + SpanGuard::span(TraceCategory::Peer, seg::peer, peer_span::op::validationReceive); + valSpan.setAttribute(peer_span::attr::id, static_cast(id_)); + if (m->validation().size() < 50) { JLOG(p_journal_.warn()) << "Validation: Too small"; @@ -2481,6 +2492,9 @@ PeerImp::onMessage(std::shared_ptr const& m) false); val->setSeen(closeTime); } + valSpan.setAttribute( + peer_span::attr::validationLedgerHash, to_string(val->getLedgerHash()).c_str()); + valSpan.setAttribute(peer_span::attr::validationFull, val->isFull()); if (!isCurrent( app_.getValidations().parms(), @@ -2497,6 +2511,7 @@ PeerImp::onMessage(std::shared_ptr const& m) // suppression for 30 seconds to avoid doing a relatively expensive // lookup every time a spam packet is received auto const isTrusted = app_.getValidators().trusted(val->getSignerPublic()); + valSpan.setAttribute(peer_span::attr::validationTrusted, isTrusted); // If the operator has specified that untrusted validations be // dropped then this happens here I.e. 
before further wasting CPU diff --git a/src/xrpld/overlay/detail/PeerSpanNames.h b/src/xrpld/overlay/detail/PeerSpanNames.h new file mode 100644 index 0000000000..cbeeed528b --- /dev/null +++ b/src/xrpld/overlay/detail/PeerSpanNames.h @@ -0,0 +1,50 @@ +#pragma once + +/** Compile-time span name constants for peer overlay tracing. + * + * Used by PeerImp for peer message handling spans (proposals, + * validations). Built on StaticStr/join() from SpanNames.h. + * + * Span hierarchy: + * + * peer.proposal.receive (PeerImp — incoming proposal) + * peer.validation.receive (PeerImp — incoming validation) + */ + +#include + +namespace xrpl { +namespace telemetry { +namespace peer_span { + +// ===== Span operation suffixes =============================================== + +namespace op { +inline constexpr auto proposalReceive = makeStr("proposal.receive"); +inline constexpr auto validationReceive = makeStr("validation.receive"); +} // namespace op + +// ===== Attribute keys ======================================================== + +namespace attr { +inline constexpr auto xrplPeer = join(seg::xrpl, seg::peer); + +/// "xrpl.peer.id" +inline constexpr auto id = join(xrplPeer, makeStr("id")); +/// "xrpl.peer.proposal.trusted" +inline constexpr auto proposalTrusted = + join(join(xrplPeer, makeStr("proposal")), makeStr("trusted")); + +/// "xrpl.peer.validation.ledger_hash" +inline constexpr auto validationLedgerHash = + join(join(xrplPeer, makeStr("validation")), makeStr("ledger_hash")); +/// "xrpl.peer.validation.full" +inline constexpr auto validationFull = join(join(xrplPeer, makeStr("validation")), makeStr("full")); +/// "xrpl.peer.validation.trusted" +inline constexpr auto validationTrusted = + join(join(xrplPeer, makeStr("validation")), makeStr("trusted")); +} // namespace attr + +} // namespace peer_span +} // namespace telemetry +} // namespace xrpl