From 892fee638afcd5c7a9e1459d02ed8eaf0bee53a5 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Fri, 20 Mar 2026 17:23:06 +0000 Subject: [PATCH] Phase 9: Metric gap fill - nodestore, cache, TxQ, load factor dashboards Co-Authored-By: Claude Opus 4.6 --- .../scripts/levelization/results/loops.txt | 3 + .../scripts/levelization/results/ordering.txt | 7 +- OpenTelemetryPlan/06-implementation-phases.md | 313 ++++++++++- OpenTelemetryPlan/08-appendix.md | 64 ++- .../09-data-collection-reference.md | 335 +++++++++++- OpenTelemetryPlan/Phase10_taskList.md | 242 +++++++++ OpenTelemetryPlan/Phase11_taskList.md | 453 ++++++++++++++++ OpenTelemetryPlan/Phase9_taskList.md | 312 +++++++++++ .../dashboards/rippled-fee-market.json | 343 ++++++++++++ .../grafana/dashboards/rippled-job-queue.json | 395 ++++++++++++++ .../grafana/dashboards/rippled-rpc-perf.json | 404 ++++++++++++++ .../dashboards/system-node-health.json | 349 +++++++++++- docker/telemetry/integration-test.sh | 48 ++ include/xrpl/core/ServiceRegistry.h | 9 +- src/tests/libxrpl/CMakeLists.txt | 9 + .../libxrpl/telemetry/MetricsRegistry.cpp | 346 ++++++++++++ src/xrpld/app/main/Application.cpp | 37 ++ src/xrpld/perflog/detail/PerfLogImp.cpp | 18 + src/xrpld/telemetry/MetricsRegistry.cpp | 513 ++++++++++++++++++ src/xrpld/telemetry/MetricsRegistry.h | 284 ++++++++++ 20 files changed, 4453 insertions(+), 31 deletions(-) create mode 100644 OpenTelemetryPlan/Phase10_taskList.md create mode 100644 OpenTelemetryPlan/Phase11_taskList.md create mode 100644 OpenTelemetryPlan/Phase9_taskList.md create mode 100644 docker/telemetry/grafana/dashboards/rippled-fee-market.json create mode 100644 docker/telemetry/grafana/dashboards/rippled-job-queue.json create mode 100644 docker/telemetry/grafana/dashboards/rippled-rpc-perf.json create mode 100644 src/tests/libxrpl/telemetry/MetricsRegistry.cpp create mode 100644 src/xrpld/telemetry/MetricsRegistry.cpp create mode 100644 
src/xrpld/telemetry/MetricsRegistry.h diff --git a/.github/scripts/levelization/results/loops.txt b/.github/scripts/levelization/results/loops.txt index 7914704f9d..1110b0b298 100644 --- a/.github/scripts/levelization/results/loops.txt +++ b/.github/scripts/levelization/results/loops.txt @@ -16,6 +16,9 @@ Loop: xrpld.app xrpld.rpc Loop: xrpld.app xrpld.shamap xrpld.shamap ~= xrpld.app +Loop: xrpld.app xrpld.telemetry + xrpld.telemetry ~= xrpld.app + Loop: xrpld.overlay xrpld.rpc xrpld.rpc ~= xrpld.overlay diff --git a/.github/scripts/levelization/results/ordering.txt b/.github/scripts/levelization/results/ordering.txt index 7a8023d61c..2e7ff014fd 100644 --- a/.github/scripts/levelization/results/ordering.txt +++ b/.github/scripts/levelization/results/ordering.txt @@ -180,6 +180,7 @@ test.toplevel > xrpl.json test.unit_test > xrpl.basics test.unit_test > xrpl.protocol tests.libxrpl > xrpl.basics +tests.libxrpl > xrpl.core tests.libxrpl > xrpld.telemetry tests.libxrpl > xrpl.json tests.libxrpl > xrpl.net @@ -229,7 +230,6 @@ xrpld.app > xrpl.basics xrpld.app > xrpl.core xrpld.app > xrpld.consensus xrpld.app > xrpld.core -xrpld.app > xrpld.telemetry xrpld.app > xrpl.json xrpld.app > xrpl.ledger xrpld.app > xrpl.net @@ -271,6 +271,7 @@ xrpld.peerfinder > xrpl.rdb xrpld.perflog > xrpl.basics xrpld.perflog > xrpl.core xrpld.perflog > xrpld.rpc +xrpld.perflog > xrpld.telemetry xrpld.perflog > xrpl.json xrpld.rpc > xrpl.basics xrpld.rpc > xrpl.core @@ -286,4 +287,8 @@ xrpld.rpc > xrpl.resource xrpld.rpc > xrpl.server xrpld.rpc > xrpl.tx xrpld.shamap > xrpl.shamap +xrpld.telemetry > xrpl.basics +xrpld.telemetry > xrpl.core +xrpld.telemetry > xrpl.nodestore +xrpld.telemetry > xrpl.server xrpld.telemetry > xrpl.telemetry diff --git a/OpenTelemetryPlan/06-implementation-phases.md b/OpenTelemetryPlan/06-implementation-phases.md index 1ae9ce59a3..75e62895c2 100644 --- a/OpenTelemetryPlan/06-implementation-phases.md +++ b/OpenTelemetryPlan/06-implementation-phases.md @@ -63,6 
+63,15 @@ gantt section Phase 8 Log-Trace Correlation :p8, after p7, 1w + + section Phase 9 (Future) + Internal Metric Gap Fill :p9, after p8, 2.5w + + section Phase 10 (Future) + Workload Validation :p10, after p9, 2w + + section Phase 11 (Future) + Third-Party Collection :p11, after p10, 3w ``` --- @@ -656,6 +665,266 @@ flowchart LR --- +## 6.8.2 Phase 9: Internal Metric Instrumentation Gap Fill (Weeks 14-15) — Future Enhancement + +> **Status**: Planned, not yet implemented. + +### Motivation + +Phases 1-8 establish trace spans, StatsD metrics bridge, native OTel metrics, and log-trace correlation. However, ~50+ metrics that exist inside rippled's `get_counts`, `server_info`, TxQ, PerfLog, and `CountedObject` systems have **no time-series export path**. These are the metrics that exchanges, payment processors, analytics providers, validators, and researchers need most — NodeStore I/O performance, cache hit rates, per-RPC-method counters, transaction queue depth, fee escalation levels, and live object instance counts. + +### Architecture + +Hybrid approach — two instrumentation strategies based on proximity to existing code: + +```mermaid +flowchart TB + subgraph rippled["rippled process"] + subgraph existing["Existing beast::insight registrations"] + NS["NodeStore I/O
(Database.cpp)"] + end + subgraph newreg["New OTel MetricsRegistry"] + CR["Cache Hit Rates
(async gauge callbacks)"] + TQ["TxQ Metrics
(async gauge callbacks)"] + PL["PerfLog RPC/Job
(counters + histograms)"] + CO["CountedObjects
(async gauge callbacks)"] + LF["Load Factors
(async gauge callbacks)"] + end + end + + subgraph export["Export Pipelines"] + BI["beast::insight
OTelCollector (Phase 7)"] + OS["OTel Metrics SDK
PeriodicMetricReader"] + end + + NS --> BI + CR --> OS + TQ --> OS + PL --> OS + CO --> OS + LF --> OS + + BI --> OTLP["OTLP/HTTP :4318
/v1/metrics"] + OS --> OTLP + + style rippled fill:#1a2633,color:#ccc,stroke:#4a90d9 + style existing fill:#2a4a6b,color:#fff,stroke:#4a90d9 + style newreg fill:#2a4a6b,color:#fff,stroke:#4a90d9 + style export fill:#1a3320,color:#ccc,stroke:#5cb85c + style NS fill:#4a90d9,color:#fff,stroke:#2a6db5 + style CR fill:#5cb85c,color:#fff,stroke:#3d8b3d + style TQ fill:#5cb85c,color:#fff,stroke:#3d8b3d + style PL fill:#5cb85c,color:#fff,stroke:#3d8b3d + style CO fill:#5cb85c,color:#fff,stroke:#3d8b3d + style LF fill:#5cb85c,color:#fff,stroke:#3d8b3d + style BI fill:#449d44,color:#fff,stroke:#2d6e2d + style OS fill:#449d44,color:#fff,stroke:#2d6e2d + style OTLP fill:#f0ad4e,color:#000,stroke:#c78c2e +``` + +- **beast::insight extensions** (blue): NodeStore I/O metrics added near existing `Database.cpp` registrations — exported via Phase 7's `OTelCollector`. +- **OTel MetricsRegistry** (green): New centralized class using `ObservableGauge` async callbacks for cache, TxQ, PerfLog, CountedObjects, and load factors — polled at 10s intervals by `PeriodicMetricReader`. 
+ +### Third-Party Consumer Context + +| Consumer Category | Key Metrics They Need From Phase 9 | +| ---------------------- | --------------------------------------------------------------- | +| Exchanges | Fee escalation levels, TxQ depth, settlement latency | +| Payment Processors | Load factors, io_latency, transaction throughput | +| Analytics Providers | NodeStore I/O, cache hit rates, counted objects | +| Validators / Operators | Per-job execution times, PerfLog RPC counters, consensus timing | +| Academic Researchers | Consensus performance time-series, fee market dynamics | +| Institutional Custody | Server health scores, reserve calculations, node availability | + +### Tasks + +| Task | Description | +| ---- | ----------------------------------------- | +| 9.1 | NodeStore I/O metrics | +| 9.2 | Cache hit rate metrics + MetricsRegistry | +| 9.3 | TxQ metrics | +| 9.4 | PerfLog per-RPC metrics | +| 9.5 | PerfLog per-job metrics | +| 9.6 | Counted object instance metrics | +| 9.7 | Fee escalation & load factor metrics | +| 9.8 | New Grafana dashboards (2 new, 2 updated) | +| 9.9 | Update documentation | +| 9.10 | Integration tests | + +See [Phase9_taskList.md](./Phase9_taskList.md) for detailed per-task breakdown. + +### Exit Criteria + +- [ ] All ~50 new metrics visible in Prometheus via OTLP pipeline +- [ ] `MetricsRegistry` class registers/deregisters cleanly with OTel SDK +- [ ] 2 new Grafana dashboards operational (Fee Market, Job Queue) +- [ ] No performance regression (< 0.5% CPU overhead from new callbacks) +- [ ] Documentation updated with full new metric inventory + +--- + +## 6.8.3 Phase 10: Synthetic Workload Generation & Telemetry Validation (Weeks 16-17) — Future Enhancement + +> **Status**: Planned, not yet implemented. 
+ +### Motivation + +Before the telemetry stack (Phases 1-9) can be considered production-ready, we need automated proof that all 16 spans, 22 attributes, 300+ metrics, 10 Grafana dashboards, and log-trace correlation work correctly under realistic load. This phase establishes a reusable CI-integrated validation suite and performance benchmark baseline. + +### Architecture + +```mermaid +flowchart LR + subgraph harness["Docker Compose Workload Harness"] + direction TB + V1["Validator 1"] ~~~ V2["Validator 2"] ~~~ V3["Validator 3"] + V4["Validator 4"] ~~~ V5["Validator 5"] + end + + subgraph generators["Workload Generators"] + RPC["RPC Load Generator
(configurable RPS,
command distribution)"] + TX["Transaction Submitter
(Payment, Offer, NFT,
Escrow, AMM mix)"] + end + + subgraph validation["Validation Suite"] + SV["Span Validator
(Jaeger/Tempo API)"] + MV["Metric Validator
(Prometheus API)"] + LV["Log-Trace Validator
(Loki API)"] + DV["Dashboard Validator
(Grafana API)"] + BM["Benchmark Suite
(CPU, memory, latency
ON vs OFF comparison)"] + end + + generators --> harness + harness --> validation + + style harness fill:#1a2633,color:#ccc,stroke:#4a90d9 + style generators fill:#1a3320,color:#ccc,stroke:#5cb85c + style validation fill:#332a1a,color:#ccc,stroke:#f0ad4e + style V1 fill:#4a90d9,color:#fff,stroke:#2a6db5 + style V2 fill:#4a90d9,color:#fff,stroke:#2a6db5 + style V3 fill:#4a90d9,color:#fff,stroke:#2a6db5 + style V4 fill:#4a90d9,color:#fff,stroke:#2a6db5 + style V5 fill:#4a90d9,color:#fff,stroke:#2a6db5 + style RPC fill:#5cb85c,color:#fff,stroke:#3d8b3d + style TX fill:#5cb85c,color:#fff,stroke:#3d8b3d + style SV fill:#f0ad4e,color:#000,stroke:#c78c2e + style MV fill:#f0ad4e,color:#000,stroke:#c78c2e + style LV fill:#f0ad4e,color:#000,stroke:#c78c2e + style DV fill:#f0ad4e,color:#000,stroke:#c78c2e + style BM fill:#f0ad4e,color:#000,stroke:#c78c2e +``` + +### Tasks + +| Task | Description | +| ---- | -------------------------------------- | +| 10.1 | Multi-node test harness (5 validators) | +| 10.2 | RPC load generator | +| 10.3 | Transaction submitter (6+ tx types) | +| 10.4 | Telemetry validation suite | +| 10.5 | Performance benchmark suite | +| 10.6 | CI integration | +| 10.7 | Documentation | + +See [Phase10_taskList.md](./Phase10_taskList.md) for detailed per-task breakdown. + +### Exit Criteria + +- [ ] 5-node validator cluster starts and reaches consensus in docker-compose +- [ ] Validation suite confirms all 16 spans, 22 attributes, 300+ metrics +- [ ] All 10 Grafana dashboards render data (no empty panels) +- [ ] Benchmark shows < 3% CPU overhead, < 5MB memory overhead +- [ ] CI workflow runs validation on telemetry branch changes + +--- + +## 6.8.4 Phase 11: Third-Party Data Collection Pipelines (Weeks 18-20) — Future Enhancement + +> **Status**: Planned, not yet implemented. + +### Motivation + +rippled has no native Prometheus/OTLP metrics export for data accessible only via JSON-RPC (`server_info`, `get_counts`, `fee`, `peers`, `validators`, `feature`). 
Every external consumer — exchanges, payment processors, analytics providers, validators, compliance firms, DeFi protocols, researchers, custodians, and CBDC platforms — must build custom JSON-RPC polling and conversion pipelines. This phase centralizes that work into a reusable custom OTel Collector receiver. + +### Architecture + +```mermaid +flowchart LR + subgraph receiver["Custom OTel Collector Receiver (Go)"] + direction TB + SI["server_info
collector"] + GC["get_counts
collector"] + FE["fee
collector"] + PE["peers
collector"] + VA["validators
collector"] + DX["DEX/AMM
collector
(optional)"] + end + + rippled["rippled
Admin RPC
:5005"] -->|"JSON-RPC
poll every 30s"| receiver + + receiver -->|"xrpl_* metrics"| PROM["Prometheus
:9090"] + receiver -->|"OTLP export"| OTLP["Any OTLP-
compatible
backend"] + + PROM --> GF["Grafana
4 new dashboards"] + PROM --> AL["Prometheus
Alerting Rules"] + + style receiver fill:#1a3320,color:#ccc,stroke:#5cb85c + style SI fill:#5cb85c,color:#fff,stroke:#3d8b3d + style GC fill:#5cb85c,color:#fff,stroke:#3d8b3d + style FE fill:#5cb85c,color:#fff,stroke:#3d8b3d + style PE fill:#5cb85c,color:#fff,stroke:#3d8b3d + style VA fill:#5cb85c,color:#fff,stroke:#3d8b3d + style DX fill:#449d44,color:#fff,stroke:#2d6e2d + style rippled fill:#4a90d9,color:#fff,stroke:#2a6db5 + style PROM fill:#f0ad4e,color:#000,stroke:#c78c2e + style OTLP fill:#f0ad4e,color:#000,stroke:#c78c2e + style GF fill:#5bc0de,color:#000,stroke:#3aa8c1 + style AL fill:#d9534f,color:#fff,stroke:#b52d2d +``` + +### Third-Party Consumer Gap Analysis + +| Consumer Category | Data Unlocked by Phase 11 | +| ---------------------- | ------------------------------------------------------------ | +| Exchanges | Real-time fee estimates, TxQ capacity, server health scores | +| Payment Processors | Settlement latency percentiles, corridor health | +| Analytics Providers | Validator metrics, network topology, amendment voting status | +| DeFi / AMM | AMM pool TVL, DEX order book depth, trade volumes | +| Validators / Operators | Per-peer latency, version distribution, UNL health, alerting | +| Compliance | Transaction volume trends, network growth metrics | +| Academic Researchers | Consensus performance time-series, decentralization metrics | +| CBDC / Tokenization | Token supply tracking, trust line adoption, freeze status | +| Institutional Custody | Multi-sig status, escrow tracking, reserve calculations | +| Wallet Providers | Server health for node selection, fee prediction data | + +### Tasks + +| Task | Description | +| ----- | ------------------------------------- | +| 11.1 | OTel Collector receiver scaffold (Go) | +| 11.2 | server_info / server_state collector | +| 11.3 | get_counts collector | +| 11.4 | Peer topology collector | +| 11.5 | Validator & amendment collector | +| 11.6 | Fee & TxQ collector | +| 11.7 | DEX & AMM collector 
(optional) | +| 11.8 | Prometheus alerting rules | +| 11.9 | New Grafana dashboards (4) | +| 11.10 | Integration with Phase 10 validation | +| 11.11 | Documentation | + +See [Phase11_taskList.md](./Phase11_taskList.md) for detailed per-task breakdown. + +### Exit Criteria + +- [ ] Custom OTel Collector receiver exports all `xrpl_*` metrics to Prometheus +- [ ] 4 new Grafana dashboards operational (Validator Health, Network Topology, Fee Market, DEX/AMM) +- [ ] Prometheus alerting rules fire correctly for simulated failures +- [ ] Receiver handles rippled restart/unavailability gracefully +- [ ] Go receiver has unit tests with >80% coverage + +--- + ## 6.9 Risk Assessment ```mermaid @@ -853,14 +1122,13 @@ quadrantChart --- - -## 6.13 Definition of Done +## 6.12 Definition of Done > **TxQ** = Transaction Queue | **HA** = High Availability Clear, measurable criteria for each phase. -### 6.13.1 Phase 1: Core Infrastructure +### 6.12.1 Phase 1: Core Infrastructure | Criterion | Measurement | Target | @@ -873,8 +1141,7 @@ Clear, measurable criteria for each phase. **Definition of Done**: All criteria met, PR merged, no regressions in CI. - -### 6.13.2 Phase 2: RPC Tracing +### 6.12.2 Phase 2: RPC Tracing | Criterion | Measurement | Target | @@ -888,7 +1155,7 @@ Clear, measurable criteria for each phase. **Definition of Done**: RPC traces visible in Tempo for all commands, dashboard shows latency distribution. -### 6.13.3 Phase 3: Transaction Tracing +### 6.12.3 Phase 3: Transaction Tracing | Criterion | Measurement | Target | @@ -901,8 +1168,7 @@ Clear, measurable criteria for each phase. **Definition of Done**: Transaction traces span 3+ nodes in test network, performance within bounds. - -### 6.13.4 Phase 4: Consensus Tracing +### 6.12.4 Phase 4: Consensus Tracing | Criterion | Measurement | Target | @@ -915,8 +1181,7 @@ Clear, measurable criteria for each phase. **Definition of Done**: Consensus rounds fully traceable, no impact on consensus timing. 
- -### 6.13.5 Phase 5: Production Deployment +### 6.12.5 Phase 5: Production Deployment | Criterion | Measurement | Target | @@ -930,23 +1195,25 @@ Clear, measurable criteria for each phase. **Definition of Done**: Telemetry running in production, operators trained, alerts active. +### 6.12.6 Success Metrics Summary -### 6.13.6 Success Metrics Summary - -| Phase | Primary Metric | Secondary Metric | Deadline | -| ------- | ---------------------------- | --------------------------- | -------------- | -| Phase 1 | SDK compiles and runs | Zero overhead when disabled | End of Week 2 | -| Phase 2 | 100% RPC coverage | <1ms latency overhead | End of Week 4 | -| Phase 3 | Cross-node traces work | <5% throughput impact | End of Week 6 | -| Phase 4 | Consensus fully traced | No consensus timing impact | End of Week 8 | -| Phase 5 | Production deployment | Operators trained | End of Week 9 | -| Phase 6 | StatsD metrics in Prometheus | 3 dashboards operational | End of Week 10 | -| Phase 7 | All metrics via OTLP | No StatsD dependency | End of Week 12 | -| Phase 8 | trace_id in logs + Loki | Tempo↔Loki correlation | End of Week 13 | +| Phase | Primary Metric | Secondary Metric | Deadline | Status | +| -------- | -------------------------------- | --------------------------- | -------------- | ------------------ | +| Phase 1 | SDK compiles and runs | Zero overhead when disabled | End of Week 2 | Active | +| Phase 2 | 100% RPC coverage | <1ms latency overhead | End of Week 4 | Active | +| Phase 3 | Cross-node traces work | <5% throughput impact | End of Week 6 | Active | +| Phase 4 | Consensus fully traced | No consensus timing impact | End of Week 8 | Active | +| Phase 5 | Production deployment | Operators trained | End of Week 9 | Active | +| Phase 6 | StatsD metrics in Prometheus | 3 dashboards operational | End of Week 10 | Active | +| Phase 7 | All metrics via OTLP | No StatsD dependency | End of Week 12 | Active | +| Phase 8 | trace_id in logs + Loki | Tempo↔Loki 
correlation | End of Week 13 | Active | +| Phase 9 | 50+ new internal metrics in Prom | 2 new dashboards | End of Week 15 | Future Enhancement | +| Phase 10 | Full telemetry stack validated | < 3% CPU overhead proven | End of Week 17 | Future Enhancement | +| Phase 11 | Third-party metrics via receiver | 4 new dashboards + alerting | End of Week 20 | Future Enhancement | --- -## 6.14 Recommended Implementation Order +## 6.13 Recommended Implementation Order Based on ROI analysis, implement in this exact order: diff --git a/OpenTelemetryPlan/08-appendix.md b/OpenTelemetryPlan/08-appendix.md index 0b64b19067..b6e12fd318 100644 --- a/OpenTelemetryPlan/08-appendix.md +++ b/OpenTelemetryPlan/08-appendix.md @@ -43,6 +43,18 @@ | **LoadManager** | Dynamic fee escalation based on network load | | **SHAMap** | SHA-256 hash-based map (Merkle trie variant) for ledger state | +### Phase 9–11 Terms + +| Term | Definition | +| --------------------------- | ------------------------------------------------------------------------- | +| **MetricsRegistry** | Centralized class for OTel async gauge registrations (Phase 9) | +| **ObservableGauge** | OTel Metrics SDK async instrument polled via callback at fixed intervals | +| **PeriodicMetricReader** | OTel SDK component that invokes gauge callbacks at configurable intervals | +| **CountedObject** | rippled template that tracks live instance counts via atomic counters | +| **TxQ** | Transaction queue managing fee escalation and ordering | +| **Load Factor** | Combined multiplier affecting transaction cost (local, cluster, network) | +| **OTel Collector Receiver** | Custom Go plugin that polls rippled RPC and emits OTel metrics (Phase 11) | + --- ## 8.2 Span Hierarchy Visualization @@ -162,7 +174,8 @@ flowchart TB | ------- | ---------- | ------ | -------------------------------------------------------------- | | 1.0 | 2026-02-12 | - | Initial implementation plan | | 1.1 | 2026-02-13 | - | Refactored into modular documents | -| 1.2 | 
2026-03-24 | - | Review fixes: accuracy corrections, cross-document consistency | +| 1.2 | 2026-03-09 | - | Added Phases 9–11 (future enhancement plans) | +| 1.3 | 2026-03-24 | - | Review fixes: accuracy corrections, cross-document consistency | --- @@ -197,8 +210,57 @@ flowchart TB | [Phase5_IntegrationTest_taskList.md](./Phase5_IntegrationTest_taskList.md) | Observability stack integration tests | | [Phase7_taskList.md](./Phase7_taskList.md) | Native OTel metrics migration | | [Phase8_taskList.md](./Phase8_taskList.md) | Log-trace correlation | +| [Phase9_taskList.md](./Phase9_taskList.md) | Internal metric instrumentation gap fill (future) | +| [Phase10_taskList.md](./Phase10_taskList.md) | Synthetic workload generation & validation (future) | +| [Phase11_taskList.md](./Phase11_taskList.md) | Third-party data collection pipelines (future) | | [presentation.md](./presentation.md) | Presentation slides for OpenTelemetry plan overview | +> **Note**: Phases 1 and 6 do not have separate task list files. Phase 1 tasks are documented in [06-implementation-phases.md §6.2](./06-implementation-phases.md). Phase 6 tasks are documented in [06-implementation-phases.md §6.7](./06-implementation-phases.md). + +--- + +## 8.6 Phase 9–11 Cross-Reference Guide + +This guide maps Phase 9–11 content to its location across the documentation. 
+ +### Phase 9: Internal Metric Instrumentation Gap Fill + +| Content | Location | +| ------------------------------- | ------------------------------------------------------------------------ | +| Plan & architecture | [06-implementation-phases.md §6.8.2](./06-implementation-phases.md) | +| Task list (10 tasks) | [Phase9_taskList.md](./Phase9_taskList.md) | +| Future metric definitions (~50) | [09-data-collection-reference.md §5b](./09-data-collection-reference.md) | +| New class: `MetricsRegistry` | `src/xrpld/telemetry/MetricsRegistry.h/.cpp` (planned) | +| New dashboards | `rippled-fee-market`, `rippled-job-queue` (planned) | + +**Metric categories**: NodeStore I/O, Cache Hit Rates, TxQ, PerfLog Per-RPC, PerfLog Per-Job, Counted Objects, Fee Escalation & Load Factors. + +### Phase 10: Synthetic Workload Generation & Telemetry Validation + +| Content | Location | +| -------------------- | ------------------------------------------------------------------------ | +| Plan & architecture | [06-implementation-phases.md §6.8.3](./06-implementation-phases.md) | +| Task list (7 tasks) | [Phase10_taskList.md](./Phase10_taskList.md) | +| Validation inventory | [09-data-collection-reference.md §5c](./09-data-collection-reference.md) | +| Test harness | `docker/telemetry/docker-compose.workload.yaml` (planned) | +| CI workflow | `.github/workflows/telemetry-validation.yml` (planned) | + +**Validates**: 16 spans, 22 attributes, 300+ metrics, 10 dashboards, log-trace correlation. 
+ +### Phase 11: Third-Party Data Collection Pipelines + +| Content | Location | +| --------------------------------- | ------------------------------------------------------------------------ | +| Plan & architecture | [06-implementation-phases.md §6.8.4](./06-implementation-phases.md) | +| Task list (11 tasks) | [Phase11_taskList.md](./Phase11_taskList.md) | +| External metric definitions (~30) | [09-data-collection-reference.md §5d](./09-data-collection-reference.md) | +| Custom OTel Collector receiver | `docker/telemetry/otel-rippled-receiver/` (planned) | +| Prometheus alerting rules (11) | [09-data-collection-reference.md §5d](./09-data-collection-reference.md) | +| New dashboards (4) | Validator Health, Network Topology, Fee Market (External), DEX & AMM | + +**Consumer categories**: Exchanges, Payment Processors, DeFi/AMM, NFT Marketplaces, Analytics Providers, Wallets, Compliance, Academic Researchers, Institutional Custody, CBDC Bridge Operators. + --- _Previous: [Observability Backends](./07-observability-backends.md)_ | _Back to: [Overview](./OpenTelemetryPlan.md)_ diff --git a/OpenTelemetryPlan/09-data-collection-reference.md b/OpenTelemetryPlan/09-data-collection-reference.md index 0da5148d4c..e208c38e09 100644 --- a/OpenTelemetryPlan/09-data-collection-reference.md +++ b/OpenTelemetryPlan/09-data-collection-reference.md @@ -11,6 +11,7 @@ graph LR subgraph rippledNode["rippled Node"] A["Trace Macros
XRPL_TRACE_SPAN
(OTLP/HTTP exporter)"] B["beast::insight
OTel native metrics
(OTLP/HTTP exporter)"] + C["MetricsRegistry
OTel SDK metrics
(OTLP/HTTP exporter)"] end subgraph collector["OTel Collector :4317 / :4318"] @@ -32,11 +33,12 @@ graph LR end subgraph viz["Visualization"] - F["Grafana :3000
10 dashboards"] + F["Grafana :3000
13 dashboards"] end A -->|"OTLP/HTTP :4318
(traces + attributes)"| R1 B -->|"OTLP/HTTP :4318
(gauges, counters, histograms)"| R1 + C -->|"OTLP/HTTP :4318
(counters, histograms,
observable gauges)"| R1 BP -->|"OTLP/gRPC :4317"| D @@ -563,6 +565,337 @@ count_over_time({job="rippled"} |= "trace_id=" [5m]) --- +## 5b. Future: Internal Metric Gap Fill (Phase 9) + +> **Status**: Planned, not yet implemented. +> **Plan details**: [06-implementation-phases.md §6.8.2](./06-implementation-phases.md) — motivation, architecture, third-party context +> **Task breakdown**: [Phase9_taskList.md](./Phase9_taskList.md) — per-task implementation details + +Phase 9 fills ~50+ metrics that exist inside rippled but currently lack time-series export. Uses a hybrid approach: `beast::insight` extensions for NodeStore I/O, OTel `ObservableGauge` async callbacks for new categories. + +### New Metric Categories + +#### NodeStore I/O (via beast::insight) + +| Prometheus Metric | Type | Description | +| ------------------------------------ | ----- | ----------------------------------- | +| `rippled_nodestore_reads_total` | Gauge | Cumulative read operations | +| `rippled_nodestore_reads_hit` | Gauge | Cache-served reads | +| `rippled_nodestore_writes` | Gauge | Cumulative write operations | +| `rippled_nodestore_written_bytes` | Gauge | Cumulative bytes written | +| `rippled_nodestore_read_bytes` | Gauge | Cumulative bytes read | +| `rippled_nodestore_read_duration_us` | Gauge | Cumulative read time (microseconds) | +| `rippled_nodestore_write_load` | Gauge | Current write load score | +| `rippled_nodestore_read_queue` | Gauge | Items in read queue | + +#### Cache Hit Rates (via OTel MetricsRegistry) + +| Prometheus Metric | Type | Description | +| ------------------------------- | ----- | ------------------------------------ | +| `rippled_cache_SLE_hit_rate` | Gauge | SLE cache hit rate (0.0-1.0) | +| `rippled_cache_ledger_hit_rate` | Gauge | Ledger object cache hit rate | +| `rippled_cache_AL_hit_rate` | Gauge | AcceptedLedger cache hit rate | +| `rippled_cache_treenode_size` | Gauge | SHAMap TreeNode cache size (entries) | +| `rippled_cache_fullbelow_size` | Gauge 
| FullBelow cache size | + +#### Transaction Queue (via OTel MetricsRegistry) + +| Prometheus Metric | Type | Description | +| -------------------------------------- | ----- | -------------------------------- | +| `rippled_txq_count` | Gauge | Current transactions in queue | +| `rippled_txq_max_size` | Gauge | Maximum queue capacity | +| `rippled_txq_in_ledger` | Gauge | Transactions in open ledger | +| `rippled_txq_per_ledger` | Gauge | Expected transactions per ledger | +| `rippled_txq_open_ledger_fee_level` | Gauge | Open ledger fee escalation level | +| `rippled_txq_med_fee_level` | Gauge | Median fee level in queue | +| `rippled_txq_reference_fee_level` | Gauge | Reference fee level | +| `rippled_txq_min_processing_fee_level` | Gauge | Minimum fee to get processed | + +#### PerfLog Per-RPC Method (via OTel Metrics SDK) + +| Prometheus Metric | Type | Labels | Description | +| --------------------------------------- | --------- | ----------------- | --------------------------- | +| `rippled_rpc_method_started_total` | Counter | `method="<name>"` | RPC calls started | +| `rippled_rpc_method_finished_total` | Counter | `method="<name>"` | RPC calls completed | +| `rippled_rpc_method_errored_total` | Counter | `method="<name>"` | RPC calls errored | +| `rippled_rpc_method_duration_us_bucket` | Histogram | `method="<name>"` | Execution time distribution | + +#### PerfLog Per-Job Type (via OTel Metrics SDK) + +| Prometheus Metric | Type | Labels | Description | +| ---------------------------------------- | --------- | ------------------- | --------------- | +| `rippled_job_queued_total` | Counter | `job_type="<name>"` | Jobs queued | +| `rippled_job_started_total` | Counter | `job_type="<name>"` | Jobs started | +| `rippled_job_finished_total` | Counter | `job_type="<name>"` | Jobs completed | +| `rippled_job_queued_duration_us_bucket` | Histogram | `job_type="<name>"` | Queue wait time | +| `rippled_job_running_duration_us_bucket` | Histogram | `job_type="<name>"` | Execution time | + +#### Counted Object Instances
(via OTel MetricsRegistry) + +| Prometheus Metric | Type | Labels | Description | +| ---------------------- | ----- | --------------- | ------------------------------- | +| `rippled_object_count` | Gauge | `type="<name>"` | Live instances of internal type | + +Tracked types: `Transaction`, `Ledger`, `NodeObject`, `STTx`, `STLedgerEntry`, `InboundLedger`, `Pathfinder`, `PathRequest`, `HashRouterEntry` + +#### Fee Escalation & Load Factors (via OTel MetricsRegistry) + +| Prometheus Metric | Type | Description | +| ------------------------------------ | ----- | ------------------------------------ | +| `rippled_load_factor` | Gauge | Combined transaction cost multiplier | +| `rippled_load_factor_server` | Gauge | Server + cluster + network load | +| `rippled_load_factor_local` | Gauge | Local server load only | +| `rippled_load_factor_net` | Gauge | Network-wide load estimate | +| `rippled_load_factor_cluster` | Gauge | Cluster peer load | +| `rippled_load_factor_fee_escalation` | Gauge | Open ledger fee escalation | +| `rippled_load_factor_fee_queue` | Gauge | Queue entry fee level | + +### New Grafana Dashboards (Phase 9) + +| Dashboard | UID | Data Source | Key Panels | +| ------------------ | -------------------- | ----------- | ----------------------------------------------------------------- | +| Fee Market & TxQ | `rippled-fee-market` | Prometheus | TxQ depth/capacity, fee levels, load factor breakdown, escalation | +| Job Queue Analysis | `rippled-job-queue` | Prometheus | Per-job rates, queue wait times, execution times, queue depth | + +--- + +## 5c. Future: Synthetic Workload Generation & Telemetry Validation (Phase 10) + +> **Status**: Planned, not yet implemented.
+> **Plan details**: [06-implementation-phases.md §6.8.3](./06-implementation-phases.md) — motivation, architecture +> **Task breakdown**: [Phase10_taskList.md](./Phase10_taskList.md) — per-task implementation details + +Phase 10 builds a 5-node validator docker-compose harness with RPC load generators, transaction submitters, and automated validation scripts that verify all spans, metrics, dashboards, and log-trace correlation work end-to-end. Includes a benchmark suite comparing telemetry-ON vs telemetry-OFF overhead. + +### Validated Telemetry Inventory + +| Category | Expected Count | Validation Method | +| ------------------ | -------------- | -------------------------------- | +| Trace spans | 16 | Jaeger/Tempo API query | +| Span attributes | 22 | Per-span attribute assertion | +| StatsD metrics | 255+ | Prometheus query | +| Phase 9 metrics | 50+ | Prometheus query | +| SpanMetrics RED | 4 per span | Prometheus query | +| Grafana dashboards | 10 | Dashboard API "no data" check | +| Log-trace links | Present | Loki query + Tempo reverse check | + +--- + +## 5d. Future: Third-Party Data Collection Pipelines (Phase 11) + +> **Status**: Planned, not yet implemented. +> **Plan details**: [06-implementation-phases.md §6.8.4](./06-implementation-phases.md) — motivation, architecture, consumer gap analysis +> **Task breakdown**: [Phase11_taskList.md](./Phase11_taskList.md) — per-task implementation details + +Phase 11 builds a custom OTel Collector receiver (Go) that polls rippled's admin RPCs and exports `xrpl_*` metrics for external consumers. No rippled code changes. + +### Exported Metrics (via Custom OTel Collector Receiver) + +#### Node Health (from server_info) + +| Prometheus Metric | Type | Description | +| --------------------------------------- | ----- | ----------------------------------------------- | +| `xrpl_server_state` | Gauge | Operating mode (0=disconnected ... 
5=proposing) | +| `xrpl_server_state_duration_seconds` | Gauge | Seconds in current state | +| `xrpl_uptime_seconds` | Gauge | Consecutive seconds running | +| `xrpl_io_latency_ms` | Gauge | I/O subsystem latency | +| `xrpl_amendment_blocked` | Gauge | 1 if amendment-blocked, 0 otherwise | +| `xrpl_peers_count` | Gauge | Connected peers | +| `xrpl_validated_ledger_seq` | Gauge | Latest validated ledger sequence | +| `xrpl_validated_ledger_age_seconds` | Gauge | Seconds since last validated close | +| `xrpl_last_close_proposers` | Gauge | Proposers in last consensus round | +| `xrpl_last_close_converge_time_seconds` | Gauge | Last consensus round duration | +| `xrpl_load_factor` | Gauge | Transaction cost multiplier | +| `xrpl_state_duration_seconds` | Gauge | Per-state duration (`state` label) | +| `xrpl_state_transitions_total` | Gauge | Per-state transition count (`state` label) | + +#### Peer Topology (from peers) + +| Prometheus Metric | Type | Description | +| --------------------------- | ----- | ----------------------------------- | +| `xrpl_peers_inbound_count` | Gauge | Inbound peer connections | +| `xrpl_peers_outbound_count` | Gauge | Outbound peer connections | +| `xrpl_peer_latency_p50_ms` | Gauge | Median peer latency | +| `xrpl_peer_latency_p95_ms` | Gauge | p95 peer latency | +| `xrpl_peer_version_count` | Gauge | Peers per version (`version` label) | +| `xrpl_peer_diverged_count` | Gauge | Peers with diverged tracking status | + +#### Validator & Amendment (from validators, feature) + +| Prometheus Metric | Type | Description | +| ------------------------------------- | ----- | --------------------------------------- | +| `xrpl_trusted_validators_count` | Gauge | UNL validator count | +| `xrpl_amendment_enabled_count` | Gauge | Enabled amendments | +| `xrpl_amendment_majority_count` | Gauge | Amendments with majority | +| `xrpl_amendment_unsupported_majority` | Gauge | 1 if unsupported amendment has majority | +| `xrpl_validator_list_active` | 
Gauge | 1 if validator list is active | + +#### Fee Market (from fee) + +| Prometheus Metric | Type | Description | +| -------------------------------- | ----- | ------------------------------------- | +| `xrpl_fee_open_ledger_fee_drops` | Gauge | Minimum fee for open ledger inclusion | +| `xrpl_fee_median_fee_drops` | Gauge | Median fee level | +| `xrpl_fee_queue_size` | Gauge | Current transaction queue depth | +| `xrpl_fee_current_ledger_size` | Gauge | Transactions in current open ledger | + +#### DEX & AMM (optional, from book_offers, amm_info) + +| Prometheus Metric | Type | Labels | Description | +| -------------------------- | ----- | --------------------- | ---------------------- | +| `xrpl_amm_tvl_drops` | Gauge | `pool=""` | Total value locked | +| `xrpl_amm_trading_fee` | Gauge | `pool=""` | Pool trading fee (bps) | +| `xrpl_orderbook_bid_depth` | Gauge | `pair=""` | Total bid volume | +| `xrpl_orderbook_ask_depth` | Gauge | `pair=""` | Total ask volume | +| `xrpl_orderbook_spread` | Gauge | `pair=""` | Best bid-ask spread | + +### Phase 9: OTel SDK-Exported Metrics (MetricsRegistry) + +Phase 9 introduces the `MetricsRegistry` class (`src/xrpld/telemetry/MetricsRegistry.h/.cpp`) +which registers metrics directly with the OpenTelemetry Metrics SDK. These are exported +via OTLP/HTTP to the OTel Collector and scraped by Prometheus. 
+ +#### NodeStore I/O (Observable Gauge — `nodestore_state`) + +| Prometheus Metric | Type | Labels | Description | +| ------------------------------------------------------ | ----- | -------- | ------------------------------------ | +| `rippled_nodestore_state{metric="node_reads_total"}` | Gauge | `metric` | Cumulative NodeStore read operations | +| `rippled_nodestore_state{metric="node_reads_hit"}` | Gauge | `metric` | Reads served from cache | +| `rippled_nodestore_state{metric="node_writes"}` | Gauge | `metric` | Cumulative write operations | +| `rippled_nodestore_state{metric="node_written_bytes"}` | Gauge | `metric` | Cumulative bytes written | +| `rippled_nodestore_state{metric="node_read_bytes"}` | Gauge | `metric` | Cumulative bytes read | +| `rippled_nodestore_state{metric="write_load"}` | Gauge | `metric` | Current write load score | +| `rippled_nodestore_state{metric="read_queue"}` | Gauge | `metric` | Items in read prefetch queue | + +#### Cache Hit Rates & Sizes (Observable Gauge — `cache_metrics`) + +| Prometheus Metric | Type | Labels | Description | +| ----------------------------------------------------- | ----- | -------- | ----------------------------- | +| `rippled_cache_metrics{metric="SLE_hit_rate"}` | Gauge | `metric` | SLE cache hit rate (0.0-1.0) | +| `rippled_cache_metrics{metric="ledger_hit_rate"}` | Gauge | `metric` | Ledger cache hit rate | +| `rippled_cache_metrics{metric="AL_hit_rate"}` | Gauge | `metric` | AcceptedLedger cache hit rate | +| `rippled_cache_metrics{metric="treenode_cache_size"}` | Gauge | `metric` | SHAMap TreeNode cache entries | +| `rippled_cache_metrics{metric="treenode_track_size"}` | Gauge | `metric` | Tracked tree nodes | +| `rippled_cache_metrics{metric="fullbelow_size"}` | Gauge | `metric` | FullBelow cache entries | + +#### Transaction Queue (Observable Gauge — `txq_metrics`) + +| Prometheus Metric | Type | Labels | Description | +| ------------------------------------------------------------ | ----- | 
-------- | -------------------------------- | +| `rippled_txq_metrics{metric="txq_count"}` | Gauge | `metric` | Transactions currently in queue | +| `rippled_txq_metrics{metric="txq_max_size"}` | Gauge | `metric` | Maximum queue capacity | +| `rippled_txq_metrics{metric="txq_in_ledger"}` | Gauge | `metric` | Transactions in open ledger | +| `rippled_txq_metrics{metric="txq_per_ledger"}` | Gauge | `metric` | Expected transactions per ledger | +| `rippled_txq_metrics{metric="txq_reference_fee_level"}` | Gauge | `metric` | Reference fee level | +| `rippled_txq_metrics{metric="txq_min_processing_fee_level"}` | Gauge | `metric` | Minimum fee to get processed | +| `rippled_txq_metrics{metric="txq_med_fee_level"}` | Gauge | `metric` | Median fee level in queue | +| `rippled_txq_metrics{metric="txq_open_ledger_fee_level"}` | Gauge | `metric` | Open ledger fee escalation level | + +#### Per-RPC Method Metrics (Synchronous Counters/Histogram) + +| Prometheus Metric | Type | Labels | Description | +| ----------------------------------- | --------- | ----------------- | -------------------------------- | +| `rippled_rpc_method_started_total` | Counter | `method=""` | RPC calls started | +| `rippled_rpc_method_finished_total` | Counter | `method=""` | RPC calls completed successfully | +| `rippled_rpc_method_errored_total` | Counter | `method=""` | RPC calls that errored | +| `rippled_rpc_method_duration_us` | Histogram | `method=""` | Execution time distribution (us) | + +#### Per-Job-Type Metrics (Synchronous Counters/Histogram) + +| Prometheus Metric | Type | Labels | Description | +| --------------------------------- | --------- | ------------------- | --------------------------------- | +| `rippled_job_queued_total` | Counter | `job_type=""` | Jobs enqueued | +| `rippled_job_started_total` | Counter | `job_type=""` | Jobs started | +| `rippled_job_finished_total` | Counter | `job_type=""` | Jobs completed | +| `rippled_job_queued_duration_us` | Histogram | `job_type=""` | 
Queue wait time distribution (us) | +| `rippled_job_running_duration_us` | Histogram | `job_type=""` | Execution time distribution (us) | + +#### Counted Object Instances (Observable Gauge — `object_count`) + +| Prometheus Metric | Type | Labels | Description | +| ---------------------------------------------- | ----- | --------------- | ------------------------------ | +| `rippled_object_count{type="Transaction"}` | Gauge | `type=""` | Live Transaction objects | +| `rippled_object_count{type="Ledger"}` | Gauge | `type=""` | Live Ledger objects | +| `rippled_object_count{type="NodeObject"}` | Gauge | `type=""` | Live NodeObject instances | +| `rippled_object_count{type="STTx"}` | Gauge | `type=""` | Serialized transaction objects | +| `rippled_object_count{type="STLedgerEntry"}` | Gauge | `type=""` | Serialized ledger entries | +| `rippled_object_count{type="InboundLedger"}` | Gauge | `type=""` | Ledgers being fetched | +| `rippled_object_count{type="Pathfinder"}` | Gauge | `type=""` | Active pathfinding operations | +| `rippled_object_count{type="PathRequest"}` | Gauge | `type=""` | Active path requests | +| `rippled_object_count{type="HashRouterEntry"}` | Gauge | `type=""` | Hash router entries | + +#### Load Factor Breakdown (Observable Gauge — `load_factor_metrics`) + +| Prometheus Metric | Type | Labels | Description | +| ------------------------------------------------------------------ | ----- | -------- | --------------------------------------- | +| `rippled_load_factor_metrics{metric="load_factor"}` | Gauge | `metric` | Combined transaction cost multiplier | +| `rippled_load_factor_metrics{metric="load_factor_server"}` | Gauge | `metric` | Server + cluster + network contribution | +| `rippled_load_factor_metrics{metric="load_factor_local"}` | Gauge | `metric` | Local server load only | +| `rippled_load_factor_metrics{metric="load_factor_net"}` | Gauge | `metric` | Network-wide load estimate | +| `rippled_load_factor_metrics{metric="load_factor_cluster"}` | 
Gauge | `metric` | Cluster peer load | +| `rippled_load_factor_metrics{metric="load_factor_fee_escalation"}` | Gauge | `metric` | Open ledger fee escalation | +| `rippled_load_factor_metrics{metric="load_factor_fee_queue"}` | Gauge | `metric` | Queue entry fee level | + +#### Prometheus Query Examples (Phase 9) + +```promql +# NodeStore cache hit ratio +rippled_nodestore_state{metric="node_reads_hit"} / rippled_nodestore_state{metric="node_reads_total"} + +# RPC error rate for server_info +rate(rippled_rpc_method_errored_total{method="server_info"}[5m]) + +# Job queue wait time p95 +histogram_quantile(0.95, sum by (le) (rate(rippled_job_queued_duration_us_bucket[5m]))) + +# TxQ utilization ratio (0.0-1.0; multiply by 100 for a percentage) +rippled_txq_metrics{metric="txq_count"} / rippled_txq_metrics{metric="txq_max_size"} + +# High load factor alert candidate +rippled_load_factor_metrics{metric="load_factor"} > 5 +``` + +### New Grafana Dashboards (Phase 9) + +| Dashboard | UID | Data Source | Key Panels | +| ---------------------- | -------------------- | ----------- | --------------------------------------------------------- | +| Fee Market & TxQ | `rippled-fee-market` | Prometheus | TxQ depth/capacity, fee levels, load factor breakdown | +| Job Queue Analysis | `rippled-job-queue` | Prometheus | Per-job rates, queue wait times, execution times | +| RPC Performance (OTel) | `rippled-rpc-perf` | Prometheus | Per-method call rates, error rates, latency distributions | + +### Updated Grafana Dashboards (Phase 9) + +| Dashboard | UID | New Panels Added | +| -------------------- | ---------------------------- | ------------------------------------------------------ | +| Node Health (StatsD) | `rippled-statsd-node-health` | NodeStore I/O, cache hit rates, object instance counts | + +### New Grafana Dashboards (Phase 11) + +| Dashboard | UID | Data Source | Key Panels | +| ------------------ | ----------------------------- | ----------- | 
---------------------------------------------------------------------- | +| Validator Health | `rippled-validator-health` | Prometheus | Server state timeline, proposer count, converge time, amendment voting | +| Network Topology | `rippled-network-topology` | Prometheus | Peer count, version distribution, latency distribution, diverged peers | +| Fee Market (Ext) | `rippled-fee-market-external` | Prometheus | Fee levels, queue depth, load factor breakdown, escalation timeline | +| DEX & AMM Overview | `rippled-dex-amm` | Prometheus | AMM TVL, order book depth, spread trends, trading fee revenue | + +### Prometheus Alerting Rules (Phase 11) + +| Alert Name | Severity | Condition | For | +| ---------------------------------- | -------- | ----------------------------------------------------------- | --- | +| `XRPLServerNotFull` | Critical | `xrpl_server_state < 4` | 15m | +| `XRPLAmendmentBlocked` | Critical | `xrpl_amendment_blocked == 1` | 1m | +| `XRPLNoPeers` | Critical | `xrpl_peers_count == 0` | 5m | +| `XRPLLedgerStale` | Critical | `xrpl_validated_ledger_age_seconds > 120` | 2m | +| `XRPLHighIOLatency` | Critical | `xrpl_io_latency_ms > 100` | 5m | +| `XRPLUnsupportedAmendmentMajority` | Critical | `xrpl_amendment_unsupported_majority == 1` | 1m | +| `XRPLLowPeerCount` | Warning | `xrpl_peers_count < 10` | 15m | +| `XRPLHighLoadFactor` | Warning | `xrpl_load_factor > 10` | 10m | +| `XRPLSlowConsensus` | Warning | `xrpl_last_close_converge_time_seconds > 6` | 5m | +| `XRPLValidatorListExpiring` | Warning | `(xrpl_validator_list_expiration_seconds - time()) < 86400` | 1h | +| `XRPLClockDrift` | Warning | `xrpl_close_time_offset_seconds > 0` | 5m | +| `XRPLStateFlapping` | Warning | `increase(xrpl_state_transitions_total{state="full"}[1h]) > 2` | 30m | + +--- + ## 6. 
Known Issues | Issue | Impact | Status | diff --git a/OpenTelemetryPlan/Phase10_taskList.md b/OpenTelemetryPlan/Phase10_taskList.md new file mode 100644 index 0000000000..80a3603ffc --- /dev/null +++ b/OpenTelemetryPlan/Phase10_taskList.md @@ -0,0 +1,242 @@ +# Phase 10: Synthetic Workload Generation & Telemetry Validation — Task List + +> **Status**: Future Enhancement +> +> **Goal**: Build tools that generate realistic XRPL traffic to validate the full Phases 1-9 telemetry stack end-to-end — all spans, attributes, metrics, dashboards, and log-trace correlation — under controlled load. +> +> **Scope**: Python/shell test harness + multi-node docker-compose environment + automated validation scripts + performance benchmarks. +> +> **Branch**: `pratik/otel-phase10-workload-validation` (from `pratik/otel-phase9-metric-gap-fill`) +> +> **Depends on**: Phase 9 (internal metric gap fill) — validates the full metric surface + +### Related Plan Documents + +| Document | Relevance | +| -------------------------------------------------------------------- | --------------------------------------------------------------- | +| [06-implementation-phases.md](./06-implementation-phases.md) | Phase 10 plan: motivation, architecture, exit criteria (§6.8.3) | +| [09-data-collection-reference.md](./09-data-collection-reference.md) | Defines the full inventory of spans/metrics to validate | +| [Phase9_taskList.md](./Phase9_taskList.md) | Prerequisite — all internal metrics must be emitting | + +### Why This Phase Exists + +Before Phases 1-9 can be considered production-ready, we need proof that: + +1. All 16 spans fire with correct attributes under real transaction workloads +2. All 255+ StatsD metrics + ~50 Phase 9 metrics appear in Prometheus with non-zero values +3. Log-trace correlation (Phase 8) produces clickable trace_id links in Loki +4. All 10 Grafana dashboards render meaningful data (no empty panels) +5. Performance overhead stays within bounds (< 3% CPU, < 5MB memory) +6. 
The telemetry stack survives sustained load without data loss or queue backpressure + +--- + +## Task 10.1: Multi-Node Test Harness + +**Objective**: Create a docker-compose environment with 5 validator nodes that produces real consensus rounds. + +**What to do**: + +- Create `docker/telemetry/docker-compose.workload.yaml`: + - 5 rippled validator nodes with UNL configured for each other + - All telemetry enabled: `[telemetry] enabled=1`, `[insight] server=otel` + - Full OTel stack: Collector, Jaeger, Tempo, Prometheus, Loki, Grafana + - Shared network with service discovery + +- Each node should: + - Generate validator keys at startup + - Configure all 5 nodes in its UNL + - Enable all trace categories including `trace_peer=1` + - Write logs to a file tailed by the OTel Collector filelog receiver + +- Include a `Makefile` target: `make telemetry-workload-up` / `make telemetry-workload-down` + +**Key files**: + +- New: `docker/telemetry/docker-compose.workload.yaml` +- New: `docker/telemetry/workload/generate-validator-keys.sh` +- New: `docker/telemetry/workload/xrpld-validator.cfg.template` + +--- + +## Task 10.2: RPC Load Generator + +**Objective**: Configurable tool that fires all traced RPC commands at controlled rates. + +**What to do**: + +- Create `docker/telemetry/workload/rpc_load_generator.py`: + - Connects to one or more rippled WebSocket endpoints + - Fires all RPC commands that have trace spans: `server_info`, `ledger`, `tx`, `account_info`, `account_lines`, `fee`, `submit`, etc. 
+ - Configurable parameters: rate (RPS), duration, command distribution weights + - Injects `traceparent` HTTP headers to test W3C context propagation + - Logs progress and errors to stdout + +- Command distribution should match realistic production ratios: + - 40% `server_info` / `fee` (health checks) + - 30% `account_info` / `account_lines` / `account_objects` (wallet queries) + - 15% `ledger` / `ledger_data` (explorer queries) + - 10% `tx` / `account_tx` (transaction lookups) + - 5% `book_offers` / `amm_info` (DEX queries) + +**Key files**: + +- New: `docker/telemetry/workload/rpc_load_generator.py` +- New: `docker/telemetry/workload/requirements.txt` + +--- + +## Task 10.3: Transaction Submitter + +**Objective**: Generate diverse transaction types to exercise `tx.*` and `ledger.*` spans. + +**What to do**: + +- Create `docker/telemetry/workload/tx_submitter.py`: + - Pre-funds test accounts from genesis account + - Submits a mix of transaction types: + - `Payment` (XRP and issued currencies) — exercises `tx.process`, `tx.apply` + - `OfferCreate` / `OfferCancel` — DEX activity + - `TrustSet` — trust line creation for issued currencies + - `NFTokenMint` / `NFTokenCreateOffer` / `NFTokenAcceptOffer` — NFT activity + - `EscrowCreate` / `EscrowFinish` — escrow lifecycle + - `AMMCreate` / `AMMDeposit` / `AMMWithdraw` — AMM pool operations (if amendment enabled) + - Configurable: TPS target, transaction mix weights, duration + - Monitors submission results and tracks success/failure rates + +- The transaction mix ensures the telemetry captures the full range of ledger activity that third parties care about. + +**Key files**: + +- New: `docker/telemetry/workload/tx_submitter.py` +- New: `docker/telemetry/workload/test_accounts.json` (pre-generated keypairs) + +--- + +## Task 10.4: Telemetry Validation Suite + +**Objective**: Automated scripts that verify all expected telemetry data exists after a workload run. 
+ +**What to do**: + +- Create `docker/telemetry/workload/validate_telemetry.py`: + + **Span validation** (queries Jaeger/Tempo API): + - Assert all 16 span names appear in traces + - Assert each span has its required attributes (22 total attributes across spans) + - Assert parent-child relationships are correct (`rpc.request` → `rpc.process` → `rpc.command.*`) + - Assert span durations are reasonable (> 0, < 60s) + + **Metric validation** (queries Prometheus API): + - Assert all SpanMetrics-derived metrics are non-zero: `traces_span_metrics_calls_total`, `traces_span_metrics_duration_milliseconds_bucket` + - Assert all StatsD metrics are non-zero: `rippled_LedgerMaster_Validated_Ledger_Age`, `rippled_Peer_Finder_Active_*`, etc. + - Assert all Phase 9 metrics are non-zero: `rippled_nodestore_*`, `rippled_cache_*`, `rippled_txq_*`, `rippled_rpc_method_*`, `rippled_object_count`, `rippled_load_factor*` + - Assert metric label cardinality is within bounds + + **Log-trace correlation validation** (queries Loki API): + - Assert logs contain `trace_id=` and `span_id=` fields + - Pick a random trace_id from Jaeger → query Loki for matching logs → assert results exist + - Assert Grafana derived field links are functional + + **Dashboard validation**: + - For each of the 10 Grafana dashboards, query the dashboard API and assert no panels show "No data" + +- Output: JSON report with pass/fail per check, suitable for CI. + +**Key files**: + +- New: `docker/telemetry/workload/validate_telemetry.py` +- New: `docker/telemetry/workload/expected_spans.json` (span inventory for validation) +- New: `docker/telemetry/workload/expected_metrics.json` (metric inventory for validation) + +--- + +## Task 10.5: Performance Benchmark Suite + +**Objective**: Measure CPU/memory/latency overhead of the telemetry stack. 
+ +**What to do**: + +- Create `docker/telemetry/workload/benchmark.sh`: + - **Baseline run**: Start cluster with `[telemetry] enabled=0`, run transaction workload for 5 minutes, record metrics + - **Telemetry run**: Start cluster with full telemetry enabled, run identical workload, record metrics + - **Comparison**: Calculate deltas for: + - CPU usage (per-node average) + - Memory RSS (per-node peak) + - RPC p99 latency + - Transaction throughput (TPS) + - Consensus round time p95 + - Ledger close time p95 + +- Output: Markdown table comparing baseline vs. telemetry, with pass/fail against targets: + - CPU overhead < 3% + - Memory overhead < 5MB + - RPC latency impact < 2ms p99 + - Throughput impact < 5% + - Consensus impact < 1% + +- Store results in `docker/telemetry/workload/benchmark-results/` for historical tracking. + +**Key files**: + +- New: `docker/telemetry/workload/benchmark.sh` +- New: `docker/telemetry/workload/collect_system_metrics.sh` + +--- + +## Task 10.6: CI Integration + +**Objective**: Wire the validation suite into CI for regression detection. + +**What to do**: + +- Create a CI workflow (GitHub Actions or equivalent) that: + 1. Builds rippled with `-DXRPL_ENABLE_TELEMETRY=ON` + 2. Starts the multi-node workload harness + 3. Runs the RPC load generator + transaction submitter for 2 minutes + 4. Runs the validation suite + 5. Runs the benchmark suite + 6. Fails the build if any validation check fails or benchmark exceeds thresholds + 7. Archives the validation report and benchmark results as artifacts + +- This should be a separate workflow (not part of the main CI), triggered manually or on telemetry-related branch changes. + +**Key files**: + +- New: `.github/workflows/telemetry-validation.yml` +- New: `docker/telemetry/workload/run-full-validation.sh` (orchestrator script) + +--- + +## Task 10.7: Documentation + +**Objective**: Document the workload tools and validation process. 
+ +**What to do**: + +- Create `docker/telemetry/workload/README.md`: + - Quick start guide for running workload harness + - Configuration options for load generator and tx submitter + - How to read validation reports + - How to run benchmarks and interpret results + +- Update `docs/telemetry-runbook.md`: + - Add "Validating Telemetry Stack" section + - Add "Performance Benchmarking" section + +- Update `OpenTelemetryPlan/09-data-collection-reference.md`: + - Add "Validation" section with expected metric/span counts + +--- + +## Exit Criteria + +- [ ] 5-node validator cluster starts and reaches consensus in docker-compose +- [ ] RPC load generator fires all traced RPC commands at configurable rates +- [ ] Transaction submitter generates 6+ transaction types at configurable TPS +- [ ] Validation suite confirms all 16 spans, 22 attributes, 300+ metrics are present +- [ ] Log-trace correlation validated end-to-end (Loki ↔ Tempo) +- [ ] All 10 Grafana dashboards render data (no empty panels) +- [ ] Benchmark shows < 3% CPU overhead, < 5MB memory overhead +- [ ] CI workflow runs validation on telemetry branch changes +- [ ] Validation report output is CI-parseable (JSON with exit codes) diff --git a/OpenTelemetryPlan/Phase11_taskList.md b/OpenTelemetryPlan/Phase11_taskList.md new file mode 100644 index 0000000000..7743950cda --- /dev/null +++ b/OpenTelemetryPlan/Phase11_taskList.md @@ -0,0 +1,453 @@ +# Phase 11: Third-Party Data Collection Pipelines — Task List + +> **Status**: Future Enhancement +> +> **Goal**: Build a custom OTel Collector receiver that periodically polls rippled's admin RPCs and exports structured metrics for external consumers — making all XRPL health, validator, peer, fee, and DEX data available as Prometheus/OTLP metrics without rippled code changes. +> +> **Scope**: Go-based OTel Collector receiver plugin + Grafana dashboards + Prometheus alerting rules. 
+> +> **Branch**: `pratik/otel-phase11-third-party-collection` (from `pratik/otel-phase10-workload-validation`) +> +> **Depends on**: Phase 10 (validation harness for testing the new receiver) + +### Related Plan Documents + +| Document | Relevance | +| -------------------------------------------------------------------- | --------------------------------------------------------------- | +| [06-implementation-phases.md](./06-implementation-phases.md) | Phase 11 plan: motivation, architecture, exit criteria (§6.8.4) | +| [09-data-collection-reference.md](./09-data-collection-reference.md) | Defines full metric inventory including third-party metrics | +| [Phase10_taskList.md](./Phase10_taskList.md) | Prerequisite — validation harness for testing | + +### Third-Party Consumer Gap Analysis + +This phase addresses the cross-cutting gap identified during research: **rippled has no native Prometheus/OTLP metrics export for data accessible only via RPC**. Every consumer (exchanges, payment processors, analytics providers, validators, researchers, compliance firms, custodians) must build custom JSON-RPC polling and conversion. This receiver centralizes that work. 
+ +| Consumer Category | Data Unlocked by This Phase | +| -------------------------- | ------------------------------------------------------------------ | +| **Exchanges** | Real-time fee estimates, TxQ capacity, server health scores | +| **Payment Processors** | Settlement latency percentiles, corridor health, path availability | +| **Analytics Providers** | Validator metrics, network topology, amendment voting status | +| **DeFi / AMM** | AMM pool TVL, DEX order book depth, trade volumes | +| **Validators / Operators** | Per-peer latency, version distribution, UNL health, alerting | +| **Compliance** | Transaction volume trends, network growth metrics | +| **Academic Researchers** | Consensus performance time-series, decentralization metrics | +| **CBDC / Tokenization** | Token supply tracking, trust line adoption, freeze status | +| **Institutional Custody** | Multi-sig status, escrow tracking, reserve calculations | +| **Wallet Providers** | Server health for node selection, fee prediction data | + +--- + +## Task 11.1: OTel Collector Receiver Scaffold + +**Objective**: Create the Go project structure for a custom OTel Collector receiver that polls rippled JSON-RPC. 
+ +**What to do**: + +- Create `docker/telemetry/otel-rippled-receiver/`: + - `receiver.go` — implements `receiver.Metrics` interface + - `config.go` — configuration struct (endpoint, poll interval, enabled RPCs) + - `factory.go` — receiver factory registration + - `go.mod` / `go.sum` — Go module with OTel Collector SDK dependency + +- Configuration model: + + ```yaml + rippled_receiver: + endpoint: "http://localhost:5005" # rippled admin RPC + poll_interval: 30s # how often to poll + enabled_collectors: + - server_info + - get_counts + - fee + - peers + - validators + - feature + - server_state + amm_pools: [] # optional: AMM pool IDs to track + book_offers_pairs: [] # optional: currency pairs for DEX depth + ``` + +- Build a custom OTel Collector binary that includes this receiver alongside the standard receivers. + +**Key files**: + +- New: `docker/telemetry/otel-rippled-receiver/receiver.go` +- New: `docker/telemetry/otel-rippled-receiver/config.go` +- New: `docker/telemetry/otel-rippled-receiver/factory.go` +- New: `docker/telemetry/otel-rippled-receiver/go.mod` +- New: `docker/telemetry/otel-rippled-receiver/Dockerfile` + +--- + +## Task 11.2: server_info / server_state Collector + +**Objective**: Poll `server_info` and `server_state` and export all fields as OTel metrics. 
+ +**What to do**: + +- Implement `serverInfoCollector` that calls `server_info` (admin) and extracts: + + **Node Health Gauges:** + - `xrpl_server_state` (enum → int: disconnected=0, connected=1, syncing=2, tracking=3, full=4, proposing=5; note: `server_info` can also report `validating`, which this mapping does not yet cover — TODO assign it a value) + - `xrpl_server_state_duration_seconds` + - `xrpl_uptime_seconds` + - `xrpl_io_latency_ms` + - `xrpl_amendment_blocked` (0 or 1) + - `xrpl_peers_count` + - `xrpl_peer_disconnects_total` + - `xrpl_peer_disconnects_resources_total` + - `xrpl_jq_trans_overflow_total` + + **Consensus Gauges:** + - `xrpl_last_close_proposers` + - `xrpl_last_close_converge_time_seconds` + - `xrpl_validation_quorum` + + **Ledger Gauges:** + - `xrpl_validated_ledger_seq` + - `xrpl_validated_ledger_age_seconds` + - `xrpl_validated_ledger_base_fee_drops` + - `xrpl_validated_ledger_reserve_base_drops` + - `xrpl_validated_ledger_reserve_inc_drops` + - `xrpl_close_time_offset_seconds` (0 when absent) + + **Load Factor Gauges:** + - `xrpl_load_factor` + - `xrpl_load_factor_server` + - `xrpl_load_factor_fee_escalation` + - `xrpl_load_factor_fee_queue` + - `xrpl_load_factor_local` + - `xrpl_load_factor_net` + - `xrpl_load_factor_cluster` + + **State Accounting Gauges** (per state: disconnected, connected, syncing, tracking, full): + - `xrpl_state_duration_seconds{state="<state>"}` + - `xrpl_state_transitions_total{state="<state>"}` + + **Validator Info** (when node is a validator): + - `xrpl_validator_list_count` + - `xrpl_validator_list_expiration_seconds` (epoch) + - `xrpl_validator_list_active` (0 or 1) + +**Key files**: + +- New: `docker/telemetry/otel-rippled-receiver/collectors/server_info.go` + +--- + +## Task 11.3: get_counts Collector + +**Objective**: Poll `get_counts` and export internal object counts and NodeStore stats. 
+ +**What to do**: + +- Implement `getCountsCollector`: + + **Database Gauges:** + - `xrpl_db_size_kb{db="total"}`, `xrpl_db_size_kb{db="ledger"}`, `xrpl_db_size_kb{db="transaction"}` + + **NodeStore Gauges:** + - `xrpl_nodestore_reads_total`, `xrpl_nodestore_reads_hit`, `xrpl_nodestore_writes_total` + - `xrpl_nodestore_read_bytes`, `xrpl_nodestore_written_bytes` + - `xrpl_nodestore_read_duration_us`, `xrpl_nodestore_write_load` + - `xrpl_nodestore_read_queue`, `xrpl_nodestore_read_threads_running` + + **Cache Gauges:** + - `xrpl_cache_hit_rate{cache="SLE"}`, `xrpl_cache_hit_rate{cache="ledger"}`, `xrpl_cache_hit_rate{cache="accepted_ledger"}` + - `xrpl_cache_size{cache="treenode"}`, `xrpl_cache_size{cache="fullbelow"}`, `xrpl_cache_size{cache="accepted_ledger"}` + + **Object Count Gauges:** + - `xrpl_object_count{type=""}` for each counted object type (Transaction, Ledger, NodeObject, STTx, STLedgerEntry, InboundLedger, Pathfinder, etc.) + + **Rates:** + - `xrpl_historical_fetch_per_minute` + - `xrpl_local_txs` + +**Key files**: + +- New: `docker/telemetry/otel-rippled-receiver/collectors/get_counts.go` + +--- + +## Task 11.4: Peer Topology Collector + +**Objective**: Poll `peers` and export per-peer and aggregate network metrics. 
+ +**What to do**: + +- Implement `peersCollector`: + + **Aggregate Gauges:** + - `xrpl_peers_inbound_count` + - `xrpl_peers_outbound_count` + - `xrpl_peers_cluster_count` + + **Per-Peer Gauges** (with labels `peer_key` truncated to 8 chars for cardinality control): + - `xrpl_peer_latency_ms{peer="", version="", inbound=""}` + - `xrpl_peer_uptime_seconds{peer=""}` + - `xrpl_peer_load{peer=""}` + + **Distribution Gauges** (aggregated across all peers): + - `xrpl_peer_latency_p50_ms`, `xrpl_peer_latency_p95_ms`, `xrpl_peer_latency_p99_ms` + - `xrpl_peer_version_count{version=""}` — count of peers per software version + + **Tracking Status:** + - `xrpl_peer_diverged_count` — peers with `track=diverged` + - `xrpl_peer_unknown_count` — peers with `track=unknown` + +**Key files**: + +- New: `docker/telemetry/otel-rippled-receiver/collectors/peers.go` + +**Cardinality note**: Per-peer metrics use truncated keys. For large peer sets (50+), the aggregate distribution gauges are preferred over per-peer labels. + +--- + +## Task 11.5: Validator & Amendment Collector + +**Objective**: Poll `validators` and `feature` to export validator health and amendment voting status. 
+ +**What to do**: + +- Implement `validatorCollector`: + + **From `validators` RPC:** + - `xrpl_trusted_validators_count` + - `xrpl_validator_signing` (0 or 1 — whether local validator is signing) + + **From `feature` RPC:** + - `xrpl_amendment_enabled_count` — total enabled amendments + - `xrpl_amendment_majority_count` — amendments with majority but not yet enabled + - `xrpl_amendment_vetoed_count` — locally vetoed amendments + - `xrpl_amendment_unsupported_majority` (0 or 1) — any unsupported amendment has majority (critical alert) + + **Per-amendment with majority** (limited cardinality — only amendments with `majority` set): + - `xrpl_amendment_majority_time{name=""}` — epoch time when majority was gained + - `xrpl_amendment_votes{name=""}` — current vote count + - `xrpl_amendment_threshold{name=""}` — votes needed + +**Key files**: + +- New: `docker/telemetry/otel-rippled-receiver/collectors/validators.go` + +--- + +## Task 11.6: Fee & TxQ Collector + +**Objective**: Poll `fee` RPC and export real-time fee market data. + +**What to do**: + +- Implement `feeCollector` that calls the public `fee` RPC: + + **Fee Level Gauges:** + - `xrpl_fee_current_ledger_size` — transactions in current open ledger + - `xrpl_fee_expected_ledger_size` — expected transactions at close + - `xrpl_fee_max_queue_size` — maximum transaction queue size + - `xrpl_fee_open_ledger_fee_drops` — minimum fee for open ledger inclusion + - `xrpl_fee_median_fee_drops` — median fee level + - `xrpl_fee_minimum_fee_drops` — base reference fee + - `xrpl_fee_queue_size` — current queue depth + +- This overlaps with Phase 9's internal TxQ metrics but provides an external-only collection path that doesn't require rippled code changes. + +**Key files**: + +- New: `docker/telemetry/otel-rippled-receiver/collectors/fee.go` + +--- + +## Task 11.7: DEX & AMM Collector (Optional) + +**Objective**: Periodically poll configured AMM pools and order book pairs for DeFi metrics. 
+ +**What to do**: + +- Implement `dexCollector` (enabled only when `amm_pools` or `book_offers_pairs` are configured): + + **AMM Pool Gauges** (per configured pool): + - `xrpl_amm_reserve{pool="<pool>", asset="<asset>"}` — pool reserve amount + - `xrpl_amm_lp_token_supply{pool="<pool>"}` — outstanding LP tokens + - `xrpl_amm_trading_fee{pool="<pool>"}` — pool trading fee (basis points) + - `xrpl_amm_tvl_drops{pool="<pool>"}` — total value locked (XRP-denominated) + + **Order Book Gauges** (per configured pair): + - `xrpl_orderbook_bid_depth{pair="<base>/<quote>"}` — total bid volume + - `xrpl_orderbook_ask_depth{pair="<base>/<quote>"}` — total ask volume + - `xrpl_orderbook_spread{pair="<base>/<quote>"}` — best bid-ask spread + - `xrpl_orderbook_offer_count{pair="<base>/<quote>", side="bid|ask"}` — number of offers + +**Key files**: + +- New: `docker/telemetry/otel-rippled-receiver/collectors/dex.go` + +**Note**: This is optional because it requires explicit configuration of which pools/pairs to track. Default configuration tracks no DEX data. + +--- + +## Task 11.8: Prometheus Alerting Rules + +**Objective**: Create production-ready alerting rules for the metrics exported by this receiver. 
+ +**What to do**: + +- Create `docker/telemetry/prometheus/rippled-alerts.yml`: + + **Tier 1 — Critical (page immediately):** + + ```yaml + - alert: XRPLServerNotFull + expr: xrpl_server_state < 4 + for: 15m + + - alert: XRPLAmendmentBlocked + expr: xrpl_amendment_blocked == 1 + for: 1m + + - alert: XRPLNoPeers + expr: xrpl_peers_count == 0 + for: 5m + + - alert: XRPLLedgerStale + expr: xrpl_validated_ledger_age_seconds > 120 + for: 2m + + - alert: XRPLHighIOLatency + expr: xrpl_io_latency_ms > 100 + for: 5m + + - alert: XRPLUnsupportedAmendmentMajority + expr: xrpl_amendment_unsupported_majority == 1 + for: 1m + ``` + + **Tier 2 — Warning (investigate within hours):** + + ```yaml + - alert: XRPLLowPeerCount + expr: xrpl_peers_count < 10 + for: 15m + + - alert: XRPLHighLoadFactor + expr: xrpl_load_factor > 10 + for: 10m + + - alert: XRPLSlowConsensus + expr: xrpl_last_close_converge_time_seconds > 6 + for: 5m + + - alert: XRPLValidatorListExpiring + expr: (xrpl_validator_list_expiration_seconds - time()) < 86400 + for: 1h + + - alert: XRPLClockDrift + expr: xrpl_close_time_offset_seconds > 0 + for: 5m + + - alert: XRPLStateFlapping + expr: increase(xrpl_state_transitions_total{state="full"}[1h]) > 2 + for: 30m + ``` + +**Key files**: + +- New: `docker/telemetry/prometheus/rippled-alerts.yml` +- Update: `docker/telemetry/prometheus/prometheus.yml` (add rule_files reference) + +--- + +## Task 11.9: New Grafana Dashboards + +**Objective**: Create 4 new dashboards for the data exported by the receiver. 
+ +**What to do**: + +- **Validator Health** (`rippled-validator-health`): + - Server state timeline, state duration breakdown + - Proposer count trend, converge time trend, validation quorum + - Validator list expiration countdown + - Amendment voting status (majority/enabled/vetoed) + +- **Network Topology** (`rippled-network-topology`): + - Peer count (inbound/outbound/cluster), peer version distribution + - Peer latency distribution (p50/p95/p99), diverged peer count + - Geographic distribution (if enriched with GeoIP) + - Peer uptime distribution + +- **Fee Market** (`rippled-fee-market-external`): + - Current fee levels (open ledger, median, minimum), fee escalation timeline + - Queue depth vs. capacity, transactions per ledger + - Load factor breakdown (server/network/cluster/escalation) + +- **DEX & AMM Overview** (`rippled-dex-amm`) (only populated when DEX collectors are configured): + - AMM pool TVL, reserve ratios, LP token supply + - Order book depth per pair, spread trends + - Trading fee revenue estimates + +**Key files**: + +- New: `docker/telemetry/grafana/dashboards/rippled-validator-health.json` +- New: `docker/telemetry/grafana/dashboards/rippled-network-topology.json` +- New: `docker/telemetry/grafana/dashboards/rippled-fee-market-external.json` +- New: `docker/telemetry/grafana/dashboards/rippled-dex-amm.json` + +--- + +## Task 11.10: Integration with Phase 10 Validation + +**Objective**: Extend the Phase 10 validation suite to verify this receiver's metrics. 
+ +**What to do**: + +- Update `docker/telemetry/workload/validate_telemetry.py`: + - Add assertions for all `xrpl_*` metrics produced by the receiver + - Verify metric labels have expected values + - Verify alerting rules fire correctly (inject a "bad" state and check alert) + +- Update `docker/telemetry/docker-compose.workload.yaml`: + - Add the custom OTel Collector build with the rippled receiver + - Configure the receiver to poll one of the test nodes + +**Key files**: + +- Update: `docker/telemetry/workload/validate_telemetry.py` +- Update: `docker/telemetry/docker-compose.workload.yaml` +- Update: `docker/telemetry/workload/expected_metrics.json` + +--- + +## Task 11.11: Documentation + +**Objective**: Document the receiver, its metrics, deployment, and alerting. + +**What to do**: + +- Create `docker/telemetry/otel-rippled-receiver/README.md`: + - Architecture overview (how the receiver fits into the OTel Collector) + - Configuration reference (all config options with defaults) + - Metric reference table (all exported metrics with types and labels) + - Deployment guide (building custom collector binary, docker-compose integration) + +- Update `OpenTelemetryPlan/09-data-collection-reference.md`: + - Add "Third-Party Metrics (OTel Collector Receiver)" section + - Add new Grafana dashboard reference (4 dashboards) + - Add alerting rules reference + +- Update `docs/telemetry-runbook.md`: + - Add "Third-Party Metrics Receiver" troubleshooting section + - Add alerting playbook (what to do for each Tier 1/Tier 2 alert) + +--- + +## Exit Criteria + +- [ ] Custom OTel Collector receiver builds and starts without errors +- [ ] All `xrpl_*` metrics from server_info, get_counts, peers, validators, fee appear in Prometheus +- [ ] Metrics update at configured poll interval (default 30s) +- [ ] 4 new Grafana dashboards operational with data +- [ ] Prometheus alerting rules fire correctly for simulated failure conditions +- [ ] DEX/AMM collector works when configured 
(optional — not required for base exit criteria) +- [ ] Phase 10 validation suite passes with receiver metrics included +- [ ] Receiver handles rippled restart/unavailability gracefully (no crash, logs warning, retries) +- [ ] Documentation complete: receiver README, metric reference, alerting playbook +- [ ] Go receiver has unit tests with >80% coverage diff --git a/OpenTelemetryPlan/Phase9_taskList.md b/OpenTelemetryPlan/Phase9_taskList.md new file mode 100644 index 0000000000..1b383592f9 --- /dev/null +++ b/OpenTelemetryPlan/Phase9_taskList.md @@ -0,0 +1,312 @@ +# Phase 9: Internal Metric Instrumentation Gap Fill — Task List + +> **Status**: Future Enhancement +> +> **Goal**: Instrument rippled to emit ~50+ metrics that exist in `get_counts`/`server_info`/TxQ/PerfLog but currently lack time-series export via the OTel or beast::insight pipelines. +> +> **Scope**: Hybrid approach — extend `beast::insight` for metrics near existing registrations, use OTel Metrics SDK `ObservableGauge` callbacks for new categories (TxQ, PerfLog, CountedObjects). 
+> +> **Branch**: `pratik/otel-phase9-metric-gap-fill` (from `pratik/otel-phase8-log-correlation`) +> +> **Depends on**: Phase 7 (native OTel metrics pipeline) and Phase 8 (log-trace correlation) + +### Related Plan Documents + +| Document | Relevance | +| -------------------------------------------------------------------- | -------------------------------------------------------------- | +| [06-implementation-phases.md](./06-implementation-phases.md) | Phase 9 plan: motivation, architecture, exit criteria (§6.8.2) | +| [09-data-collection-reference.md](./09-data-collection-reference.md) | Current metric inventory + future metrics section | +| [Phase7_taskList.md](./Phase7_taskList.md) | Prerequisite — OTel Metrics SDK and `OTelCollector` class | +| [Phase8_taskList.md](./Phase8_taskList.md) | Prerequisite — log-trace correlation | + +### Third-Party Consumer Context + +These metrics serve multiple external consumer categories identified during research: + +| Consumer Category | Key Metrics They Need | +| ------------------------- | --------------------------------------------------------------- | +| **Exchanges** | Fee escalation levels, TxQ depth, settlement latency | +| **Payment Processors** | Load factors, io_latency, transaction throughput | +| **Analytics Providers** | NodeStore I/O, cache hit rates, counted objects | +| **Validators/Operators** | Per-job execution times, PerfLog RPC counters, consensus timing | +| **Academic Researchers** | Consensus performance time-series, fee market dynamics | +| **Institutional Custody** | Server health scores, reserve calculations, node availability | + +--- + +## Task 9.1: NodeStore I/O Metrics + +**Objective**: Export node store read/write performance as time-series metrics. 
+ +**What to do**: + +- In `src/libxrpl/nodestore/Database.cpp`, extend existing `beast::insight` registrations to add: + - Gauge: `node_reads_total` (cumulative read operations) + - Gauge: `node_reads_hit` (cache-served reads) + - Gauge: `node_writes` (cumulative write operations) + - Gauge: `node_written_bytes` (cumulative bytes written) + - Gauge: `node_read_bytes` (cumulative bytes read) + - Gauge: `node_reads_duration_us` (cumulative read time in microseconds) + - Gauge: `write_load` (current write load score) + - Gauge: `read_queue` (items in read queue) + +- These values are already computed in `Database::getCountsJson()` (line ~236). Wire the same counters to `beast::insight` hooks. + +**Key modified files**: + +- `src/libxrpl/nodestore/Database.cpp` +- `src/libxrpl/nodestore/Database.h` (add insight members) + +**Derived Prometheus metrics**: `rippled_nodestore_reads_total`, `rippled_nodestore_reads_hit`, `rippled_nodestore_write_load`, etc. + +**Grafana dashboard**: Add "NodeStore I/O" panel group to _Node Health_ dashboard. + +--- + +## Task 9.2: Cache Hit Rate Metrics + +**Objective**: Export SHAMap and ledger cache performance as time-series gauges. + +**What to do**: + +- Register OTel `ObservableGauge` callbacks (via Phase 7's `OTelCollector`) for: + - `SLE_hit_rate` — SLE cache hit rate (0.0–1.0) + - `ledger_hit_rate` — Ledger object cache hit rate + - `AL_hit_rate` — AcceptedLedger cache hit rate + - `treenode_cache_size` — SHAMap TreeNode cache size (entries) + - `treenode_track_size` — Tracked tree nodes + - `fullbelow_size` — FullBelow cache size + +- The callback should read from the same sources as `GetCounts.cpp` handler (line ~43). + +- Create a centralized `MetricsRegistry` class that holds all OTel async gauge registrations, polled at 10-second intervals by the `PeriodicMetricReader`. 
+ +**Key modified files**: + +- New: `src/xrpld/telemetry/MetricsRegistry.h` / `.cpp` +- `src/xrpld/rpc/handlers/GetCounts.cpp` (extract shared access methods) +- `src/xrpld/app/main/Application.cpp` (register MetricsRegistry at startup) + +**Derived Prometheus metrics**: `rippled_cache_SLE_hit_rate`, `rippled_cache_ledger_hit_rate`, `rippled_cache_treenode_size`, etc. + +--- + +## Task 9.3: Transaction Queue (TxQ) Metrics + +**Objective**: Export TxQ depth, capacity, and fee escalation levels as time-series. + +**What to do**: + +- Register OTel `ObservableGauge` callbacks for TxQ state (from `TxQ.h` line ~143): + - `txq_count` — Current transactions in queue + - `txq_max_size` — Maximum queue capacity + - `txq_in_ledger` — Transactions in current open ledger + - `txq_per_ledger` — Expected transactions per ledger + - `txq_reference_fee_level` — Reference fee level + - `txq_min_processing_fee_level` — Minimum fee to get processed + - `txq_med_fee_level` — Median fee level in queue + - `txq_open_ledger_fee_level` — Open ledger fee escalation level + +- Add to the `MetricsRegistry` (Task 9.2). + +**Key modified files**: + +- `src/xrpld/telemetry/MetricsRegistry.cpp` (add TxQ callbacks) +- `src/xrpld/app/tx/detail/TxQ.h` (expose metrics accessor if needed) + +**Derived Prometheus metrics**: `rippled_txq_count`, `rippled_txq_max_size`, `rippled_txq_open_ledger_fee_level`, etc. + +**Grafana dashboard**: New _Fee Market & TxQ_ dashboard (`rippled-fee-market`). + +--- + +## Task 9.4: PerfLog Per-RPC Method Metrics + +**Objective**: Export per-RPC-method call counts and latency as OTel metrics. 
+ +**What to do**: + +- Register OTel instruments for PerfLog RPC counters (from `PerfLogImp.cpp` line ~63): + - Counter: `rpc_method_started_total{method=""}` — calls started + - Counter: `rpc_method_finished_total{method=""}` — calls completed + - Counter: `rpc_method_errored_total{method=""}` — calls errored + - Histogram: `rpc_method_duration_us{method=""}` — execution time distribution + +- Use OTel `Counter` and `Histogram` instruments with `method` attribute label. + +- Hook into the existing PerfLog callback mechanism rather than adding new instrumentation points. + +**Key modified files**: + +- `src/xrpld/perflog/detail/PerfLogImp.cpp` (add OTel instrument updates alongside existing JSON counters) +- `src/xrpld/telemetry/MetricsRegistry.cpp` (register instruments) + +**Derived Prometheus metrics**: `rippled_rpc_method_started_total{method="server_info"}`, `rippled_rpc_method_duration_us_bucket{method="ledger"}`, etc. + +**Grafana dashboard**: Add "Per-Method RPC Breakdown" panel group to _RPC Performance_ dashboard. + +--- + +## Task 9.5: PerfLog Per-Job-Type Metrics + +**Objective**: Export per-job-type queue and execution metrics. + +**What to do**: + +- Register OTel instruments for PerfLog job counters: + - Counter: `job_queued_total{job_type=""}` — jobs queued + - Counter: `job_started_total{job_type=""}` — jobs started + - Counter: `job_finished_total{job_type=""}` — jobs completed + - Histogram: `job_queued_duration_us{job_type=""}` — time spent waiting in queue + - Histogram: `job_running_duration_us{job_type=""}` — execution time distribution + +- Hook into PerfLog's existing job tracking alongside Task 9.4. + +**Key modified files**: + +- `src/xrpld/perflog/detail/PerfLogImp.cpp` +- `src/xrpld/telemetry/MetricsRegistry.cpp` + +**Derived Prometheus metrics**: `rippled_job_queued_total{job_type="ledgerData"}`, `rippled_job_running_duration_us_bucket{job_type="transaction"}`, etc. 
+ +**Grafana dashboard**: New _Job Queue Analysis_ dashboard (`rippled-job-queue`). + +--- + +## Task 9.6: Counted Object Instance Metrics + +**Objective**: Export live instance counts for key internal object types. + +**What to do**: + +- Register OTel `ObservableGauge` callbacks for `CountedObject` instance counts: + - `object_count{type="Transaction"}` — live Transaction objects + - `object_count{type="Ledger"}` — live Ledger objects + - `object_count{type="NodeObject"}` — live NodeObject instances + - `object_count{type="STTx"}` — serialized transaction objects + - `object_count{type="STLedgerEntry"}` — serialized ledger entries + - `object_count{type="InboundLedger"}` — ledgers being fetched + - `object_count{type="Pathfinder"}` — active pathfinding computations + - `object_count{type="PathRequest"}` — active path requests + - `object_count{type="HashRouterEntry"}` — hash router entries + +- The `CountedObject` template already tracks these via atomic counters. The callback just reads the current counts. + +**Key modified files**: + +- `src/xrpld/telemetry/MetricsRegistry.cpp` (add counted object callbacks) +- `include/xrpl/basics/CountedObject.h` (may need static accessor for iteration) + +**Derived Prometheus metrics**: `rippled_object_count{type="Transaction"}`, `rippled_object_count{type="NodeObject"}`, etc. + +**Grafana dashboard**: Add "Object Instance Counts" panel to _Node Health_ dashboard. + +--- + +## Task 9.7: Fee Escalation & Load Factor Metrics + +**Objective**: Export the full load factor breakdown as time-series. 
+ +**What to do**: + +- Register OTel `ObservableGauge` callbacks for load factors (from `NetworkOPs.cpp` line ~2694): + - `load_factor` — combined transaction cost multiplier + - `load_factor_server` — server + cluster + network contribution + - `load_factor_local` — local server load only + - `load_factor_net` — network-wide load estimate + - `load_factor_cluster` — cluster peer load + - `load_factor_fee_escalation` — open ledger fee escalation + - `load_factor_fee_queue` — queue entry fee level + +- These overlap with some existing StatsD metrics but provide finer granularity (individual factor breakdown vs. combined value). + +**Key modified files**: + +- `src/xrpld/telemetry/MetricsRegistry.cpp` +- `src/xrpld/app/misc/NetworkOPs.cpp` (expose load factor accessors if needed) + +**Derived Prometheus metrics**: `rippled_load_factor`, `rippled_load_factor_fee_escalation`, etc. + +**Grafana dashboard**: Add "Load Factor Breakdown" panel to _Fee Market & TxQ_ dashboard. + +--- + +## Task 9.8: New Grafana Dashboards + +**Objective**: Create Grafana dashboards for the new metric categories. + +**What to do**: + +- Create 2 new dashboards: + 1. **Fee Market & TxQ** (`rippled-fee-market`) — TxQ depth/capacity, fee levels, load factor breakdown, fee escalation timeline + 2. **Job Queue Analysis** (`rippled-job-queue`) — Per-job-type rates, queue wait times, execution times, job queue depth + +- Update 2 existing dashboards: + 1. **Node Health** (`rippled-statsd-node-health`) — Add NodeStore I/O panels, cache hit rate panels, object instance counts + 2. 
**RPC Performance** (`rippled-rpc-perf`) — Add per-method RPC breakdown panels + +**Key modified files**: + +- New: `docker/telemetry/grafana/dashboards/rippled-fee-market.json` +- New: `docker/telemetry/grafana/dashboards/rippled-job-queue.json` +- `docker/telemetry/grafana/dashboards/system-node-health.json` +- `docker/telemetry/grafana/dashboards/rippled-rpc-perf.json` + +--- + +## Task 9.9: Update Documentation + +**Objective**: Update telemetry reference docs with all new metrics. + +**What to do**: + +- Update `OpenTelemetryPlan/09-data-collection-reference.md`: + - Add new section for OTel SDK-exported metrics (NodeStore, cache, TxQ, PerfLog, CountedObjects, load factors) + - Update Grafana dashboard reference table (add 2 new dashboards) + - Add Prometheus query examples for new metrics + +- Update `docs/telemetry-runbook.md`: + - Add alerting rules for new metrics (NodeStore write_load, TxQ capacity, cache hit rate degradation) + - Add troubleshooting entries for new metric categories + +**Key modified files**: + +- `OpenTelemetryPlan/09-data-collection-reference.md` +- `docs/telemetry-runbook.md` + +--- + +## Task 9.10: Integration Tests + +**Objective**: Verify all new metrics appear in Prometheus after a test workload. 
+ +**What to do**: + +- Extend the existing telemetry integration test: + - Start rippled with `[telemetry] enabled=1` and `[insight] server=otel` + - Submit a batch of RPC calls and transactions + - Query Prometheus for each new metric family + - Assert non-zero values for: NodeStore reads, cache hit rates, TxQ count, PerfLog RPC counters, object counts, load factors + +- Add unit tests for the `MetricsRegistry` class: + - Verify callback registration and deregistration + - Verify metric values match `get_counts` JSON output + - Verify graceful behavior when telemetry is disabled + +**Key modified files**: + +- `src/tests/libxrpl/telemetry/MetricsRegistry.cpp` (new) +- `docker/telemetry/integration-test.sh` (existing integration test script; extend assertions) + +--- + +## Exit Criteria + +- [ ] All ~50 new metrics visible in Prometheus via OTLP pipeline +- [ ] `MetricsRegistry` class registers/deregisters cleanly with OTel SDK +- [ ] Async gauge callbacks execute at 10s intervals without performance impact +- [ ] 2 new Grafana dashboards operational (Fee Market, Job Queue) +- [ ] 2 existing dashboards updated with new panel groups +- [ ] Integration test validates all new metric families are non-zero +- [ ] No performance regression (< 0.5% CPU overhead from new callbacks) +- [ ] Documentation updated with full new metric inventory diff --git a/docker/telemetry/grafana/dashboards/rippled-fee-market.json b/docker/telemetry/grafana/dashboards/rippled-fee-market.json new file mode 100644 index 0000000000..85fb1aa102 --- /dev/null +++ b/docker/telemetry/grafana/dashboards/rippled-fee-market.json @@ -0,0 +1,343 @@ +{ + "annotations": { + "list": [] + }, + "description": "Fee market dynamics: TxQ depth/capacity, fee escalation levels, and load factor breakdown. 
Sourced from OTel MetricsRegistry observable gauges (Phase 9).", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "title": "Transaction Queue Depth", + "description": "Current number of transactions waiting in the queue vs. maximum capacity. Sourced from MetricsRegistry txq_metrics observable gauge with metric=txq_count and metric=txq_max_size.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_txq_metrics{exported_instance=~\"$node\", metric=\"txq_count\"}", + "legendFormat": "Queue Depth [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_txq_metrics{exported_instance=~\"$node\", metric=\"txq_max_size\"}", + "legendFormat": "Max Capacity [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "axisLabel": "Transactions", + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 10 + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "title": "Transactions Per Ledger", + "description": "Transactions in the current open ledger vs. expected per-ledger count. 
Sourced from txq_metrics with metric=txq_in_ledger and metric=txq_per_ledger.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_txq_metrics{exported_instance=~\"$node\", metric=\"txq_in_ledger\"}", + "legendFormat": "In Ledger [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_txq_metrics{exported_instance=~\"$node\", metric=\"txq_per_ledger\"}", + "legendFormat": "Expected Per Ledger [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "axisLabel": "Transactions", + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 10 + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "title": "Fee Escalation Levels", + "description": "Fee levels that control transaction queue admission. Reference fee level is the baseline; open ledger fee level triggers escalation. 
Sourced from txq_metrics observable gauge.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 8 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_txq_metrics{exported_instance=~\"$node\", metric=\"txq_reference_fee_level\"}", + "legendFormat": "Reference Fee Level [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_txq_metrics{exported_instance=~\"$node\", metric=\"txq_min_processing_fee_level\"}", + "legendFormat": "Min Processing Fee Level [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_txq_metrics{exported_instance=~\"$node\", metric=\"txq_med_fee_level\"}", + "legendFormat": "Median Fee Level [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_txq_metrics{exported_instance=~\"$node\", metric=\"txq_open_ledger_fee_level\"}", + "legendFormat": "Open Ledger Fee Level [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "axisLabel": "Fee Level", + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 5, + "scaleDistribution": { + "type": "log", + "log": 2 + } + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "title": "Load Factor Breakdown", + "description": "Decomposed load factor components: server (max of local, net, cluster), fee escalation, fee queue, and combined. Values are unitless multipliers where 1.0 = no load. 
Sourced from load_factor_metrics observable gauge.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_load_factor_metrics{exported_instance=~\"$node\", metric=\"load_factor\"}", + "legendFormat": "Combined Load Factor [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_load_factor_metrics{exported_instance=~\"$node\", metric=\"load_factor_server\"}", + "legendFormat": "Server [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_load_factor_metrics{exported_instance=~\"$node\", metric=\"load_factor_fee_escalation\"}", + "legendFormat": "Fee Escalation [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_load_factor_metrics{exported_instance=~\"$node\", metric=\"load_factor_fee_queue\"}", + "legendFormat": "Fee Queue [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "axisLabel": "Multiplier", + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 5 + }, + "color": { + "mode": "palette-classic" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 2 + }, + { + "color": "red", + "value": 10 + } + ] + } + }, + "overrides": [] + } + }, + { + "title": "Load Factor Components", + "description": "Individual load factor contributors: local server load, network load, and cluster load. Only differ from 1.0 under load conditions. 
Sourced from load_factor_metrics observable gauge.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_load_factor_metrics{exported_instance=~\"$node\", metric=\"load_factor_local\"}", + "legendFormat": "Local [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_load_factor_metrics{exported_instance=~\"$node\", metric=\"load_factor_net\"}", + "legendFormat": "Network [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_load_factor_metrics{exported_instance=~\"$node\", metric=\"load_factor_cluster\"}", + "legendFormat": "Cluster [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "axisLabel": "Multiplier", + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 5 + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + } + ], + "schemaVersion": 39, + "tags": ["rippled", "otel", "fee-market"], + "templating": { + "list": [ + { + "name": "node", + "label": "Node", + "description": "Filter by rippled node (service.instance.id)", + "type": "query", + "query": "label_values(exported_instance)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "includeAll": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "multi": true, + "refresh": 2, + "sort": 1 + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Fee Market & TxQ", + "uid": "rippled-fee-market", + "version": 1 +} diff --git a/docker/telemetry/grafana/dashboards/rippled-job-queue.json b/docker/telemetry/grafana/dashboards/rippled-job-queue.json new file mode 100644 index 0000000000..e29b96f750 --- /dev/null +++ 
b/docker/telemetry/grafana/dashboards/rippled-job-queue.json @@ -0,0 +1,395 @@ +{ + "annotations": { + "list": [] + }, + "description": "Job queue analysis: per-job-type throughput rates, queue wait times, and execution times. Sourced from OTel MetricsRegistry synchronous counters and histograms (Phase 9).", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "title": "Job Throughput Rate (Per Second)", + "description": "Rate of jobs queued, started, and finished across all job types. Computed as rate() over the OTel counter values. High queue rates with low finish rates indicate backlog.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 0 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(rippled_job_queued_total{exported_instance=~\"$node\", job_type=~\"$job_type\"}[5m]))", + "legendFormat": "Queued/s [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(rippled_job_started_total{exported_instance=~\"$node\", job_type=~\"$job_type\"}[5m]))", + "legendFormat": "Started/s [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(rippled_job_finished_total{exported_instance=~\"$node\", job_type=~\"$job_type\"}[5m]))", + "legendFormat": "Finished/s [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 10, + "axisLabel": "Operations / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "title": "Per-Job-Type Queued Rate", + "description": "Rate of jobs queued broken down by 
job_type label. Identifies which job types contribute most to queue activity.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": ["mean", "max"] + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "topk(10, rate(rippled_job_queued_total{exported_instance=~\"$node\", job_type=~\"$job_type\"}[5m]))", + "legendFormat": "{{job_type}} [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "drawStyle": "line", + "lineWidth": 1, + "fillOpacity": 5, + "axisLabel": "Operations / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "title": "Per-Job-Type Finish Rate", + "description": "Rate of jobs completing broken down by job_type. 
Compare with queued rate to identify backlog per type.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": ["mean", "max"] + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "topk(10, rate(rippled_job_finished_total{exported_instance=~\"$node\", job_type=~\"$job_type\"}[5m]))", + "legendFormat": "{{job_type}} [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "drawStyle": "line", + "lineWidth": 1, + "fillOpacity": 5, + "axisLabel": "Operations / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "title": "Job Queue Wait Time (P50, P95, P99)", + "description": "Histogram quantiles for time jobs spend waiting in the queue before execution starts. 
High values indicate thread pool saturation.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(rippled_job_queued_duration_us_bucket{exported_instance=~\"$node\", job_type=~\"$job_type\"}[5m])))", + "legendFormat": "P50 [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(rippled_job_queued_duration_us_bucket{exported_instance=~\"$node\", job_type=~\"$job_type\"}[5m])))", + "legendFormat": "P95 [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum by (le, exported_instance) (rate(rippled_job_queued_duration_us_bucket{exported_instance=~\"$node\", job_type=~\"$job_type\"}[5m])))", + "legendFormat": "P99 [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "us", + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 5, + "axisLabel": "Duration (μs)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "title": "Job Execution Time (P50, P95, P99)", + "description": "Histogram quantiles for actual job execution time. 
High values indicate expensive operations or resource contention.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(rippled_job_running_duration_us_bucket{exported_instance=~\"$node\", job_type=~\"$job_type\"}[5m])))", + "legendFormat": "P50 [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(rippled_job_running_duration_us_bucket{exported_instance=~\"$node\", job_type=~\"$job_type\"}[5m])))", + "legendFormat": "P95 [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum by (le, exported_instance) (rate(rippled_job_running_duration_us_bucket{exported_instance=~\"$node\", job_type=~\"$job_type\"}[5m])))", + "legendFormat": "P99 [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "us", + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 5, + "axisLabel": "Duration (μs)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "title": "Per-Job-Type Execution Time (P95)", + "description": "95th percentile execution time broken down by job type. 
Identifies the slowest job types.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 24 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": ["mean", "max"] + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "topk(10, histogram_quantile(0.95, sum by (le, job_type, exported_instance) (rate(rippled_job_running_duration_us_bucket{exported_instance=~\"$node\", job_type=~\"$job_type\"}[5m]))))", + "legendFormat": "{{job_type}} [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "us", + "custom": { + "drawStyle": "line", + "lineWidth": 1, + "fillOpacity": 5, + "axisLabel": "Duration (μs)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + } + ], + "schemaVersion": 39, + "tags": ["rippled", "otel", "job-queue"], + "templating": { + "list": [ + { + "name": "node", + "label": "Node", + "description": "Filter by rippled node (service.instance.id)", + "type": "query", + "query": "label_values(exported_instance)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "includeAll": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "multi": true, + "refresh": 2, + "sort": 1 + }, + { + "name": "job_type", + "label": "Job Type", + "description": "Filter by job type", + "type": "query", + "query": "label_values(rippled_job_queued_total, job_type)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "includeAll": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "multi": true, + "refresh": 2, + "sort": 1 + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Job Queue Analysis", + "uid": "rippled-job-queue", + 
"version": 1 +} diff --git a/docker/telemetry/grafana/dashboards/rippled-rpc-perf.json b/docker/telemetry/grafana/dashboards/rippled-rpc-perf.json new file mode 100644 index 0000000000..577ff69783 --- /dev/null +++ b/docker/telemetry/grafana/dashboards/rippled-rpc-perf.json @@ -0,0 +1,404 @@ +{ + "annotations": { + "list": [] + }, + "description": "Per-RPC-method performance: call rates, error rates, and latency distributions. Sourced from OTel MetricsRegistry synchronous counters and histograms (Phase 9).", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "title": "RPC Call Rate (All Methods)", + "description": "Aggregate rate of RPC calls started, finished, and errored across all methods. Computed as rate() over OTel counters.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 0 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(rippled_rpc_method_started_total{exported_instance=~\"$node\", method=~\"$method\"}[5m]))", + "legendFormat": "Started/s [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(rippled_rpc_method_finished_total{exported_instance=~\"$node\", method=~\"$method\"}[5m]))", + "legendFormat": "Finished/s [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (exported_instance) (rate(rippled_rpc_method_errored_total{exported_instance=~\"$node\", method=~\"$method\"}[5m]))", + "legendFormat": "Errored/s [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 10, + "axisLabel": "Operations / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + }, + "color": { + 
"mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "title": "Per-Method Call Rate (Top 10)", + "description": "Per-method RPC call rate, showing the 10 most active methods. Useful for identifying hot paths.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": ["mean", "max"] + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "topk(10, rate(rippled_rpc_method_started_total{exported_instance=~\"$node\", method=~\"$method\"}[5m]))", + "legendFormat": "{{method}} [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "drawStyle": "line", + "lineWidth": 1, + "fillOpacity": 5, + "axisLabel": "Operations / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "title": "Per-Method Error Rate (Top 10)", + "description": "Per-method RPC error rate. Non-zero values warrant investigation. 
Common culprits: invalid parameters, resource exhaustion.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": ["mean", "max"] + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "topk(10, rate(rippled_rpc_method_errored_total{exported_instance=~\"$node\", method=~\"$method\"}[5m]))", + "legendFormat": "{{method}} [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "drawStyle": "line", + "lineWidth": 1, + "fillOpacity": 5, + "axisLabel": "Operations / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "title": "RPC Latency (P50, P95, P99) - All Methods", + "description": "Histogram quantiles for RPC execution time across all methods. 
Sourced from rpc_method_duration_us histogram.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.50, sum by (le, exported_instance) (rate(rippled_rpc_method_duration_us_bucket{exported_instance=~\"$node\", method=~\"$method\"}[5m])))", + "legendFormat": "P50 [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, exported_instance) (rate(rippled_rpc_method_duration_us_bucket{exported_instance=~\"$node\", method=~\"$method\"}[5m])))", + "legendFormat": "P95 [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum by (le, exported_instance) (rate(rippled_rpc_method_duration_us_bucket{exported_instance=~\"$node\", method=~\"$method\"}[5m])))", + "legendFormat": "P99 [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "us", + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 5, + "axisLabel": "Duration (μs)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "title": "Per-Method Latency P95 (Top 10 Slowest)", + "description": "95th percentile execution time per method. 
Identifies the slowest RPC endpoints.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": ["mean", "max"] + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "topk(10, histogram_quantile(0.95, sum by (le, method, exported_instance) (rate(rippled_rpc_method_duration_us_bucket{exported_instance=~\"$node\", method=~\"$method\"}[5m]))))", + "legendFormat": "{{method}} [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "us", + "custom": { + "drawStyle": "line", + "lineWidth": 1, + "fillOpacity": 5, + "axisLabel": "Duration (μs)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "title": "RPC Error Ratio by Method", + "description": "Error ratio (errors / total started) per method. 
Values above 0.05 (5%) warrant investigation.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 24 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": ["mean", "max"] + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "topk(10, rate(rippled_rpc_method_errored_total{exported_instance=~\"$node\", method=~\"$method\"}[5m]) / (rate(rippled_rpc_method_started_total{exported_instance=~\"$node\", method=~\"$method\"}[5m]) > 0))", + "legendFormat": "{{method}} [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "min": 0, + "max": 1, + "custom": { + "drawStyle": "line", + "lineWidth": 1, + "fillOpacity": 5, + "axisLabel": "Error Ratio", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + }, + "color": { + "mode": "palette-classic" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.05 + }, + { + "color": "red", + "value": 0.25 + } + ] + } + }, + "overrides": [] + } + } + ], + "schemaVersion": 39, + "tags": ["rippled", "otel", "rpc"], + "templating": { + "list": [ + { + "name": "node", + "label": "Node", + "description": "Filter by rippled node (service.instance.id)", + "type": "query", + "query": "label_values(exported_instance)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "includeAll": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "multi": true, + "refresh": 2, + "sort": 1 + }, + { + "name": "method", + "label": "RPC Method", + "description": "Filter by RPC method", + "type": "query", + "query": "label_values(rippled_rpc_method_started_total, method)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "includeAll": true, + "allValue": ".*", + "current": { + "text": "All", + "value": 
"$__all" + }, + "multi": true, + "refresh": 2, + "sort": 1 + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "RPC Performance (OTel)", + "uid": "rippled-rpc-perf", + "version": 1 +} diff --git a/docker/telemetry/grafana/dashboards/system-node-health.json b/docker/telemetry/grafana/dashboards/system-node-health.json index 456c62b2e1..546a5f12a2 100644 --- a/docker/telemetry/grafana/dashboards/system-node-health.json +++ b/docker/telemetry/grafana/dashboards/system-node-health.json @@ -52,7 +52,8 @@ "value": 20 } ] - } + }, + "custom": {} }, "overrides": [] } @@ -100,7 +101,8 @@ "value": 20 } ] - } + }, + "custom": {} }, "overrides": [] } @@ -351,7 +353,8 @@ ], "fieldConfig": { "defaults": { - "unit": "ops" + "unit": "ops", + "custom": {} }, "overrides": [] } @@ -395,6 +398,324 @@ "value": 0.01 } ] + }, + "custom": {} + }, + "overrides": [] + } + }, + { + "title": "--- OTel: NodeStore I/O ---", + "type": "row", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 32 + }, + "collapsed": false, + "panels": [] + }, + { + "title": "NodeStore Read/Write Totals", + "description": "Cumulative NodeStore read and write operation counts. 
Sourced from MetricsRegistry nodestore_state observable gauge with metric=node_reads_total, node_writes, node_reads_hit.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 33 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_nodestore_state{exported_instance=~\"$node\", metric=\"node_reads_total\"}", + "legendFormat": "Reads Total [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_nodestore_state{exported_instance=~\"$node\", metric=\"node_reads_hit\"}", + "legendFormat": "Reads Hit (cache) [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_nodestore_state{exported_instance=~\"$node\", metric=\"node_writes\"}", + "legendFormat": "Writes Total [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "axisLabel": "Operations", + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 10 + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "title": "NodeStore Write Load & Read Queue", + "description": "Instantaneous write load score and read queue depth. High write load indicates backend pressure. 
High read queue indicates prefetch thread saturation.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 33 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_nodestore_state{exported_instance=~\"$node\", metric=\"write_load\"}", + "legendFormat": "Write Load [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_nodestore_state{exported_instance=~\"$node\", metric=\"read_queue\"}", + "legendFormat": "Read Queue [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "axisLabel": "Count", + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 10 + }, + "color": { + "mode": "palette-classic" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 100 + }, + { + "color": "red", + "value": 1000 + } + ] + } + }, + "overrides": [] + } + }, + { + "title": "--- OTel: Cache Hit Rates ---", + "type": "row", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 41 + }, + "collapsed": false, + "panels": [] + }, + { + "title": "Cache Hit Rates", + "description": "Hit rates for SLE cache, Ledger cache, and AcceptedLedger cache. Values from 0.0 to 1.0. Low values indicate cache thrashing. 
Sourced from MetricsRegistry cache_metrics observable gauge.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 42 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_cache_metrics{exported_instance=~\"$node\", metric=\"SLE_hit_rate\"}", + "legendFormat": "SLE Hit Rate [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_cache_metrics{exported_instance=~\"$node\", metric=\"ledger_hit_rate\"}", + "legendFormat": "Ledger Hit Rate [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_cache_metrics{exported_instance=~\"$node\", metric=\"AL_hit_rate\"}", + "legendFormat": "AcceptedLedger Hit Rate [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "min": 0, + "max": 1, + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 10, + "axisLabel": "Hit Rate", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "title": "Cache Sizes", + "description": "TreeNode cache size, TreeNode track size, and FullBelow cache size. 
Sourced from MetricsRegistry cache_metrics observable gauge.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 42 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_cache_metrics{exported_instance=~\"$node\", metric=\"treenode_cache_size\"}", + "legendFormat": "TreeNode Cache [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_cache_metrics{exported_instance=~\"$node\", metric=\"treenode_track_size\"}", + "legendFormat": "TreeNode Track [{{exported_instance}}]" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "rippled_cache_metrics{exported_instance=~\"$node\", metric=\"fullbelow_size\"}", + "legendFormat": "FullBelow [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "axisLabel": "Entries", + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 10 + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "title": "--- OTel: Object Instance Counts ---", + "type": "row", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 50 + }, + "collapsed": false, + "panels": [] + }, + { + "title": "Object Instance Counts", + "description": "Live instance counts for key internal object types tracked by CountedObject. Sourced from MetricsRegistry object_count observable gauge. 
High counts may indicate memory pressure or object leaks.", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 51 + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": ["last", "max"] + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "topk(15, rippled_object_count{exported_instance=~\"$node\", type=~\"$type\"})", + "legendFormat": "{{type}} [{{exported_instance}}]" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "axisLabel": "Instances", + "drawStyle": "line", + "lineWidth": 1, + "fillOpacity": 5 + }, + "color": { + "mode": "palette-classic" } }, "overrides": [] @@ -402,7 +723,7 @@ } ], "schemaVersion": 39, - "tags": ["rippled", "statsd", "node-health", "telemetry"], + "tags": ["rippled", "statsd", "otel", "node-health", "telemetry"], "templating": { "list": [ { @@ -424,6 +745,26 @@ "multi": true, "refresh": 2, "sort": 1 + }, + { + "name": "type", + "label": "Object Type", + "description": "Filter by internal object type (CountedObject class name)", + "type": "query", + "query": "label_values(rippled_object_count, type)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "includeAll": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "multi": true, + "refresh": 2, + "sort": 1 } ] }, diff --git a/docker/telemetry/integration-test.sh b/docker/telemetry/integration-test.sh index 6ebe0b2ecb..0938a02984 100755 --- a/docker/telemetry/integration-test.sh +++ b/docker/telemetry/integration-test.sh @@ -355,6 +355,7 @@ trace_transactions=1 trace_consensus=1 trace_peer=1 trace_ledger=1 +metrics_endpoint=http://localhost:4318/v1/metrics [insight] server=otel @@ -639,6 +640,53 @@ else fail "StatsD port 8125 appears to be listening (should not be needed)" fi +# --------------------------------------------------------------------------- +# 
Step 10c: Verify Phase 9 OTel SDK Metrics +# --------------------------------------------------------------------------- +log "" +log "--- Phase 9: OTel SDK Metrics (MetricsRegistry) ---" +log "Waiting 15s for OTel metric export + Prometheus scrape..." +sleep 15 + +check_otel_metric() { # $1 = PromQL selector; ok if Prometheus returns >= 1 series + local metric_name="$1" # may contain { } " = : must be URL-encoded, never spliced into the URL + local result # series count; left empty when curl or jq fails + result=$(curl -sfG --data-urlencode "query=$metric_name" "$PROM/api/v1/query" \ + | jq '.data.result | length' 2>/dev/null || echo 0) + if [ "${result:-0}" -gt 0 ]; then + ok "OTel: $metric_name ($result series)" + else + fail "OTel: $metric_name (0 series)" + fi +} + +# Task 9.1: NodeStore I/O +check_otel_metric 'rippled_nodestore_state{metric="node_reads_total"}' +check_otel_metric 'rippled_nodestore_state{metric="write_load"}' + +# Task 9.2: Cache hit rates +check_otel_metric 'rippled_cache_metrics{metric="SLE_hit_rate"}' +check_otel_metric 'rippled_cache_metrics{metric="treenode_cache_size"}' + +# Task 9.3: TxQ metrics +check_otel_metric 'rippled_txq_metrics{metric="txq_count"}' +check_otel_metric 'rippled_txq_metrics{metric="txq_reference_fee_level"}' + +# Task 9.4: Per-RPC metrics +check_otel_metric "rippled_rpc_method_started_total" +check_otel_metric "rippled_rpc_method_finished_total" + +# Task 9.5: Per-job metrics +check_otel_metric "rippled_job_queued_total" +check_otel_metric "rippled_job_finished_total" + +# Task 9.6: Counted object instances +check_otel_metric "rippled_object_count" + +# Task 9.7: Load factor breakdown +check_otel_metric 'rippled_load_factor_metrics{metric="load_factor"}' +check_otel_metric 'rippled_load_factor_metrics{metric="load_factor_server"}' + # --------------------------------------------------------------------------- # Step 11: Summary # --------------------------------------------------------------------------- diff --git a/include/xrpl/core/ServiceRegistry.h b/include/xrpl/core/ServiceRegistry.h index aa0d9c495c..ff250453b8 100644 --- a/include/xrpl/core/ServiceRegistry.h +++ b/include/xrpl/core/ServiceRegistry.h @@ -20,7
+20,8 @@ class PerfLog; } namespace telemetry { class Telemetry; -} +class MetricsRegistry; +} // namespace telemetry // This is temporary until we migrate all code to use ServiceRegistry. class Application; @@ -224,6 +225,12 @@ public: virtual telemetry::Telemetry& getTelemetry() = 0; + /** Return the MetricsRegistry, or nullptr if none has been created yet. + Used by PerfLog and other hot paths to record OTel metrics; null-check it. + */ + virtual telemetry::MetricsRegistry* + getMetricsRegistry() = 0; + // Configuration and state virtual bool isStopping() const = 0; diff --git a/src/tests/libxrpl/CMakeLists.txt b/src/tests/libxrpl/CMakeLists.txt index 86e00614e1..2c2bd64acb 100644 --- a/src/tests/libxrpl/CMakeLists.txt +++ b/src/tests/libxrpl/CMakeLists.txt @@ -62,5 +62,14 @@ if(telemetry) xrpl.test.telemetry PRIVATE opentelemetry-cpp::opentelemetry-cpp ) +else() + # MetricsRegistry lives in xrpld; compile its .cpp directly into the test + # target so the no-op path can be tested without linking all of xrpld. + # When telemetry=ON, XRPL_ENABLE_TELEMETRY is globally defined and the + # .cpp pulls in xrpld symbols we cannot satisfy here. + target_sources( + xrpl.test.telemetry + PRIVATE ${CMAKE_SOURCE_DIR}/src/xrpld/telemetry/MetricsRegistry.cpp + ) endif() add_dependencies(xrpl.tests xrpl.test.telemetry) diff --git a/src/tests/libxrpl/telemetry/MetricsRegistry.cpp b/src/tests/libxrpl/telemetry/MetricsRegistry.cpp new file mode 100644 index 0000000000..2e11d37819 --- /dev/null +++ b/src/tests/libxrpl/telemetry/MetricsRegistry.cpp @@ -0,0 +1,346 @@ +/** GTest unit tests for MetricsRegistry (no-op / telemetry-disabled path). + * + * Tests cover: + * - Construction with telemetry disabled (no-op behavior). + * - start()/stop() lifecycle when disabled. + * - Synchronous instrument recording methods do not crash when disabled. + * - Double stop() is safe. + * - Destructor handles cleanup without crash. + * + * NOTE: These tests only exercise the no-op path (telemetry disabled).
+ * When XRPL_ENABLE_TELEMETRY is defined, MetricsRegistry.cpp pulls in + * xrpld symbols that cannot be linked into this standalone test binary, + * so the tests are compiled out. + */ + +// When telemetry is globally enabled, MetricsRegistry.cpp requires xrpld +// link dependencies we cannot satisfy in a standalone GTest binary. +#ifndef XRPL_ENABLE_TELEMETRY + +#include + +#include + +#include + +using namespace xrpl; + +namespace { + +/** Minimal mock ServiceRegistry for MetricsRegistry testing. + * + * Only the getMetricsRegistry() call is used in the tests; other methods + * are not invoked because the registry is disabled (enabled=false) so no + * gauge callbacks execute. + * + * Overrides throw on accidental calls, except a few that return inert defaults. + */ +class MockServiceRegistry : public ServiceRegistry +{ + [[noreturn]] void + throwUnimplemented() const + { + throw std::logic_error("MockServiceRegistry: method not implemented"); + } + +public: + // ServiceRegistry interface — stubs that should never be called.
+ CollectorManager& + getCollectorManager() override + { + throwUnimplemented(); + } + Family& + getNodeFamily() override + { + throwUnimplemented(); + } + TimeKeeper& + timeKeeper() override + { + throwUnimplemented(); + } + JobQueue& + getJobQueue() override + { + throwUnimplemented(); + } + NodeCache& + getTempNodeCache() override + { + throwUnimplemented(); + } + CachedSLEs& + cachedSLEs() override + { + throwUnimplemented(); + } + NetworkIDService& + getNetworkIDService() override + { + throwUnimplemented(); + } + AmendmentTable& + getAmendmentTable() override + { + throwUnimplemented(); + } + HashRouter& + getHashRouter() override + { + throwUnimplemented(); + } + LoadFeeTrack& + getFeeTrack() override + { + throwUnimplemented(); + } + LoadManager& + getLoadManager() override + { + throwUnimplemented(); + } + RCLValidations& + getValidations() override + { + throwUnimplemented(); + } + ValidatorList& + validators() override + { + throwUnimplemented(); + } + ValidatorSite& + validatorSites() override + { + throwUnimplemented(); + } + ManifestCache& + validatorManifests() override + { + throwUnimplemented(); + } + ManifestCache& + publisherManifests() override + { + throwUnimplemented(); + } + Overlay& + overlay() override + { + throwUnimplemented(); + } + Cluster& + cluster() override + { + throwUnimplemented(); + } + PeerReservationTable& + peerReservations() override + { + throwUnimplemented(); + } + Resource::Manager& + getResourceManager() override + { + throwUnimplemented(); + } + NodeStore::Database& + getNodeStore() override + { + throwUnimplemented(); + } + SHAMapStore& + getSHAMapStore() override + { + throwUnimplemented(); + } + RelationalDatabase& + getRelationalDatabase() override + { + throwUnimplemented(); + } + InboundLedgers& + getInboundLedgers() override + { + throwUnimplemented(); + } + InboundTransactions& + getInboundTransactions() override + { + throwUnimplemented(); + } + TaggedCache& + getAcceptedLedgerCache() override + { + 
throwUnimplemented(); + } + LedgerMaster& + getLedgerMaster() override + { + throwUnimplemented(); + } + LedgerCleaner& + getLedgerCleaner() override + { + throwUnimplemented(); + } + LedgerReplayer& + getLedgerReplayer() override + { + throwUnimplemented(); + } + PendingSaves& + pendingSaves() override + { + throwUnimplemented(); + } + OpenLedger& + openLedger() override + { + throwUnimplemented(); + } + OpenLedger const& + openLedger() const override + { + throwUnimplemented(); + } + NetworkOPs& + getOPs() override + { + throwUnimplemented(); + } + OrderBookDB& + getOrderBookDB() override + { + throwUnimplemented(); + } + TransactionMaster& + getMasterTransaction() override + { + throwUnimplemented(); + } + TxQ& + getTxQ() override + { + throwUnimplemented(); + } + PathRequests& + getPathRequests() override + { + throwUnimplemented(); + } + ServerHandler& + getServerHandler() override + { + throwUnimplemented(); + } + perf::PerfLog& + getPerfLog() override + { + throwUnimplemented(); + } + telemetry::Telemetry& + getTelemetry() override + { + throwUnimplemented(); + } + telemetry::MetricsRegistry* + getMetricsRegistry() override + { + return nullptr; + } + bool + isStopping() const override + { + return false; + } + beast::Journal + journal(std::string const&) override + { + return beast::Journal(beast::Journal::getNullSink()); + } + boost::asio::io_context& + getIOContext() override + { + throwUnimplemented(); + } + Logs& + logs() override + { + throwUnimplemented(); + } + std::optional const& + trapTxID() const override + { + static std::optional const empty; + return empty; + } + DatabaseCon& + getWalletDB() override + { + throwUnimplemented(); + } + Application& + app() override + { + throwUnimplemented(); + } +}; + +/// Test fixture that provides a MockServiceRegistry and null Journal. 
+class MetricsRegistryTest : public ::testing::Test +{ +protected: + MockServiceRegistry mockApp_; + beast::Journal j_{beast::Journal::getNullSink()}; +}; + +} // namespace + +TEST_F(MetricsRegistryTest, disabled_construction) +{ + // Construct with enabled=false; should be a no-op. + telemetry::MetricsRegistry registry(false, mockApp_, j_); + EXPECT_FALSE(registry.isEnabled()); +} + +TEST_F(MetricsRegistryTest, disabled_start_stop) +{ + telemetry::MetricsRegistry registry(false, mockApp_, j_); + + // start() and stop() should be no-ops when disabled. + registry.start("http://localhost:4318/v1/metrics"); + registry.stop(); + + // Double stop should be safe. + registry.stop(); +} + +TEST_F(MetricsRegistryTest, disabled_recording_methods) +{ + telemetry::MetricsRegistry registry(false, mockApp_, j_); + registry.start("http://localhost:4318/v1/metrics"); + + // All recording methods should be no-ops (not crash). + registry.recordRpcStarted("server_info"); + registry.recordRpcFinished("server_info", 1000); + registry.recordRpcErrored("ledger", 500); + registry.recordJobQueued("ledgerData"); + registry.recordJobStarted("ledgerData", 200); + registry.recordJobFinished("ledgerData", 3000); + + registry.stop(); +} + +TEST_F(MetricsRegistryTest, destructor_calls_stop) +{ + { + // Let the destructor handle cleanup. + telemetry::MetricsRegistry registry(false, mockApp_, j_); + registry.start("http://localhost:4318/v1/metrics"); + } + // If we get here without crash, the destructor handled stop. 
+} + +#endif // !XRPL_ENABLE_TELEMETRY diff --git a/src/xrpld/app/main/Application.cpp b/src/xrpld/app/main/Application.cpp index 4c394de0dc..3d8a59ca85 100644 --- a/src/xrpld/app/main/Application.cpp +++ b/src/xrpld/app/main/Application.cpp @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -149,6 +150,9 @@ public: beast::Journal m_journal; std::unique_ptr perfLog_; std::unique_ptr telemetry_; + /// OTel metrics registry for gap-fill metrics (counters, histograms, + /// observable gauges). Created after telemetry_ during setup(). + std::unique_ptr metricsRegistry_; Application::MutexType m_masterMutex; // Required by the SHAMapStore @@ -640,6 +644,12 @@ public: return *telemetry_; } + telemetry::MetricsRegistry* + getMetricsRegistry() override + { + return metricsRegistry_.get(); + } + NodeCache& getTempNodeCache() override { @@ -1289,6 +1299,11 @@ ApplicationImp::setup(boost::program_options::variables_map const& cmdline) if (!config_->section("telemetry").exists("service_instance_id")) telemetry_->setServiceInstanceId(toBase58(TokenType::NodePublic, nodeIdentity_->first)); + // Create the OTel MetricsRegistry for gap-fill metrics (counters, + // histograms, observable gauges). It is started later in start(). + metricsRegistry_ = std::make_unique( + telemetry_->isEnabled(), *this, logs_->journal("MetricsRegistry")); + if (!cluster_->load(config().section(SECTION_CLUSTER_NODES))) { JLOG(m_journal.fatal()) << "Invalid entry in cluster configuration."; @@ -1502,6 +1517,24 @@ ApplicationImp::start(bool withTimers) ledgerCleaner_->start(); perfLog_->start(); telemetry_->start(); + + // Start the metrics pipeline after telemetry; the endpoint uses the + // same base URL but the /v1/metrics path. 
+ if (metricsRegistry_) + { + auto const& section = config_->section("telemetry"); + std::string endpoint = "http://localhost:4318/v1/metrics"; + set(endpoint, "metrics_endpoint", section); + + // Pass the service_instance_id so the MeterProvider Resource + // carries it, giving Prometheus an exported_instance label. + std::string instanceId; + set(instanceId, "service_instance_id", section); + if (instanceId.empty() && nodeIdentity_) + instanceId = toBase58(TokenType::NodePublic, nodeIdentity_->first); + + metricsRegistry_->start(endpoint, instanceId); + } } void @@ -1592,6 +1625,10 @@ ApplicationImp::run() ledgerCleaner_->stop(); m_nodeStore->stop(); perfLog_->stop(); + // Stop metrics pipeline before telemetry — gauge callbacks reference + // Application services that may be shutting down. + if (metricsRegistry_) + metricsRegistry_->stop(); // Telemetry must stop last among trace-producing components. // serverHandler_, overlay_, and jobQueue_ are already stopped above, // so no threads should be calling startSpan() at this point. diff --git a/src/xrpld/perflog/detail/PerfLogImp.cpp b/src/xrpld/perflog/detail/PerfLogImp.cpp index 960fdcb3ac..4618a9f381 100644 --- a/src/xrpld/perflog/detail/PerfLogImp.cpp +++ b/src/xrpld/perflog/detail/PerfLogImp.cpp @@ -1,9 +1,11 @@ #include +#include #include #include #include #include +#include #include #include @@ -316,6 +318,10 @@ PerfLogImp::rpcStart(std::string const& method, std::uint64_t const requestId) } std::lock_guard lock(counters_.methodsMutex_); counters_.methods_[requestId] = {counter->first.c_str(), steady_clock::now()}; + + // Task 9.4: Record RPC start in OTel metrics pipeline. + if (auto* mr = app_.getMetricsRegistry()) + mr->recordRpcStarted(method); } void @@ -371,6 +377,10 @@ PerfLogImp::jobQueue(JobType const type) } std::lock_guard lock(counter->second.mutex); ++counter->second.value.queued; + + // Task 9.5: Record job enqueue in OTel metrics pipeline. 
+ if (auto* mr = app_.getMetricsRegistry()) + mr->recordJobQueued(JobTypes::name(type)); } void @@ -397,6 +407,10 @@ PerfLogImp::jobStart( std::lock_guard lock(counters_.jobsMutex_); if (instance >= 0 && instance < counters_.jobs_.size()) counters_.jobs_[instance] = {type, startTime}; + + // Task 9.5: Record job start in OTel metrics pipeline. + if (auto* mr = app_.getMetricsRegistry()) + mr->recordJobStarted(JobTypes::name(type), dur.count()); } void @@ -419,6 +433,10 @@ PerfLogImp::jobFinish(JobType const type, microseconds dur, int instance) std::lock_guard lock(counters_.jobsMutex_); if (instance >= 0 && instance < counters_.jobs_.size()) counters_.jobs_[instance] = {jtINVALID, steady_time_point()}; + + // Task 9.5: Record job finish in OTel metrics pipeline. + if (auto* mr = app_.getMetricsRegistry()) + mr->recordJobFinished(JobTypes::name(type), dur.count()); } void diff --git a/src/xrpld/telemetry/MetricsRegistry.cpp b/src/xrpld/telemetry/MetricsRegistry.cpp new file mode 100644 index 0000000000..99c94efc85 --- /dev/null +++ b/src/xrpld/telemetry/MetricsRegistry.cpp @@ -0,0 +1,513 @@ +/** MetricsRegistry implementation — OpenTelemetry metric instruments for rippled. + + This file contains: + - Construction / destruction logic for the OTel MeterProvider pipeline. + - Synchronous instrument creation (counters, histograms) for RPC, job + queue, and NodeStore I/O metrics. + - Observable gauge callback registration for cache hit rates, TxQ state, + CountedObject instances, load factors, and NodeStore queue depth. + - No-op stubs when XRPL_ENABLE_TELEMETRY is not defined. +*/ + +// On Windows, OTel's spin_lock_mutex.h (transitively included from +// MetricsRegistry.h) defines _WINSOCKAPI_ and includes . +// This poisons the include state for boost/asio/detail/socket_types.hpp, +// which requires winsock2.h to be included first. Pre-including the +// boost/asio socket types header gets winsock2.h in before the OTel +// headers can interfere. 
+#ifdef _MSC_VER +#include +#endif + +#include + +#ifdef XRPL_ENABLE_TELEMETRY + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace metric_sdk = opentelemetry::sdk::metrics; +namespace otlp_http = opentelemetry::exporter::otlp; +namespace resource = opentelemetry::sdk::resource; + +#endif // XRPL_ENABLE_TELEMETRY + +namespace xrpl { +namespace telemetry { + +MetricsRegistry::MetricsRegistry(bool enabled, ServiceRegistry& app, beast::Journal journal) + : enabled_(enabled), app_(app), journal_(journal) +{ +} + +MetricsRegistry::~MetricsRegistry() +{ + stop(); +} + +void +MetricsRegistry::start(std::string const& endpoint, std::string const& instanceId) +{ +#ifdef XRPL_ENABLE_TELEMETRY + if (!enabled_) + return; + + JLOG(journal_.info()) << "MetricsRegistry: starting, endpoint=" << endpoint + << ", instanceId=" << instanceId; + + // Configure OTLP/HTTP metric exporter. + otlp_http::OtlpHttpMetricExporterOptions exporterOpts; + exporterOpts.url = endpoint; + auto exporter = otlp_http::OtlpHttpMetricExporterFactory::Create(exporterOpts); + + // Configure periodic reader with 10-second export interval. + metric_sdk::PeriodicExportingMetricReaderOptions readerOpts; + readerOpts.export_interval_millis = std::chrono::milliseconds(10000); + readerOpts.export_timeout_millis = std::chrono::milliseconds(5000); + auto reader = + metric_sdk::PeriodicExportingMetricReaderFactory::Create(std::move(exporter), readerOpts); + + // Configure resource attributes so Prometheus exported_instance labels + // distinguish metrics from different nodes (matches OTelCollector setup). 
+ resource::ResourceAttributes attrs; + attrs[resource::SemanticConventions::kServiceName] = "rippled"; + if (!instanceId.empty()) + attrs[resource::SemanticConventions::kServiceInstanceId] = instanceId; + auto resourceAttrs = resource::Resource::Create(attrs); + + // Create MeterProvider with resource, then attach the metric reader. + provider_ = metric_sdk::MeterProviderFactory::Create( + std::make_unique(), resourceAttrs); + provider_->AddMetricReader(std::move(reader)); + + // Get a meter for all rippled instruments. + meter_ = provider_->GetMeter("rippled", "1.0.0"); + + // --- Create synchronous instruments --- + + // RPC per-method counters and histogram. + rpcStartedCounter_ = meter_->CreateUInt64Counter( + "rippled_rpc_method_started_total", "Total RPC method calls started"); + rpcFinishedCounter_ = meter_->CreateUInt64Counter( + "rippled_rpc_method_finished_total", "Total RPC method calls completed successfully"); + rpcErroredCounter_ = meter_->CreateUInt64Counter( + "rippled_rpc_method_errored_total", "Total RPC method calls that errored"); + rpcDurationHistogram_ = meter_->CreateDoubleHistogram( + "rippled_rpc_method_duration_us", "RPC method execution time in microseconds"); + + // Job queue per-type counters and histograms. + jobQueuedCounter_ = + meter_->CreateUInt64Counter("rippled_job_queued_total", "Total jobs enqueued"); + jobStartedCounter_ = + meter_->CreateUInt64Counter("rippled_job_started_total", "Total jobs started"); + jobFinishedCounter_ = + meter_->CreateUInt64Counter("rippled_job_finished_total", "Total jobs completed"); + jobQueuedDurationHistogram_ = meter_->CreateDoubleHistogram( + "rippled_job_queued_duration_us", "Time jobs spent waiting in the queue (microseconds)"); + jobRunningDurationHistogram_ = meter_->CreateDoubleHistogram( + "rippled_job_running_duration_us", "Job execution time in microseconds"); + + // Register all observable (async) gauges. 
+ registerAsyncGauges(); + + JLOG(journal_.info()) << "MetricsRegistry: started successfully"; +#else + (void)endpoint; +#endif // XRPL_ENABLE_TELEMETRY +} + +void +MetricsRegistry::stop() +{ +#ifdef XRPL_ENABLE_TELEMETRY + if (!provider_) + return; + + JLOG(journal_.info()) << "MetricsRegistry: stopping"; + + // Force-flush any pending metrics, then destroy the provider. + // This stops the PeriodicExportingMetricReader, which in turn + // stops invoking observable gauge callbacks. No explicit + // RemoveCallback is needed — the provider destruction handles it. + provider_->ForceFlush(); + provider_.reset(); + + JLOG(journal_.info()) << "MetricsRegistry: stopped"; +#endif // XRPL_ENABLE_TELEMETRY +} + +// ----------------------------------------------------------------- +// Synchronous instrument recording — RPC metrics (Task 9.4) +// ----------------------------------------------------------------- + +void +MetricsRegistry::recordRpcStarted(std::string_view method) +{ +#ifdef XRPL_ENABLE_TELEMETRY + if (!enabled_ || !rpcStartedCounter_) + return; + rpcStartedCounter_->Add(1, {{"method", std::string(method)}}); +#else + (void)method; +#endif +} + +void +MetricsRegistry::recordRpcFinished(std::string_view method, std::int64_t durationUs) +{ +#ifdef XRPL_ENABLE_TELEMETRY + if (!enabled_ || !rpcFinishedCounter_) + return; + rpcFinishedCounter_->Add(1, {{"method", std::string(method)}}); + if (rpcDurationHistogram_) + rpcDurationHistogram_->Record( + static_cast(durationUs), + {{"method", std::string(method)}}, + opentelemetry::context::Context{}); +#else + (void)method; + (void)durationUs; +#endif +} + +void +MetricsRegistry::recordRpcErrored(std::string_view method, std::int64_t durationUs) +{ +#ifdef XRPL_ENABLE_TELEMETRY + if (!enabled_ || !rpcErroredCounter_) + return; + rpcErroredCounter_->Add(1, {{"method", std::string(method)}}); + if (rpcDurationHistogram_) + rpcDurationHistogram_->Record( + static_cast(durationUs), + {{"method", std::string(method)}}, + 
opentelemetry::context::Context{}); +#else + (void)method; + (void)durationUs; +#endif +} + +// ----------------------------------------------------------------- +// Synchronous instrument recording — Job Queue metrics (Task 9.5) +// ----------------------------------------------------------------- + +void +MetricsRegistry::recordJobQueued(std::string_view jobType) +{ +#ifdef XRPL_ENABLE_TELEMETRY + if (!enabled_ || !jobQueuedCounter_) + return; + jobQueuedCounter_->Add(1, {{"job_type", std::string(jobType)}}); +#else + (void)jobType; +#endif +} + +void +MetricsRegistry::recordJobStarted(std::string_view jobType, std::int64_t queuedDurUs) +{ +#ifdef XRPL_ENABLE_TELEMETRY + if (!enabled_ || !jobStartedCounter_) + return; + jobStartedCounter_->Add(1, {{"job_type", std::string(jobType)}}); + if (jobQueuedDurationHistogram_) + jobQueuedDurationHistogram_->Record( + static_cast(queuedDurUs), + {{"job_type", std::string(jobType)}}, + opentelemetry::context::Context{}); +#else + (void)jobType; + (void)queuedDurUs; +#endif +} + +void +MetricsRegistry::recordJobFinished(std::string_view jobType, std::int64_t runningDurUs) +{ +#ifdef XRPL_ENABLE_TELEMETRY + if (!enabled_ || !jobFinishedCounter_) + return; + jobFinishedCounter_->Add(1, {{"job_type", std::string(jobType)}}); + if (jobRunningDurationHistogram_) + jobRunningDurationHistogram_->Record( + static_cast(runningDurUs), + {{"job_type", std::string(jobType)}}, + opentelemetry::context::Context{}); +#else + (void)jobType; + (void)runningDurUs; +#endif +} + +// ----------------------------------------------------------------- +// Observable gauge callbacks (Tasks 9.1, 9.2, 9.3, 9.6, 9.7) +// ----------------------------------------------------------------- + +#ifdef XRPL_ENABLE_TELEMETRY + +void +MetricsRegistry::registerAsyncGauges() +{ + // --- Task 9.2: Cache hit rate and size gauges --- + cacheHitRateGauge_ = + meter_->CreateDoubleObservableGauge("rippled_cache_metrics", "Cache hit rates and sizes"); + 
cacheHitRateGauge_->AddCallback( + [](opentelemetry::metrics::ObserverResult result, void* state) { + auto* self = static_cast(state); + auto& app = self->app_; + + try + { + // SLE cache hit rate (0.0 - 1.0). + auto sleRate = app.cachedSLEs().rate(); + opentelemetry::nostd::get>>(result) + ->Observe(sleRate, {{"metric", "SLE_hit_rate"}}); + + // Ledger cache hit rate. + // TaggedCache::getHitRate() returns 0-100; normalize to + // 0.0-1.0 so the Grafana panel using "percentunit" renders + // correctly. + auto ledgerRate = app.getLedgerMaster().getCacheHitRate() / 100.0; + opentelemetry::nostd::get>>(result) + ->Observe(ledgerRate, {{"metric", "ledger_hit_rate"}}); + + // AcceptedLedger cache hit rate (also 0-100 from + // TaggedCache; normalize to 0.0-1.0). + auto alRate = app.getAcceptedLedgerCache().getHitRate() / 100.0; + opentelemetry::nostd::get>>(result) + ->Observe(alRate, {{"metric", "AL_hit_rate"}}); + + // TreeNode cache size. + auto tnCacheSize = app.getNodeFamily().getTreeNodeCache()->getCacheSize(); + opentelemetry::nostd::get>>(result) + ->Observe( + static_cast(tnCacheSize), {{"metric", "treenode_cache_size"}}); + + // TreeNode track size. + auto tnTrackSize = app.getNodeFamily().getTreeNodeCache()->getTrackSize(); + opentelemetry::nostd::get>>(result) + ->Observe( + static_cast(tnTrackSize), {{"metric", "treenode_track_size"}}); + + // FullBelow cache size. + auto fbSize = app.getNodeFamily().getFullBelowCache()->size(); + opentelemetry::nostd::get>>(result) + ->Observe(static_cast(fbSize), {{"metric", "fullbelow_size"}}); + } + catch (...) // NOLINT(bugprone-empty-catch) + { + // Silently skip if services are not yet ready. 
+ } + }, + this); + + // --- Task 9.3: TxQ metrics gauges --- + txqGauge_ = + meter_->CreateDoubleObservableGauge("rippled_txq_metrics", "Transaction queue metrics"); + txqGauge_->AddCallback( + [](opentelemetry::metrics::ObserverResult result, void* state) { + auto* self = static_cast(state); + auto& app = self->app_; + + try + { + auto const metrics = app.getTxQ().getMetrics(*app.openLedger().current()); + + auto observe = [&](char const* name, double value) { + opentelemetry::nostd::get>>(result) + ->Observe(value, {{"metric", name}}); + }; + + observe("txq_count", static_cast(metrics.txCount)); + observe( + "txq_max_size", + metrics.txQMaxSize ? static_cast(*metrics.txQMaxSize) : 0.0); + observe("txq_in_ledger", static_cast(metrics.txInLedger)); + observe("txq_per_ledger", static_cast(metrics.txPerLedger)); + observe( + "txq_reference_fee_level", + static_cast(metrics.referenceFeeLevel.fee())); + observe( + "txq_min_processing_fee_level", + static_cast(metrics.minProcessingFeeLevel.fee())); + observe("txq_med_fee_level", static_cast(metrics.medFeeLevel.fee())); + observe( + "txq_open_ledger_fee_level", + static_cast(metrics.openLedgerFeeLevel.fee())); + } + catch (...) // NOLINT(bugprone-empty-catch) + { + // Silently skip if TxQ or OpenLedger are not yet ready. + } + }, + this); + + // --- Task 9.6: Counted object instance gauges --- + objectCountGauge_ = meter_->CreateInt64ObservableGauge( + "rippled_object_count", "Live instance counts for key internal object types"); + objectCountGauge_->AddCallback( + [](opentelemetry::metrics::ObserverResult result, void* /* state */) { + try + { + // Iterate through all CountedObject types via the linked + // list in CountedObjects. Every entry returned by + // getCounts(0) is reported; no per-type filter is applied. 
+ auto counts = CountedObjects::getInstance().getCounts(0); + for (auto const& [name, count] : counts) + { + opentelemetry::nostd::get>>(result) + ->Observe(static_cast(count), {{"type", name}}); + } + } + catch (...) // NOLINT(bugprone-empty-catch) + { + // Silently skip on error. + } + }, + this); + + // --- Task 9.7: Load factor breakdown gauges --- + loadFactorGauge_ = meter_->CreateDoubleObservableGauge( + "rippled_load_factor_metrics", "Fee load factor breakdown"); + loadFactorGauge_->AddCallback( + [](opentelemetry::metrics::ObserverResult result, void* state) { + auto* self = static_cast(state); + auto& app = self->app_; + + try + { + auto& feeTrack = app.getFeeTrack(); + auto const loadBase = static_cast(feeTrack.getLoadBase()); + + auto observe = [&](char const* name, double value) { + opentelemetry::nostd::get>>(result) + ->Observe(value, {{"metric", name}}); + }; + + // Combined load factor (server component). + observe( + "load_factor_server", static_cast(feeTrack.getLoadFactor()) / loadBase); + + // Individual factor components. + observe( + "load_factor_local", static_cast(feeTrack.getLocalFee()) / loadBase); + observe("load_factor_net", static_cast(feeTrack.getRemoteFee()) / loadBase); + observe( + "load_factor_cluster", + static_cast(feeTrack.getClusterFee()) / loadBase); + + // Fee escalation factors from TxQ. + auto const metrics = app.getTxQ().getMetrics(*app.openLedger().current()); + auto refLevel = static_cast(metrics.referenceFeeLevel.fee()); + if (refLevel > 0) + { + observe( + "load_factor_fee_escalation", + static_cast(metrics.openLedgerFeeLevel.fee()) / refLevel); + observe( + "load_factor_fee_queue", + static_cast(metrics.minProcessingFeeLevel.fee()) / refLevel); + } + + // Combined load factor (max of server and fee escalation). 
+ auto const loadFactorServer = feeTrack.getLoadFactor(); + auto const loadBaseServer = feeTrack.getLoadBase(); + double combined = static_cast(loadFactorServer) / loadBase; + if (refLevel > 0) + { + double feeEscalation = static_cast(metrics.openLedgerFeeLevel.fee()) * + loadBaseServer / refLevel; + if (feeEscalation > static_cast(loadFactorServer)) + { + combined = feeEscalation / loadBase; + } + } + observe("load_factor", combined); + } + catch (...) // NOLINT(bugprone-empty-catch) + { + // Silently skip if services are not yet ready. + } + }, + this); + + // --- Task 9.1: NodeStore I/O gauges --- + // The cumulative counters (reads, writes, bytes) are also exposed here + // as observable gauges. This avoids adding an xrpld dependency into the + // libxrpl nodestore code — the MetricsRegistry reads the existing atomic + // counters from Database via its public accessors. + nodeStoreGauge_ = meter_->CreateInt64ObservableGauge( + "rippled_nodestore_state", "NodeStore I/O counters, queue depth, and write load"); + nodeStoreGauge_->AddCallback( + [](opentelemetry::metrics::ObserverResult result, void* state) { + auto* self = static_cast(state); + auto& app = self->app_; + + try + { + auto& db = app.getNodeStore(); + + auto observe = [&](char const* name, int64_t value) { + opentelemetry::nostd::get>>(result) + ->Observe(value, {{"metric", name}}); + }; + + // Cumulative counters (monotonically increasing). + observe("node_reads_total", static_cast(db.getFetchTotalCount())); + observe("node_reads_hit", static_cast(db.getFetchHitCount())); + observe("node_writes", static_cast(db.getStoreCount())); + observe("node_written_bytes", static_cast(db.getStoreSize())); + observe("node_read_bytes", static_cast(db.getFetchSize())); + + // Write load score (instantaneous). + observe("write_load", static_cast(db.getWriteLoad())); + + // Read queue depth (instantaneous). 
+ Json::Value obj(Json::objectValue); + db.getCountsJson(obj); + if (obj.isMember("read_queue")) + { + observe("read_queue", static_cast(obj["read_queue"].asUInt())); + } + } + catch (...) // NOLINT(bugprone-empty-catch) + { + // Silently skip on error. + } + }, + this); +} + +#endif // XRPL_ENABLE_TELEMETRY + +} // namespace telemetry +} // namespace xrpl diff --git a/src/xrpld/telemetry/MetricsRegistry.h b/src/xrpld/telemetry/MetricsRegistry.h new file mode 100644 index 0000000000..e6d39892b1 --- /dev/null +++ b/src/xrpld/telemetry/MetricsRegistry.h @@ -0,0 +1,284 @@ +#pragma once + +/** Central OTel Metrics Registry for rippled. + + Owns all OpenTelemetry metric instruments (counters, histograms, + observable gauges) that are NOT already covered by the beast::insight + StatsD pipeline. The instruments are created once at startup and polled + by the OTel PeriodicExportingMetricReader at a fixed interval + (10 s, hard-coded in start()). + + When XRPL_ENABLE_TELEMETRY is **not** defined, this class compiles to a + lightweight no-op: start(), stop(), and record*() do nothing. 
+ + Dependency / ownership diagram (ASCII): + + Application + | + +-- MetricsRegistry (unique_ptr, created in setup(), started/stopped with telemetry) + | + +-- OTel MeterProvider (owns reader + exporter) + | | + | +-- PeriodicExportingMetricReader + | +-- OtlpHttpMetricExporter + | + +-- Counters / Histograms (synchronous instruments) + | +-- rippled_rpc_method_started_total + | +-- rippled_rpc_method_finished_total + | +-- rippled_rpc_method_errored_total + | +-- rippled_rpc_method_duration_us (Histogram) + | +-- rippled_job_queued_total + | +-- rippled_job_started_total + | +-- rippled_job_finished_total + | +-- rippled_job_queued_duration_us (Histogram) + | +-- rippled_job_running_duration_us (Histogram) + | + +-- Observable Gauges (async callbacks, polled by reader) + +-- Cache hit rates (SLE, ledger, AL) + +-- TreeNode / FullBelow sizes + +-- TxQ metrics + +-- CountedObject counts + +-- Load factor breakdown + +-- NodeStore I/O gauges + + Control-flow for async gauges: + + PeriodicExportingMetricReader (background thread, 10 s tick) + | + v + OTel SDK invokes registered ObservableGauge callbacks + | + v + Each callback reads current value from Application services + (e.g. app.getTxQ().getMetrics(), app.getFeeTrack().getLoadFactor()) + | + v + Result set is exported via OTLP/HTTP to the collector + + Control-flow for synchronous instruments: + + PerfLogImp::rpcStart/rpcEnd/jobQueue/jobStart/jobFinish + | + v + MetricsRegistry::recordRpc*(method, ...) / recordJob*(type, ...) 
+ | + v + OTel Counter::Add() or Histogram::Record() + | + v + Periodically flushed by the MetricReader + + Example usage: + + @code + // In Application::setup(), after telemetry_ is created: + metricsRegistry_ = std::make_unique( + telemetry_->isEnabled(), app, journal); + metricsRegistry_->start(setup.exporterEndpoint); + + // In PerfLogImp::rpcStart(): + if (auto* mr = app_.getMetricsRegistry()) + mr->recordRpcStarted("server_info"); + + // In PerfLogImp::rpcEnd(): + if (auto* mr = app_.getMetricsRegistry()) + { + mr->recordRpcFinished("server_info", durationUs); + // or: mr->recordRpcErrored("server_info", durationUs); + } + + // In PerfLogImp::jobQueue(): + if (auto* mr = app_.getMetricsRegistry()) + mr->recordJobQueued("ledgerData"); + + // Shutdown: + metricsRegistry_->stop(); + @endcode + + Caveats: + - The MetricsRegistry must be created AFTER the Telemetry object because + it reads isEnabled() to decide whether to initialize the OTel SDK. + - Observable gauge callbacks capture a reference to the Application; the + Application must outlive the MetricsRegistry (guaranteed because + MetricsRegistry is stopped before Application teardown). + - A new CountedObject type needs no registration here: the object_count + callback reports every entry returned by CountedObjects::getCounts(0). + - Adding a new synchronous instrument requires updating both the header + and the .cpp, then calling the new record*() method from the + instrumentation site. +*/ + +#include + +#include +#include +#include +#include + +#ifdef XRPL_ENABLE_TELEMETRY +#include +#include +#include +#include +#endif + +namespace xrpl { + +class ServiceRegistry; + +namespace telemetry { + +class MetricsRegistry +{ +public: + /** Construct a MetricsRegistry. + + @param enabled Whether OTel metric export is active. When false, + all methods become no-ops. + @param app Reference to the ServiceRegistry (Application) for + reading current metric values in gauge callbacks. 
+ @param journal Journal for log output. + */ + MetricsRegistry(bool enabled, ServiceRegistry& app, beast::Journal journal); + + ~MetricsRegistry(); + + /// Non-copyable, non-movable. + MetricsRegistry(MetricsRegistry const&) = delete; + MetricsRegistry& + operator=(MetricsRegistry const&) = delete; + + /** Initialize the OTel metrics pipeline and register all instruments. + + @param endpoint OTLP/HTTP endpoint URL for metric export + (e.g. "http://localhost:4318/v1/metrics"). + @param instanceId Value for the service.instance.id resource + attribute. When non-empty, Prometheus metrics + carry an exported_instance label for per-node + filtering. + */ + void + start(std::string const& endpoint, std::string const& instanceId = {}); + + /** Flush pending metrics and shut down the pipeline. */ + void + stop(); + + /** @return true if the registry is actively exporting metrics. */ + bool + isEnabled() const noexcept + { + return enabled_; + } + + // ----------------------------------------------------------------- + // Synchronous instrument recording (called from PerfLog hot paths) + // ----------------------------------------------------------------- + + /** Record an RPC method call start. + @param method The RPC method name (e.g. "server_info"). + */ + void + recordRpcStarted(std::string_view method); + + /** Record an RPC method call completion. + @param method The RPC method name. + @param durationUs Execution time in microseconds. + */ + void + recordRpcFinished(std::string_view method, std::int64_t durationUs); + + /** Record an RPC method call error. + @param method The RPC method name. + @param durationUs Execution time in microseconds. + */ + void + recordRpcErrored(std::string_view method, std::int64_t durationUs); + + /** Record a job enqueued event. + @param jobType The job type name (e.g. "ledgerData"). + */ + void + recordJobQueued(std::string_view jobType); + + /** Record a job start event. + @param jobType The job type name. 
+ @param queuedDurUs Time the job spent waiting in the queue (us). + */ + void + recordJobStarted(std::string_view jobType, std::int64_t queuedDurUs); + + /** Record a job finish event. + @param jobType The job type name. + @param runningDurUs Execution time in microseconds. + */ + void + recordJobFinished(std::string_view jobType, std::int64_t runningDurUs); + +private: + /// Master enable flag; when false all methods are no-ops. + bool const enabled_; + + /// Reference to Application services for gauge callbacks. + ServiceRegistry& app_; + + /// Journal for logging. + beast::Journal const journal_; + +#ifdef XRPL_ENABLE_TELEMETRY + /// The SDK MeterProvider that owns the export pipeline. + std::shared_ptr provider_; + + /// The Meter used to create all instruments. + opentelemetry::nostd::shared_ptr meter_; + + // --- Synchronous instruments (RPC) --- + /// Counter: rpc_method_started_total{method=""} + opentelemetry::nostd::unique_ptr> rpcStartedCounter_; + /// Counter: rpc_method_finished_total{method=""} + opentelemetry::nostd::unique_ptr> rpcFinishedCounter_; + /// Counter: rpc_method_errored_total{method=""} + opentelemetry::nostd::unique_ptr> rpcErroredCounter_; + /// Histogram: rpc_method_duration_us{method=""} + opentelemetry::nostd::unique_ptr> + rpcDurationHistogram_; + + // --- Synchronous instruments (Job Queue) --- + /// Counter: job_queued_total{job_type=""} + opentelemetry::nostd::unique_ptr> jobQueuedCounter_; + /// Counter: job_started_total{job_type=""} + opentelemetry::nostd::unique_ptr> jobStartedCounter_; + /// Counter: job_finished_total{job_type=""} + opentelemetry::nostd::unique_ptr> jobFinishedCounter_; + /// Histogram: job_queued_duration_us{job_type=""} + opentelemetry::nostd::unique_ptr> + jobQueuedDurationHistogram_; + /// Histogram: job_running_duration_us{job_type=""} + opentelemetry::nostd::unique_ptr> + jobRunningDurationHistogram_; + + // --- Observable gauges (registered via callbacks) --- + // Handles are stored so we can 
keep the gauges alive; stop() never calls RemoveCallback. + /// Observable gauges for cache hit rates and sizes. + opentelemetry::nostd::shared_ptr + cacheHitRateGauge_; + /// Observable gauges for TxQ metrics. + opentelemetry::nostd::shared_ptr txqGauge_; + /// Observable gauges for counted object instances. + opentelemetry::nostd::shared_ptr + objectCountGauge_; + /// Observable gauges for load factor breakdown. + opentelemetry::nostd::shared_ptr loadFactorGauge_; + /// Observable gauges for NodeStore I/O counters, write_load, and read_queue. + opentelemetry::nostd::shared_ptr nodeStoreGauge_; + + /** Register all observable gauge callbacks with the OTel SDK. + Called once during start(). + */ + void + registerAsyncGauges(); +#endif // XRPL_ENABLE_TELEMETRY +}; + +} // namespace telemetry +} // namespace xrpl