From 360ecbdb446a2e35b4252bf2e535a81048ceb24c Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Fri, 24 Apr 2026 21:43:17 +0100 Subject: [PATCH] feat(telemetry): add Phase 5 documentation, deployment configs, and integration tests Add the observability stack deployment infrastructure and integration test framework for verifying end-to-end trace export. - Add Grafana dashboards: RPC performance, transaction overview, consensus health (pre-provisioned via dashboards.yaml) - Add Prometheus config for spanmetrics collection from OTel Collector - Update OTel Collector config with spanmetrics connector and prometheus exporter for RED metrics - Add docker-compose services: prometheus, dashboard provisioning - Add integration-test.sh with Tempo API-based span verification (replaces previous Jaeger-based approach) - Add TESTING.md with step-by-step deployment and verification guide - Add telemetry-runbook.md for production operations reference - Add xrpld-telemetry.cfg sample configuration - Add toDisplayString() for ConsensusMode (human-readable span values) - Update Phase 2/3 task lists with known issues sections - Add Phase 5 integration test task list - Add TraceContext protobuf fields for future relay propagation - Wire telemetry lifecycle (setServiceInstanceId/start/stop) in Application.cpp Co-Authored-By: Claude Opus 4.6 (1M context) --- OpenTelemetryPlan/08-appendix.md | 17 +- OpenTelemetryPlan/Phase2_taskList.md | 33 ++ OpenTelemetryPlan/Phase3_taskList.md | 25 + .../Phase5_IntegrationTest_taskList.md | 221 +++++++ cspell.config.yaml | 12 +- docker/telemetry/.gitignore | 2 + docker/telemetry/TESTING.md | 512 ++++++++++++++++ docker/telemetry/docker-compose.yml | 16 +- .../grafana/dashboards/consensus-health.json | 244 ++++++++ .../grafana/dashboards/rpc-performance.json | 189 ++++++ .../dashboards/transaction-overview.json | 172 ++++++ .../provisioning/dashboards/dashboards.yaml | 12 + 
.../provisioning/datasources/prometheus.yaml | 10 + .../provisioning/datasources/tempo.yaml | 6 +- docker/telemetry/integration-test.sh | 558 ++++++++++++++++++ docker/telemetry/otel-collector-config.yaml | 31 +- docker/telemetry/prometheus.yml | 9 + docker/telemetry/xrpld-telemetry.cfg | 60 ++ docs/telemetry-runbook.md | 232 ++++++++ include/xrpl/proto/xrpl.proto | 4 + .../xrpl/telemetry/TraceContextPropagator.h | 7 + src/libxrpl/telemetry/Telemetry.cpp | 7 + src/xrpld/app/consensus/RCLConsensus.cpp | 8 +- src/xrpld/app/main/Application.cpp | 4 + src/xrpld/consensus/ConsensusTypes.h | 20 + 25 files changed, 2387 insertions(+), 24 deletions(-) create mode 100644 OpenTelemetryPlan/Phase5_IntegrationTest_taskList.md create mode 100644 docker/telemetry/.gitignore create mode 100644 docker/telemetry/TESTING.md create mode 100644 docker/telemetry/grafana/dashboards/consensus-health.json create mode 100644 docker/telemetry/grafana/dashboards/rpc-performance.json create mode 100644 docker/telemetry/grafana/dashboards/transaction-overview.json create mode 100644 docker/telemetry/grafana/provisioning/dashboards/dashboards.yaml create mode 100644 docker/telemetry/grafana/provisioning/datasources/prometheus.yaml create mode 100755 docker/telemetry/integration-test.sh create mode 100644 docker/telemetry/prometheus.yml create mode 100644 docker/telemetry/xrpld-telemetry.cfg create mode 100644 docs/telemetry-runbook.md diff --git a/OpenTelemetryPlan/08-appendix.md b/OpenTelemetryPlan/08-appendix.md index f7e014d215..742d8a9bf5 100644 --- a/OpenTelemetryPlan/08-appendix.md +++ b/OpenTelemetryPlan/08-appendix.md @@ -186,14 +186,15 @@ flowchart TB ### Task Lists -| Document | Description | -| ------------------------------------------ | --------------------------------------------------- | -| [POC_taskList.md](./POC_taskList.md) | Proof-of-concept telemetry integration | -| [Phase2_taskList.md](./Phase2_taskList.md) | RPC layer trace instrumentation | -| 
[Phase3_taskList.md](./Phase3_taskList.md) | Peer overlay & consensus tracing | -| [Phase4_taskList.md](./Phase4_taskList.md) | Transaction lifecycle tracing | -| [Phase5_taskList.md](./Phase5_taskList.md) | Ledger processing & advanced tracing | -| [presentation.md](./presentation.md) | Presentation slides for OpenTelemetry plan overview | +| Document | Description | +| -------------------------------------------------------------------------- | --------------------------------------------------- | +| [POC_taskList.md](./POC_taskList.md) | Proof-of-concept telemetry integration | +| [Phase2_taskList.md](./Phase2_taskList.md) | RPC layer trace instrumentation | +| [Phase3_taskList.md](./Phase3_taskList.md) | Peer overlay & consensus tracing | +| [Phase4_taskList.md](./Phase4_taskList.md) | Transaction lifecycle tracing | +| [Phase5_taskList.md](./Phase5_taskList.md) | Ledger processing & advanced tracing | +| [Phase5_IntegrationTest_taskList.md](./Phase5_IntegrationTest_taskList.md) | Observability stack integration tests | +| [presentation.md](./presentation.md) | Presentation slides for OpenTelemetry plan overview | --- diff --git a/OpenTelemetryPlan/Phase2_taskList.md b/OpenTelemetryPlan/Phase2_taskList.md index 249be880ff..6e7da16a6a 100644 --- a/OpenTelemetryPlan/Phase2_taskList.md +++ b/OpenTelemetryPlan/Phase2_taskList.md @@ -204,3 +204,36 @@ This surfaces all RPCs served during a blocked period — critical for post-inci **Delivered in this branch**: Tasks 2.4, 2.7, 2.8, 2.9. **Deferred with rationale**: Tasks 2.1 (→Phase 3), 2.5 (low priority). **Superseded**: Task 2.2 (Phase 1c SpanGuard factory covers this). + +--- + +## Known Issues / Future Work + +### Thread safety of TelemetryImpl::stop() vs startSpan() + +`TelemetryImpl::stop()` resets `sdkProvider_` (a `std::shared_ptr`) without +synchronization. `getTracer()` reads the same member from RPC handler threads. +This is a data race if any thread calls `startSpan()` concurrently with `stop()`. 
+ +**Current mitigation**: `Application::stop()` shuts down `serverHandler_`, +`overlay_`, and `jobQueue_` before calling `telemetry_->stop()`, so no callers +remain. See comments in `Telemetry.cpp:stop()` and `Application.cpp`. + +**TODO**: Add an `std::atomic<bool> stopped_` flag checked in `getTracer()` to +make this robust against future shutdown order changes. + +### Macro incompatibility: XRPL_TRACE_SPAN vs XRPL_TRACE_SET_ATTR + +`XRPL_TRACE_SPAN` and `XRPL_TRACE_SPAN_KIND` declare `_xrpl_guard_` as a bare +`SpanGuard`, but `XRPL_TRACE_SET_ATTR` and `XRPL_TRACE_EXCEPTION` call +`_xrpl_guard_.has_value()` which requires `std::optional`. Using +`XRPL_TRACE_SPAN` followed by `XRPL_TRACE_SET_ATTR` in the same scope would +fail to compile. + +**Current mitigation**: No call site currently uses `XRPL_TRACE_SPAN` — all +production code uses the conditional macros (`XRPL_TRACE_RPC`, `XRPL_TRACE_TX`, +etc.) which correctly wrap the guard in `std::optional`. + +**TODO**: Either make `XRPL_TRACE_SPAN`/`XRPL_TRACE_SPAN_KIND` also wrap in +`std::optional`, or document that `XRPL_TRACE_SET_ATTR` is only compatible with +the conditional macros. diff --git a/OpenTelemetryPlan/Phase3_taskList.md b/OpenTelemetryPlan/Phase3_taskList.md index 02efb4f442..4e95ee5661 100644 --- a/OpenTelemetryPlan/Phase3_taskList.md +++ b/OpenTelemetryPlan/Phase3_taskList.md @@ -443,3 +443,28 @@ This gives the best of both worlds: guaranteed cross-node correlation via determ - [ ] <5% overhead on transaction throughput - [ ] Deterministic trace_id: same trace_id for same tx across all nodes - [ ] Protobuf span_id propagation preserves parent-child ordering when available + +--- + +## Known Issues / Future Work + +### Propagation utilities not yet wired into P2P flow + +`extractFromProtobuf()` and `injectToProtobuf()` in `TraceContextPropagator.h` +are implemented and tested but not called from production code.
To enable +cross-node distributed traces: + +- Call `injectToProtobuf()` in `PeerImp` when sending `TMTransaction` / + `TMProposeSet` messages +- Call `extractFromProtobuf()` in the corresponding message handlers to + reconstruct the parent span context, then pass it to `startSpan()` as the + parent + +This was deferred to validate single-node tracing performance first. + +### Unused trace_state proto field + +The `TraceContext.trace_state` field (field 4) in `xrpl.proto` is reserved for +W3C `tracestate` vendor-specific key-value pairs but is not read or written by +`TraceContextPropagator`. Wire it when cross-vendor trace propagation is needed. +No wire cost since proto `optional` fields are zero-cost when absent. diff --git a/OpenTelemetryPlan/Phase5_IntegrationTest_taskList.md b/OpenTelemetryPlan/Phase5_IntegrationTest_taskList.md new file mode 100644 index 0000000000..28f45afc75 --- /dev/null +++ b/OpenTelemetryPlan/Phase5_IntegrationTest_taskList.md @@ -0,0 +1,221 @@ +# Phase 5: Integration Test Task List + +> **Goal**: End-to-end verification of the complete telemetry pipeline using a +> 6-node consensus network. Proves that RPC, transaction, and consensus spans +> flow through the observability stack (otel-collector, Tempo, Prometheus, +> Grafana) under realistic conditions. +> +> **Scope**: Integration test script, manual testing plan, 6-node local network +> setup, Tempo/Prometheus/Grafana verification. 
+> +> **Branch**: `pratik/otel-phase5-docs-deployment` + +### Related Plan Documents + +| Document | Relevance | +| ---------------------------------------------------------------- | ------------------------------------------ | +| [07-observability-backends.md](./07-observability-backends.md) | Tempo, Grafana, Prometheus setup | +| [05-configuration-reference.md](./05-configuration-reference.md) | Collector config, Docker Compose | +| [06-implementation-phases.md](./06-implementation-phases.md) | Phase 5 tasks, definition of done | +| [Phase5_taskList.md](./Phase5_taskList.md) | Phase 5 main task list (5.6 = integration) | + +--- + +## Task IT.1: Create Integration Test Script + +**Objective**: Automated bash script that stands up a 6-node xrpld network +with telemetry, exercises all span categories, and verifies data in +Tempo/Prometheus. + +**What to do**: + +- Create `docker/telemetry/integration-test.sh`: + - Prerequisites check (docker, xrpld binary, curl, jq) + - Start observability stack via `docker compose` + - Generate 6 validator key pairs via temp standalone xrpld + - Generate 6 node configs + shared `validators.txt` + - Start 6 xrpld nodes in consensus mode (`--start`, no `-a`) + - Wait for all nodes to reach `"proposing"` state (120s timeout) + +**Key new file**: `docker/telemetry/integration-test.sh` + +**Verification**: + +- [ ] Script starts without errors +- [ ] All 6 nodes reach "proposing" state +- [ ] Observability stack is healthy (otel-collector, Tempo, Prometheus, Grafana) + +--- + +## Task IT.2: RPC Span Verification (Phase 2) + +**Objective**: Verify RPC spans flow through the telemetry pipeline. 
+ +**What to do**: + +- Send `server_info`, `server_state`, `ledger` RPCs to node1 (port 5005) +- Wait for batch export (5s) +- Query Tempo API for: + - `rpc.request` spans (ServerHandler::onRequest) + - `rpc.process` spans (ServerHandler::processRequest) + - `rpc.command.server_info` spans (callMethod) + - `rpc.command.server_state` spans (callMethod) + - `rpc.command.ledger` spans (callMethod) +- Verify `xrpl.rpc.command` attribute present on `rpc.command.*` spans + +**Verification**: + +- [ ] Tempo shows `rpc.request` traces +- [ ] Tempo shows `rpc.process` traces +- [ ] Tempo shows `rpc.command.*` traces with correct attributes + +--- + +## Task IT.3: Transaction Span Verification (Phase 3) + +**Objective**: Verify transaction spans flow through the telemetry pipeline. + +**What to do**: + +- Get genesis account sequence via `account_info` RPC +- Submit Payment transaction using genesis seed (`snoPBrXtMeMyMHUVTgbuqAfg1SUTb`) +- Wait for consensus inclusion (10s) +- Query Tempo API for: + - `tx.process` spans (NetworkOPsImp::processTransaction) on submitting node + - `tx.receive` spans (PeerImp::handleTransaction) on peer nodes +- Verify `xrpl.tx.hash` attribute on `tx.process` spans +- Verify `xrpl.peer.id` attribute on `tx.receive` spans + +**Verification**: + +- [ ] Tempo shows `tx.process` traces with `xrpl.tx.hash` +- [ ] Tempo shows `tx.receive` traces with `xrpl.peer.id` + +--- + +## Task IT.4: Consensus Span Verification (Phase 4) + +**Objective**: Verify consensus spans flow through the telemetry pipeline. 
+ +**What to do**: + +- Consensus runs automatically in 6-node network +- Query Tempo API for: + - `consensus.proposal.send` (Adaptor::propose) + - `consensus.ledger_close` (Adaptor::onClose) + - `consensus.accept` (Adaptor::onAccept) + - `consensus.validation.send` (Adaptor::validate) +- Verify attributes: + - `xrpl.consensus.mode` on `consensus.ledger_close` + - `xrpl.consensus.proposers` on `consensus.accept` + - `xrpl.consensus.ledger.seq` on `consensus.validation.send` + +**Verification**: + +- [ ] Tempo shows `consensus.ledger_close` traces with `xrpl.consensus.mode` +- [ ] Tempo shows `consensus.accept` traces with `xrpl.consensus.proposers` +- [ ] Tempo shows `consensus.proposal.send` traces +- [ ] Tempo shows `consensus.validation.send` traces + +--- + +## Task IT.5: Spanmetrics Verification (Phase 5) + +**Objective**: Verify spanmetrics connector derives RED metrics from spans. + +**What to do**: + +- Query Prometheus for `traces_span_metrics_calls_total` +- Query Prometheus for `traces_span_metrics_duration_milliseconds_count` +- Verify Grafana loads at `http://localhost:3000` + +**Verification**: + +- [ ] Prometheus returns non-empty results for `traces_span_metrics_calls_total` +- [ ] Prometheus returns non-empty results for duration histogram +- [ ] Grafana UI accessible with dashboards visible + +--- + +## Task IT.6: Manual Testing Plan + +**Objective**: Document how to run tests manually for future reference. 
+ +**What to do**: + +- Create `docker/telemetry/TESTING.md` with: + - Prerequisites section + - Single-node standalone test (quick verification) + - 6-node consensus test (full verification) + - Expected span catalog (all 12 span names with attributes) + - Verification queries (Tempo API, Prometheus API) + - Troubleshooting guide + +**Key new file**: `docker/telemetry/TESTING.md` + +**Verification**: + +- [ ] Document covers both single-node and multi-node testing +- [ ] All 12 span names documented with source file and attributes +- [ ] Troubleshooting section covers common failure modes + +--- + +## Task IT.7: Run and Verify + +**Objective**: Execute the integration test and validate results. + +**What to do**: + +- Run `docker/telemetry/integration-test.sh` locally +- Debug any failures +- Leave stack running for manual verification +- Share URLs: + - Tempo: `http://localhost:3200` + - Grafana: `http://localhost:3000` + - Prometheus: `http://localhost:9090` + +**Verification**: + +- [ ] Script completes with all checks passing +- [ ] Tempo UI shows rippled service with all expected span names +- [ ] Grafana dashboards load and show data + +--- + +## Task IT.8: Commit + +**Objective**: Commit all new files to Phase 5 branch. 
+ +**What to do**: + +- Run `pcc` (pre-commit checks) +- Commit 3 new files to `pratik/otel-phase5-docs-deployment` + +**Verification**: + +- [ ] `pcc` passes +- [ ] Commit created on Phase 5 branch + +--- + +## Summary + +| Task | Description | New Files | Depends On | +| ---- | ----------------------------- | --------- | ---------- | +| IT.1 | Integration test script | 1 | Phase 5 | +| IT.2 | RPC span verification | 0 | IT.1 | +| IT.3 | Transaction span verification | 0 | IT.1 | +| IT.4 | Consensus span verification | 0 | IT.1 | +| IT.5 | Spanmetrics verification | 0 | IT.1 | +| IT.6 | Manual testing plan | 1 | -- | +| IT.7 | Run and verify | 0 | IT.1-IT.6 | +| IT.8 | Commit | 0 | IT.7 | + +**Exit Criteria**: + +- [ ] All 6 xrpld nodes reach "proposing" state +- [ ] All 11 expected span names visible in Tempo +- [ ] Spanmetrics available in Prometheus +- [ ] Grafana dashboards show data +- [ ] Manual testing plan document complete diff --git a/cspell.config.yaml b/cspell.config.yaml index b9af25a112..574a7d1426 100644 --- a/cspell.config.yaml +++ b/cspell.config.yaml @@ -102,6 +102,7 @@ words: - dxrpl - enabled - endmacro + - EOCFG - exceptioned - Falco - fcontext @@ -203,6 +204,8 @@ words: - permdex - perminute - permissioned + - pgrep + - pkill - pimpl - pointee - populator @@ -222,6 +225,7 @@ words: - Raphson - reparent - replayer + - reqps - rerere - retriable - RIPD @@ -319,6 +323,10 @@ words: - xchain - ximinez - EXPECT_STREQ + - Gantt + - gantt + - otelc + - traceql - XMACRO - xrpkuwait - xrpl @@ -327,10 +335,6 @@ words: - xxhash - xxhasher - xychart - - otelc - zpages - - traceql - - Gantt - - gantt - pratik - dedup diff --git a/docker/telemetry/.gitignore b/docker/telemetry/.gitignore new file mode 100644 index 0000000000..bba4819d66 --- /dev/null +++ b/docker/telemetry/.gitignore @@ -0,0 +1,2 @@ +# Runtime data generated by xrpld and telemetry stack +data/ diff --git a/docker/telemetry/TESTING.md b/docker/telemetry/TESTING.md new file mode 100644 index 
0000000000..be36f91d32 --- /dev/null +++ b/docker/telemetry/TESTING.md @@ -0,0 +1,512 @@ +# OpenTelemetry Integration Testing Guide + +This document describes how to verify the rippled OpenTelemetry telemetry +pipeline end-to-end, from span generation through the observability stack +(otel-collector, Tempo, Prometheus, Grafana). + +--- + +## Prerequisites + +### Build xrpld with telemetry + +```bash +conan install . --build=missing -o telemetry=True +cmake --preset default -Dtelemetry=ON +cmake --build --preset default --target xrpld +``` + +The binary is at `.build/xrpld`. + +### Required tools + +- **Docker** with `docker compose` (v2) +- **curl** +- **jq** (JSON processor) + +### Verify binary + +```bash +.build/xrpld --version +``` + +--- + +## Test 1: Single-Node Standalone (Quick Verification) + +This test verifies RPC and transaction spans in standalone mode. Consensus +spans will not fire because standalone mode does not run consensus. + +### Step 1: Start the observability stack + +```bash +docker compose -f docker/telemetry/docker-compose.yml up -d +``` + +Wait for services to be ready: + +```bash +# otel-collector health +curl -sf http://localhost:13133/ && echo "collector ready" + +# Tempo readiness +curl -sf http://localhost:3200/ready > /dev/null && echo "tempo ready" +``` + +### Step 2: Start xrpld in standalone mode + +```bash +.build/xrpld --conf docker/telemetry/xrpld-telemetry.cfg -a --start +``` + +Wait a few seconds for the node to initialize. 
+ +### Step 3: Exercise RPC spans + +```bash +# server_info +curl -s http://localhost:5005 \ + -d '{"method":"server_info"}' | jq .result.info.server_state + +# server_state +curl -s http://localhost:5005 \ + -d '{"method":"server_state"}' | jq .result.state.server_state + +# ledger +curl -s http://localhost:5005 \ + -d '{"method":"ledger","params":[{"ledger_index":"current"}]}' \ + | jq .result.ledger_current_index +``` + +### Step 4: Submit a transaction + +Close the ledger first (required in standalone mode): + +```bash +curl -s http://localhost:5005 -d '{"method":"ledger_accept"}' +``` + +Submit a Payment from the genesis account: + +```bash +curl -s http://localhost:5005 -d '{ + "method": "submit", + "params": [{ + "secret": "snoPBrXtMeMyMHUVTgbuqAfg1SUTb", + "tx_json": { + "TransactionType": "Payment", + "Account": "rHb9CJAWyB4rj91VRWn96DkukG4bwdtyTh", + "Destination": "rPMh7Pi9ct699iZUTWzJaUMR1o42VEfGqF", + "Amount": "10000000" + } + }] +}' | jq .result.engine_result +``` + +Expected result: `"tesSUCCESS"`. 
+ +Close the ledger again to finalize: + +```bash +curl -s http://localhost:5005 -d '{"method":"ledger_accept"}' +``` + +### Step 5: Verify traces in Tempo + +Wait 5 seconds for the batch export, then: + +```bash +TEMPO="http://localhost:3200" + +# Check rippled service is registered +curl -s "$TEMPO/api/v2/search/tag/resource.service.name/values" | jq '.tagValues[].value' + +# Check RPC spans +curl -s "$TEMPO/api/search" \ + --data-urlencode 'q={resource.service.name="rippled" && name="rpc.request"}' \ + --data-urlencode 'limit=5' | jq '.traces | length' + +curl -s "$TEMPO/api/search" \ + --data-urlencode 'q={resource.service.name="rippled" && name="rpc.process"}' \ + --data-urlencode 'limit=5' | jq '.traces | length' + +curl -s "$TEMPO/api/search" \ + --data-urlencode 'q={resource.service.name="rippled" && name="rpc.command.server_info"}' \ + --data-urlencode 'limit=5' | jq '.traces | length' + +# Check transaction spans +curl -s "$TEMPO/api/search" \ + --data-urlencode 'q={resource.service.name="rippled" && name="tx.process"}' \ + --data-urlencode 'limit=5' | jq '.traces | length' +``` + +Or open Grafana Explore with Tempo datasource: http://localhost:3000 + +### Step 6: Teardown + +```bash +# Kill xrpld (Ctrl+C or) +kill $(pgrep -f 'xrpld.*xrpld-telemetry') + +# Stop observability stack +docker compose -f docker/telemetry/docker-compose.yml down + +# Clean xrpld data +rm -rf data/ +``` + +### Expected spans (standalone mode) + +| Span Name | Expected | Notes | +| --------------------------- | -------- | ----------------------------- | +| `rpc.request` | Yes | Every HTTP RPC call | +| `rpc.process` | Yes | Every RPC processing | +| `rpc.command.server_info` | Yes | server_info RPC | +| `rpc.command.server_state` | Yes | server_state RPC | +| `rpc.command.ledger` | Yes | ledger RPC | +| `rpc.command.submit` | Yes | submit RPC | +| `rpc.command.ledger_accept` | Yes | ledger_accept RPC | +| `tx.process` | Yes | Transaction submission | +| `tx.receive` | No | No 
peers in standalone | +| `consensus.*` | No | Consensus disabled standalone | + +--- + +## Test 2: 6-Node Consensus Network (Full Verification) + +This test verifies ALL span categories including consensus and peer +transaction relay, using a 6-node validator network. + +### Automated + +Run the integration test script: + +```bash +bash docker/telemetry/integration-test.sh +``` + +The script will: + +1. Start the observability stack +2. Generate 6 validator key pairs +3. Create config files for each node +4. Start all 6 nodes +5. Wait for consensus ("proposing" state) +6. Exercise RPC, submit transactions +7. Verify all span categories in Tempo +8. Verify spanmetrics in Prometheus +9. Print results and leave the stack running + +### Manual + +If you prefer to run the steps manually: + +#### Step 1: Start observability stack + +```bash +docker compose -f docker/telemetry/docker-compose.yml up -d +``` + +#### Step 2: Generate validator keys + +Start a temporary standalone xrpld: + +```bash +.build/xrpld --conf docker/telemetry/xrpld-telemetry.cfg -a --start & +TEMP_PID=$! +sleep 5 +``` + +Generate 6 key pairs: + +```bash +for i in $(seq 1 6); do + curl -s http://localhost:5005 \ + -d '{"method":"validation_create"}' | jq '.result' +done +``` + +Record the `validation_seed` and `validation_public_key` for each. +Kill the temporary node: + +```bash +kill $TEMP_PID +rm -rf data/ +``` + +#### Step 3: Create node configs + +For each node (1-6), create a config file. 
Template: + +```ini +[server] +port_rpc +port_peer + +[port_rpc] +port = {5004 + node_number} +ip = 127.0.0.1 +admin = 127.0.0.1 +protocol = http + +[port_peer] +port = {51234 + node_number} +ip = 0.0.0.0 +protocol = peer + +[node_db] +type=NuDB +path=/tmp/xrpld-integration/node{N}/nudb +online_delete=256 + +[database_path] +/tmp/xrpld-integration/node{N}/db + +[debug_logfile] +/tmp/xrpld-integration/node{N}/debug.log + +[validation_seed] +{seed from step 2} + +[validators_file] +/tmp/xrpld-integration/validators.txt + +[ips_fixed] +127.0.0.1 51235 +127.0.0.1 51236 +127.0.0.1 51237 +127.0.0.1 51238 +127.0.0.1 51239 +127.0.0.1 51240 + +[peer_private] +1 + +[telemetry] +enabled=1 +endpoint=http://localhost:4318/v1/traces +exporter=otlp_http +sampling_ratio=1.0 +batch_size=512 +batch_delay_ms=2000 +max_queue_size=2048 +trace_rpc=1 +trace_transactions=1 +trace_consensus=1 +trace_peer=0 +trace_ledger=1 + +[rpc_startup] +{ "command": "log_level", "severity": "warning" } + +[ssl_verify] +0 +``` + +#### Step 4: Create validators.txt + +```ini +[validators] +{public_key_1} +{public_key_2} +{public_key_3} +{public_key_4} +{public_key_5} +{public_key_6} +``` + +#### Step 5: Start all 6 nodes + +```bash +for i in $(seq 1 6); do + .build/xrpld --conf /tmp/xrpld-integration/node$i/xrpld.cfg --start & + echo $! 
> /tmp/xrpld-integration/node$i/xrpld.pid +done +``` + +#### Step 6: Wait for consensus + +Poll each node until `server_state` = `"proposing"`: + +```bash +for port in 5005 5006 5007 5008 5009 5010; do + while true; do + state=$(curl -s http://localhost:$port \ + -d '{"method":"server_info"}' \ + | jq -r '.result.info.server_state') + echo "Port $port: $state" + [ "$state" = "proposing" ] && break + sleep 5 + done +done +``` + +#### Step 7: Exercise RPC and submit transaction + +```bash +# RPC calls +curl -s http://localhost:5005 -d '{"method":"server_info"}' +curl -s http://localhost:5005 -d '{"method":"server_state"}' +curl -s http://localhost:5005 -d '{"method":"ledger","params":[{"ledger_index":"current"}]}' + +# Submit transaction +curl -s http://localhost:5005 -d '{ + "method": "submit", + "params": [{ + "secret": "snoPBrXtMeMyMHUVTgbuqAfg1SUTb", + "tx_json": { + "TransactionType": "Payment", + "Account": "rHb9CJAWyB4rj91VRWn96DkukG4bwdtyTh", + "Destination": "rPMh7Pi9ct699iZUTWzJaUMR1o42VEfGqF", + "Amount": "10000000" + } + }] +}' +``` + +Wait 15 seconds for consensus and batch export. + +#### Step 8: Verify in Tempo + +See the "Verification Queries" section below. 
+ +--- + +## Expected Span Catalog + +All production span names instrumented across Phases 2-4 (`rpc.command.<command>` is a template — it expands to one concrete span name per RPC command, e.g. `rpc.command.server_info`): + +| Span Name | Source File | Phase | Key Attributes | How to Trigger | +| --------------------------- | --------------------- | ----- | --------------------------------------------------------------------------------- | ------------------------- | +| `rpc.request` | ServerHandler.cpp:271 | 2 | -- | Any HTTP RPC call | +| `rpc.process` | ServerHandler.cpp:573 | 2 | -- | Any HTTP RPC call | +| `rpc.ws_message` | ServerHandler.cpp:384 | 2 | -- | WebSocket RPC message | +| `rpc.command.<command>` | RPCHandler.cpp:161 | 2 | `xrpl.rpc.command`, `xrpl.rpc.version`, `xrpl.rpc.role` | Any RPC command | +| `tx.process` | NetworkOPs.cpp:1227 | 3 | `xrpl.tx.hash`, `xrpl.tx.local`, `xrpl.tx.path` | Submit transaction | +| `tx.receive` | PeerImp.cpp:1273 | 3 | `xrpl.peer.id` | Peer relays transaction | +| `consensus.proposal.send` | RCLConsensus.cpp:177 | 4 | `xrpl.consensus.round` | Consensus proposing phase | +| `consensus.ledger_close` | RCLConsensus.cpp:282 | 4 | `xrpl.consensus.ledger.seq`, `xrpl.consensus.mode` | Ledger close event | +| `consensus.accept` | RCLConsensus.cpp:395 | 4 | `xrpl.consensus.proposers`, `xrpl.consensus.round_time_ms` | Ledger accepted | +| `consensus.validation.send` | RCLConsensus.cpp:753 | 4 | `xrpl.consensus.ledger.seq`, `xrpl.consensus.proposing` | Validation sent | +| `consensus.accept.apply` | RCLConsensus.cpp:453 | 4 | `xrpl.consensus.close_time`, `close_time_correct`, `close_resolution_ms`, `state` | Ledger apply + close time | + +--- + +## Verification Queries + +### Tempo API + +Base URL: `http://localhost:3200` + +```bash +TEMPO="http://localhost:3200" + +# List all services +curl -s "$TEMPO/api/v2/search/tag/resource.service.name/values" | jq '.tagValues[].value' + +# Query traces by operation +for op in "rpc.request" "rpc.process" \ + "rpc.command.server_info" "rpc.command.server_state" "rpc.command.ledger" \ + "tx.process" "tx.receive" \ + 
"consensus.proposal.send" "consensus.ledger_close" \ + "consensus.accept" "consensus.accept.apply" \ + "consensus.validation.send"; do + count=$(curl -s "$TEMPO/api/search" \ + --data-urlencode "q={resource.service.name=\"rippled\" && name=\"$op\"}" \ + --data-urlencode "limit=5" \ + | jq '.traces | length') + printf "%-35s %s traces\n" "$op" "$count" +done +``` + +### Prometheus API + +Base URL: `http://localhost:9090` + +```bash +PROM="http://localhost:9090" + +# Span call counts (from spanmetrics connector) +curl -s "$PROM/api/v1/query?query=traces_span_metrics_calls_total" \ + | jq '.data.result[] | {span: .metric.span_name, count: .value[1]}' + +# Latency histogram +curl -s "$PROM/api/v1/query?query=traces_span_metrics_duration_milliseconds_count" \ + | jq '.data.result[] | {span: .metric.span_name, count: .value[1]}' + +# RPC calls by command +curl -s "$PROM/api/v1/query?query=traces_span_metrics_calls_total{span_name=~\"rpc.command.*\"}" \ + | jq '.data.result[] | {command: .metric["xrpl.rpc.command"], count: .value[1]}' +``` + +### Grafana + +Open http://localhost:3000 (anonymous admin access enabled). + +Pre-configured dashboards: + +- **RPC Performance**: Request rates, latency percentiles by command +- **Transaction Overview**: Transaction processing rates and paths +- **Consensus Health**: Consensus round duration and proposer counts + +Pre-configured datasources: + +- **Tempo**: Trace data at `http://tempo:3200` +- **Prometheus**: Metrics at `http://prometheus:9090` + +--- + +## Troubleshooting + +### No traces in Tempo + +1. Check otel-collector logs: + ```bash + docker compose -f docker/telemetry/docker-compose.yml logs otel-collector + ``` +2. Verify xrpld telemetry config has `enabled=1` and correct endpoint +3. Check that otel-collector port 4318 is accessible: + ```bash + curl -sf http://localhost:4318 && echo "reachable" + ``` +4. Increase `batch_delay_ms` or decrease `batch_size` in xrpld config + +### Nodes not reaching "proposing" state + +1. 
Check that all peer ports (51235-51240) are not in use: + ```bash + for p in 51235 51236 51237 51238 51239 51240; do + ss -tlnp | grep ":$p " && echo "port $p in use" + done + ``` +2. Verify `[ips_fixed]` lists all 6 peer ports +3. Verify `validators.txt` has all 6 public keys +4. Check node debug logs: `tail -50 /tmp/xrpld-integration/node1/debug.log` +5. Ensure `[peer_private]` is set to `1` (prevents reaching out to public network) + +### Transaction not processing + +1. Verify genesis account exists: + ```bash + curl -s http://localhost:5005 \ + -d '{"method":"account_info","params":[{"account":"rHb9CJAWyB4rj91VRWn96DkukG4bwdtyTh"}]}' \ + | jq .result.account_data.Balance + ``` +2. Check submit response for error codes +3. In standalone mode, remember to call `ledger_accept` after submitting + +### Spanmetrics not appearing in Prometheus + +1. Verify otel-collector config has `spanmetrics` connector +2. Check that the metrics pipeline is configured: + ```yaml + service: + pipelines: + metrics: + receivers: [spanmetrics] + exporters: [prometheus] + ``` +3. Verify Prometheus can reach collector: + ```bash + curl -s http://localhost:9090/api/v1/targets | jq '.data.activeTargets' + ``` diff --git a/docker/telemetry/docker-compose.yml b/docker/telemetry/docker-compose.yml index bf74dad9be..caf84b9767 100644 --- a/docker/telemetry/docker-compose.yml +++ b/docker/telemetry/docker-compose.yml @@ -7,7 +7,7 @@ # - tempo: Grafana Tempo tracing backend, queryable via Grafana Explore # on port 3000. Recommended for production (S3/GCS storage, TraceQL). # - grafana: dashboards on port 3000, pre-configured with Tempo -# datasource. +# and Prometheus datasources. 
# # Usage: # docker compose -f docker/telemetry/docker-compose.yml up -d @@ -26,6 +26,7 @@ services: ports: - "4317:4317" # OTLP gRPC receiver - "4318:4318" # OTLP HTTP receiver (xrpld sends traces here) + - "8889:8889" # Prometheus metrics (spanmetrics) - "13133:13133" # Health check endpoint volumes: # Mount collector pipeline config (receivers → processors → exporters) @@ -50,6 +51,17 @@ services: networks: - xrpld-telemetry + prometheus: + image: prom/prometheus:latest + ports: + - "9090:9090" + volumes: + - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro + depends_on: + - otel-collector + networks: + - xrpld-telemetry + # Grafana: visualization UI with Tempo pre-configured as a datasource. # Anonymous admin access enabled for local development convenience. grafana: @@ -62,8 +74,10 @@ services: volumes: # Auto-provision Tempo datasource and search filters on startup - ./grafana/provisioning:/etc/grafana/provisioning:ro + - ./grafana/dashboards:/var/lib/grafana/dashboards:ro depends_on: - tempo + - prometheus networks: - xrpld-telemetry diff --git a/docker/telemetry/grafana/dashboards/consensus-health.json b/docker/telemetry/grafana/dashboards/consensus-health.json new file mode 100644 index 0000000000..ef202e7353 --- /dev/null +++ b/docker/telemetry/grafana/dashboards/consensus-health.json @@ -0,0 +1,244 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "title": "Consensus Round Duration", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.accept\"}[5m])))", + "legendFormat": "P95 Round Duration" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.50, sum by (le) 
(rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.accept\"}[5m])))", + "legendFormat": "P50 Round Duration" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms" + }, + "overrides": [] + } + }, + { + "title": "Consensus Proposals Sent Rate", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum(rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.proposal.send\"}[5m]))", + "legendFormat": "Proposals / Sec" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + } + }, + { + "title": "Ledger Close Duration", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.ledger_close\"}[5m])))", + "legendFormat": "P95 Close Duration" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms" + }, + "overrides": [] + } + }, + { + "title": "Validation Send Rate", + "type": "stat", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum(rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.validation.send\"}[5m]))", + "legendFormat": "Validations / Sec" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + } + }, + { + "title": "Ledger Apply Duration (doAccept)", + "description": "Time spent applying the consensus result to build a new ledger. 
Measured by the consensus.accept.apply span in doAccept().", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.accept.apply\"}[5m])))", + "legendFormat": "P95 Apply Duration" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.50, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"consensus.accept.apply\"}[5m])))", + "legendFormat": "P50 Apply Duration" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Duration (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "Close Time Agreement", + "description": "Rate of close time agreement vs disagreement across consensus rounds. Based on xrpl.consensus.close_time_correct attribute (true = validators agreed, false = agreed to disagree per avCT_CONSENSUS_PCT).", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum(rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"consensus.accept.apply\"}[5m]))", + "legendFormat": "Total Rounds / Sec" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "axisLabel": "Rounds / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + } + ], + "schemaVersion": 39, + "tags": ["rippled", "consensus", "telemetry"], + "templating": { + "list": [ + { + "name": "node", + "label": "Node", + "description": "Filter by rippled node (service.instance.id \u2014 e.g. 
Node-1)", + "type": "query", + "query": "label_values(traces_span_metrics_calls_total, exported_instance)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "includeAll": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "multi": true, + "refresh": 2, + "sort": 1 + }, + { + "name": "consensus_mode", + "label": "Consensus Mode", + "description": "Filter by consensus mode (Proposing, Observing, Wrong Ledger, Switched Ledger)", + "type": "query", + "query": "label_values(traces_span_metrics_calls_total{span_name=\"consensus.ledger_close\"}, xrpl_consensus_mode)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "includeAll": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "multi": true, + "refresh": 2, + "sort": 1 + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "title": "rippled Consensus Health", + "uid": "rippled-consensus" +} diff --git a/docker/telemetry/grafana/dashboards/rpc-performance.json b/docker/telemetry/grafana/dashboards/rpc-performance.json new file mode 100644 index 0000000000..99cfe82699 --- /dev/null +++ b/docker/telemetry/grafana/dashboards/rpc-performance.json @@ -0,0 +1,189 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "title": "RPC Request Rate by Command", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (xrpl_rpc_command) (rate(traces_span_metrics_calls_total{xrpl_rpc_command=~\"$command\", exported_instance=~\"$node\", span_name=~\"rpc.command.*\"}[5m]))", + "legendFormat": "{{xrpl_rpc_command}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "reqps", + "custom": { + "axisLabel": "Requests / Sec", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + 
"pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "RPC Latency P95 by Command", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, xrpl_rpc_command) (rate(traces_span_metrics_duration_milliseconds_bucket{xrpl_rpc_command=~\"$command\", exported_instance=~\"$node\", span_name=~\"rpc.command.*\"}[5m])))", + "legendFormat": "P95 {{xrpl_rpc_command}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "axisLabel": "Latency (ms)", + "spanNulls": true, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 3 + } + }, + "overrides": [] + } + }, + { + "title": "RPC Error Rate", + "type": "bargauge", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (xrpl_rpc_command) (rate(traces_span_metrics_calls_total{xrpl_rpc_command=~\"$command\", exported_instance=~\"$node\", span_name=~\"rpc.command.*\", status_code=\"STATUS_CODE_ERROR\"}[5m])) / sum by (xrpl_rpc_command) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_rpc_command=~\"$command\", span_name=~\"rpc.command.*\"}[5m])) * 100", + "legendFormat": "{{xrpl_rpc_command}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 5 + } + ] + } + }, + "overrides": [] + } + }, + { + "title": "RPC Latency Heatmap", + "type": "heatmap", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum(increase(traces_span_metrics_duration_milliseconds_bucket{xrpl_rpc_command=~\"$command\", exported_instance=~\"$node\", span_name=~\"rpc.command.*\"}[5m])) by (le)", + "legendFormat": "{{le}}", + 
"format": "heatmap" + } + ] + } + ], + "schemaVersion": 39, + "tags": ["rippled", "rpc", "telemetry"], + "templating": { + "list": [ + { + "name": "node", + "label": "Node", + "description": "Filter by rippled node (service.instance.id \u2014 e.g. Node-1)", + "type": "query", + "query": "label_values(traces_span_metrics_calls_total, exported_instance)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "includeAll": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "multi": true, + "refresh": 2, + "sort": 1 + }, + { + "name": "command", + "label": "RPC Command", + "description": "Filter by RPC command name (e.g., server_info, submit)", + "type": "query", + "query": "label_values(traces_span_metrics_calls_total{span_name=~\"rpc.command.*\"}, xrpl_rpc_command)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "includeAll": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "multi": true, + "refresh": 2, + "sort": 1 + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "title": "rippled RPC Performance", + "uid": "rippled-rpc-perf" +} diff --git a/docker/telemetry/grafana/dashboards/transaction-overview.json b/docker/telemetry/grafana/dashboards/transaction-overview.json new file mode 100644 index 0000000000..b5f008972f --- /dev/null +++ b/docker/telemetry/grafana/dashboards/transaction-overview.json @@ -0,0 +1,172 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "title": "Transaction Processing Rate", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum(rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"tx.process\"}[5m]))", + "legendFormat": "tx.process/sec" + }, + { + "datasource": { + "type": 
"prometheus" + }, + "expr": "sum(rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"tx.receive\"}[5m]))", + "legendFormat": "tx.receive/sec" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + } + }, + { + "title": "Transaction Processing Latency", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"tx.process\"}[5m])))", + "legendFormat": "p95" + }, + { + "datasource": { + "type": "prometheus" + }, + "expr": "histogram_quantile(0.50, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{exported_instance=~\"$node\", span_name=\"tx.process\"}[5m])))", + "legendFormat": "p50" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms" + }, + "overrides": [] + } + }, + { + "title": "Transaction Path Distribution", + "type": "piechart", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum by (xrpl_tx_local) (rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", xrpl_tx_local=~\"$tx_origin\", span_name=\"tx.process\"}[5m]))", + "legendFormat": "local={{xrpl_tx_local}}" + } + ] + }, + { + "title": "Transaction Receive vs Suppressed", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "expr": "sum(rate(traces_span_metrics_calls_total{exported_instance=~\"$node\", span_name=\"tx.receive\"}[5m]))", + "legendFormat": "total received" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + } + } + ], + "schemaVersion": 39, + "tags": ["rippled", "transactions", "telemetry"], + "templating": { + "list": [ + { + "name": "node", + "label": 
"Node", + "description": "Filter by rippled node (service.instance.id \u2014 e.g. Node-1)", + "type": "query", + "query": "label_values(traces_span_metrics_calls_total, exported_instance)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "includeAll": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "multi": true, + "refresh": 2, + "sort": 1 + }, + { + "name": "tx_origin", + "label": "TX Origin", + "description": "Filter by transaction origin (true = local submit, false = peer relay)", + "type": "query", + "query": "label_values(traces_span_metrics_calls_total{span_name=\"tx.process\"}, xrpl_tx_local)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "includeAll": true, + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "multi": true, + "refresh": 2, + "sort": 1 + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "title": "rippled Transaction Overview", + "uid": "rippled-transactions" +} diff --git a/docker/telemetry/grafana/provisioning/dashboards/dashboards.yaml b/docker/telemetry/grafana/provisioning/dashboards/dashboards.yaml new file mode 100644 index 0000000000..6aeaff31e6 --- /dev/null +++ b/docker/telemetry/grafana/provisioning/dashboards/dashboards.yaml @@ -0,0 +1,12 @@ +apiVersion: 1 + +providers: + - name: rippled-telemetry + orgId: 1 + folder: rippled + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards + foldersFromFilesStructure: false diff --git a/docker/telemetry/grafana/provisioning/datasources/prometheus.yaml b/docker/telemetry/grafana/provisioning/datasources/prometheus.yaml new file mode 100644 index 0000000000..af7492822a --- /dev/null +++ b/docker/telemetry/grafana/provisioning/datasources/prometheus.yaml @@ -0,0 +1,10 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + uid: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true + 
editable: true diff --git a/docker/telemetry/grafana/provisioning/datasources/tempo.yaml b/docker/telemetry/grafana/provisioning/datasources/tempo.yaml index 27b6596b0c..45196a3a9a 100644 --- a/docker/telemetry/grafana/provisioning/datasources/tempo.yaml +++ b/docker/telemetry/grafana/provisioning/datasources/tempo.yaml @@ -40,9 +40,9 @@ datasources: operator: "=" scope: resource type: static - # service.instance.id: unique node identifier — defaults to the - # node's public key (e.g., nHB1X37...). Distinguishes individual - # nodes in a multi-node cluster or network. + # service.instance.id: unique node identifier — configurable via + # the service_instance_id setting in [telemetry], defaults to the + # node's public key. E.g. "Node-1" or "nHB1X37...". - id: node-id tag: service.instance.id operator: "=" diff --git a/docker/telemetry/integration-test.sh b/docker/telemetry/integration-test.sh new file mode 100755 index 0000000000..1a48aa324a --- /dev/null +++ b/docker/telemetry/integration-test.sh @@ -0,0 +1,558 @@ +#!/usr/bin/env bash +# Integration test for rippled OpenTelemetry instrumentation. +# +# Launches a 6-node xrpld consensus network with telemetry enabled, +# exercises RPC / transaction / consensus code paths, then verifies +# that the expected spans and metrics appear in Tempo and Prometheus. +# +# Usage: +# bash docker/telemetry/integration-test.sh +# +# Prerequisites: +# - .build/xrpld built with telemetry=ON +# - docker compose (v2) +# - curl, jq +# +# The script leaves the observability stack and xrpld nodes running +# so you can manually inspect Tempo (localhost:3200) and Grafana +# (localhost:3000). Run with --cleanup to tear down instead. + +set -euo pipefail + +# --------------------------------------------------------------------------- +# Configuration +# --------------------------------------------------------------------------- +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/../.." 
&& pwd)"
+XRPLD="$REPO_ROOT/.build/xrpld"
+COMPOSE_FILE="$SCRIPT_DIR/docker-compose.yml"
+STANDALONE_CFG="$SCRIPT_DIR/xrpld-telemetry.cfg"
+WORKDIR="/tmp/xrpld-integration"
+NUM_NODES=6
+PEER_PORT_BASE=51235
+RPC_PORT_BASE=5005
+CONSENSUS_TIMEOUT=120
+GENESIS_ACCOUNT="rHb9CJAWyB4rj91VRWn96DkukG4bwdtyTh"
+GENESIS_SEED="snoPBrXtMeMyMHUVTgbuqAfg1SUTb"
+DEST_ACCOUNT="" # Generated dynamically via wallet_propose
+TEMPO="http://localhost:3200"
+PROM="http://localhost:9090"
+
+# Counters for pass/fail
+PASS=0
+FAIL=0
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+log()  { printf "\033[1;34m[INFO]\033[0m %s\n" "$*"; }
+ok()   { printf "\033[1;32m[PASS]\033[0m %s\n" "$*"; PASS=$((PASS + 1)); }
+fail() { printf "\033[1;31m[FAIL]\033[0m %s\n" "$*"; FAIL=$((FAIL + 1)); }
+die()  { printf "\033[1;31m[ERROR]\033[0m %s\n" "$*" >&2; exit 1; }
+
+check_span() {
+    local op="$1"
+    local count
+    count=$(curl -sfG "$TEMPO/api/search" \
+        --data-urlencode "q={resource.service.name=\"rippled\" && name=\"$op\"}" \
+        --data-urlencode "limit=5" \
+        | jq '.traces | length' 2>/dev/null || echo 0)
+    if [ "$count" -gt 0 ]; then
+        ok "$op ($count traces)"
+    else
+        fail "$op (0 traces)"
+    fi
+}
+
+cleanup() {
+    log "Cleaning up..."
+    # Kill xrpld nodes
+    for i in $(seq 1 "$NUM_NODES"); do
+        local pidfile="$WORKDIR/node$i/xrpld.pid"
+        if [ -f "$pidfile" ]; then
+            kill "$(cat "$pidfile")" 2>/dev/null || true
+            rm -f "$pidfile"
+        fi
+    done
+    # Also kill any straggling xrpld processes from our workdir
+    pkill -f "$WORKDIR" 2>/dev/null || true
+    # Stop docker stack
+    docker compose -f "$COMPOSE_FILE" down 2>/dev/null || true
+    # Remove workdir
+    rm -rf "$WORKDIR"
+    log "Cleanup complete."
+} + +# Handle --cleanup flag +if [ "${1:-}" = "--cleanup" ]; then + cleanup + exit 0 +fi + +# --------------------------------------------------------------------------- +# Step 0: Prerequisites +# --------------------------------------------------------------------------- +log "Checking prerequisites..." + +command -v docker >/dev/null 2>&1 || die "docker not found" +docker compose version >/dev/null 2>&1 || die "docker compose (v2) not found" +command -v curl >/dev/null 2>&1 || die "curl not found" +command -v jq >/dev/null 2>&1 || die "jq not found" +[ -x "$XRPLD" ] || die "xrpld binary not found at $XRPLD (build with telemetry=ON)" +[ -f "$COMPOSE_FILE" ] || die "docker-compose.yml not found at $COMPOSE_FILE" +[ -f "$STANDALONE_CFG" ] || die "xrpld-telemetry.cfg not found at $STANDALONE_CFG" + +log "All prerequisites met." + +# --------------------------------------------------------------------------- +# Step 1: Clean previous run +# --------------------------------------------------------------------------- +log "Cleaning previous run data..." +for i in $(seq 1 "$NUM_NODES"); do + pidfile="$WORKDIR/node$i/xrpld.pid" + if [ -f "$pidfile" ]; then + kill "$(cat "$pidfile")" 2>/dev/null || true + fi +done +pkill -f "$WORKDIR" 2>/dev/null || true +# Kill any xrpld using the standalone config (from key generation) +pkill -f "xrpld-telemetry.cfg" 2>/dev/null || true +sleep 2 +rm -rf "$WORKDIR" +mkdir -p "$WORKDIR" + +# --------------------------------------------------------------------------- +# Step 2: Start observability stack +# --------------------------------------------------------------------------- +log "Starting observability stack..." +docker compose -f "$COMPOSE_FILE" up -d + +log "Waiting for otel-collector to be ready..." +for attempt in $(seq 1 30); do + # The OTLP HTTP endpoint returns 405 for GET (expects POST), which + # means it is listening. curl -sf would fail on 405, so we check + # the HTTP status code explicitly. 
+ status=$(curl -so /dev/null -w '%{http_code}' http://localhost:4318/ 2>/dev/null || echo 000) + if [ "$status" != "000" ]; then + log "otel-collector ready (attempt $attempt, HTTP $status)." + break + fi + if [ "$attempt" -eq 30 ]; then + die "otel-collector not ready after 30s" + fi + sleep 1 +done + +log "Waiting for Tempo to be ready..." +for attempt in $(seq 1 30); do + if curl -sf "$TEMPO/ready" >/dev/null 2>&1; then + log "Tempo ready (attempt $attempt)." + break + fi + if [ "$attempt" -eq 30 ]; then + die "Tempo not ready after 30s" + fi + sleep 1 +done + +# --------------------------------------------------------------------------- +# Step 3: Generate validator keys +# --------------------------------------------------------------------------- +log "Generating $NUM_NODES validator key pairs..." + +# Start a temporary standalone xrpld for key generation +TEMP_DATA="$WORKDIR/temp-keygen" +mkdir -p "$TEMP_DATA" + +# Create a minimal temp config for key generation +TEMP_CFG="$TEMP_DATA/xrpld.cfg" +cat > "$TEMP_CFG" < "$TEMP_DATA/stdout.log" 2>&1 & +TEMP_PID=$! +log "Temporary xrpld started (PID $TEMP_PID), waiting for RPC..." + +for attempt in $(seq 1 30); do + if curl -sf http://localhost:5099 -d '{"method":"server_info"}' >/dev/null 2>&1; then + log "Temporary xrpld RPC ready (attempt $attempt)." 
+ break + fi + if [ "$attempt" -eq 30 ]; then + kill "$TEMP_PID" 2>/dev/null || true + die "Temporary xrpld RPC not ready after 30s" + fi + sleep 1 +done + +declare -a SEEDS +declare -a PUBKEYS + +for i in $(seq 1 "$NUM_NODES"); do + result=$(curl -sf http://localhost:5099 -d '{"method":"validation_create"}') + seed=$(echo "$result" | jq -r '.result.validation_seed') + pubkey=$(echo "$result" | jq -r '.result.validation_public_key') + if [ -z "$seed" ] || [ "$seed" = "null" ]; then + kill "$TEMP_PID" 2>/dev/null || true + die "Failed to generate key pair $i" + fi + SEEDS+=("$seed") + PUBKEYS+=("$pubkey") + log " Node $i: $pubkey" +done + +kill "$TEMP_PID" 2>/dev/null || true +wait "$TEMP_PID" 2>/dev/null || true +rm -rf "$TEMP_DATA" +log "Key generation complete." + +# --------------------------------------------------------------------------- +# Step 4: Generate node configs and validators.txt +# --------------------------------------------------------------------------- +log "Generating node configs..." + +# Create shared validators.txt +VALIDATORS_FILE="$WORKDIR/validators.txt" +{ + echo "[validators]" + for i in $(seq 0 $((NUM_NODES - 1))); do + echo "${PUBKEYS[$i]}" + done +} > "$VALIDATORS_FILE" + +# Create per-node configs +for i in $(seq 1 "$NUM_NODES"); do + NODE_DIR="$WORKDIR/node$i" + mkdir -p "$NODE_DIR/nudb" "$NODE_DIR/db" + + RPC_PORT=$((RPC_PORT_BASE + i - 1)) + PEER_PORT=$((PEER_PORT_BASE + i - 1)) + SEED="${SEEDS[$((i - 1))]}" + + # Build ips_fixed list (all peers except self) + IPS_FIXED="" + for j in $(seq 1 "$NUM_NODES"); do + if [ "$j" -ne "$i" ]; then + IPS_FIXED="${IPS_FIXED}127.0.0.1 $((PEER_PORT_BASE + j - 1)) +" + fi + done + + cat > "$NODE_DIR/xrpld.cfg" < "$NODE_DIR/stdout.log" 2>&1 & + echo $! 
> "$NODE_DIR/xrpld.pid" + log " Node $i started (PID $(cat "$NODE_DIR/xrpld.pid"))" +done + +# Give nodes a moment to initialize +sleep 5 + +# --------------------------------------------------------------------------- +# Step 6: Wait for consensus +# --------------------------------------------------------------------------- +log "Waiting for nodes to reach 'proposing' state (timeout: ${CONSENSUS_TIMEOUT}s)..." + +start_time=$(date +%s) +nodes_ready=0 + +while [ "$nodes_ready" -lt "$NUM_NODES" ]; do + elapsed=$(( $(date +%s) - start_time )) + if [ "$elapsed" -ge "$CONSENSUS_TIMEOUT" ]; then + fail "Consensus timeout after ${CONSENSUS_TIMEOUT}s ($nodes_ready/$NUM_NODES nodes ready)" + log "Continuing with partial consensus..." + break + fi + + nodes_ready=0 + for i in $(seq 1 "$NUM_NODES"); do + RPC_PORT=$((RPC_PORT_BASE + i - 1)) + state=$(curl -sf "http://localhost:$RPC_PORT" \ + -d '{"method":"server_info"}' 2>/dev/null \ + | jq -r '.result.info.server_state' 2>/dev/null || echo "unreachable") + if [ "$state" = "proposing" ]; then + nodes_ready=$((nodes_ready + 1)) + fi + done + printf "\r %d/%d nodes proposing (%ds elapsed)..." "$nodes_ready" "$NUM_NODES" "$elapsed" + if [ "$nodes_ready" -lt "$NUM_NODES" ]; then + sleep 3 + fi +done +echo "" + +if [ "$nodes_ready" -eq "$NUM_NODES" ]; then + ok "All $NUM_NODES nodes reached 'proposing' state" +else + fail "Only $nodes_ready/$NUM_NODES nodes reached 'proposing' state" +fi + +# --------------------------------------------------------------------------- +# Step 6b: Wait for validated ledger +# --------------------------------------------------------------------------- +log "Waiting for first validated ledger..." 
+for attempt in $(seq 1 60); do + val_seq=$(curl -sf "http://localhost:$RPC_PORT_BASE" \ + -d '{"method":"server_info"}' 2>/dev/null \ + | jq -r '.result.info.validated_ledger.seq // 0' 2>/dev/null || echo 0) + if [ "$val_seq" -gt 2 ] 2>/dev/null; then + ok "First validated ledger: seq $val_seq" + break + fi + if [ "$attempt" -eq 60 ]; then + fail "No validated ledger after 60s" + fi + sleep 1 +done + +# --------------------------------------------------------------------------- +# Step 7: Exercise RPC spans (Phase 2) +# --------------------------------------------------------------------------- +log "Exercising RPC spans..." + +curl -sf "http://localhost:$RPC_PORT_BASE" \ + -d '{"method":"server_info"}' > /dev/null +curl -sf "http://localhost:$RPC_PORT_BASE" \ + -d '{"method":"server_state"}' > /dev/null +curl -sf "http://localhost:$RPC_PORT_BASE" \ + -d '{"method":"ledger","params":[{"ledger_index":"current"}]}' > /dev/null + +log "RPC commands sent. Waiting 5s for batch export..." +sleep 5 + +# --------------------------------------------------------------------------- +# Step 8: Submit transaction (Phase 3) +# --------------------------------------------------------------------------- +log "Submitting Payment transaction..." + +# Generate a destination wallet +log " Generating destination wallet..." 
+wallet_result=$(curl -sf "http://localhost:$RPC_PORT_BASE" \ + -d '{"method":"wallet_propose"}') +DEST_ACCOUNT=$(echo "$wallet_result" | jq -r '.result.account_id' 2>/dev/null) +if [ -z "$DEST_ACCOUNT" ] || [ "$DEST_ACCOUNT" = "null" ]; then + fail "Could not generate destination wallet" + DEST_ACCOUNT="rrrrrrrrrrrrrrrrrrrrrhoLvTp" # ACCOUNT_ZERO fallback +fi +log " Destination: $DEST_ACCOUNT" + +# Get genesis account info +acct_result=$(curl -sf "http://localhost:$RPC_PORT_BASE" \ + -d "{\"method\":\"account_info\",\"params\":[{\"account\":\"$GENESIS_ACCOUNT\"}]}") +seq_num=$(echo "$acct_result" | jq -r '.result.account_data.Sequence' 2>/dev/null || echo "unknown") +log " Genesis account sequence: $seq_num" + +# Submit payment +submit_result=$(curl -sf "http://localhost:$RPC_PORT_BASE" -d "{ + \"method\": \"submit\", + \"params\": [{ + \"secret\": \"$GENESIS_SEED\", + \"tx_json\": { + \"TransactionType\": \"Payment\", + \"Account\": \"$GENESIS_ACCOUNT\", + \"Destination\": \"$DEST_ACCOUNT\", + \"Amount\": \"10000000\" + } + }] +}") + +engine_result=$(echo "$submit_result" | jq -r '.result.engine_result' 2>/dev/null || echo "unknown") +tx_hash=$(echo "$submit_result" | jq -r '.result.tx_json.hash' 2>/dev/null || echo "unknown") + +if [ "$engine_result" = "tesSUCCESS" ] || [ "$engine_result" = "terQUEUED" ]; then + ok "Transaction submitted: $engine_result (hash: ${tx_hash:0:16}...)" +else + fail "Transaction submission: $engine_result" + log " Full response: $(echo "$submit_result" | jq -c .result 2>/dev/null)" +fi + +log "Waiting 15s for consensus round + batch export..." +sleep 15 + +# --------------------------------------------------------------------------- +# Step 9: Verify Tempo traces +# --------------------------------------------------------------------------- +log "Verifying spans in Tempo..." 
+ +# Check service registration +services=$(curl -sf "$TEMPO/api/v2/search/tag/resource.service.name/values" \ + | jq -r '.tagValues[].value' 2>/dev/null || echo "") +if echo "$services" | grep -q "rippled"; then + ok "Service 'rippled' registered in Tempo" +else + fail "Service 'rippled' NOT found in Tempo (found: $services)" +fi + +log "" +log "--- Phase 2: RPC Spans ---" +check_span "rpc.request" +check_span "rpc.process" +check_span "rpc.command.server_info" +check_span "rpc.command.server_state" +check_span "rpc.command.ledger" + +log "" +log "--- Phase 3: Transaction Spans ---" +check_span "tx.process" +check_span "tx.receive" + +log "" +log "--- Phase 4: Consensus Spans ---" +check_span "consensus.proposal.send" +check_span "consensus.ledger_close" +check_span "consensus.accept" +check_span "consensus.validation.send" + +# --------------------------------------------------------------------------- +# Step 10: Verify Prometheus spanmetrics +# --------------------------------------------------------------------------- +log "" +log "--- Phase 5: Spanmetrics ---" +log "Waiting 20s for Prometheus scrape cycle..." 
+sleep 20 + +calls_count=$(curl -sf "$PROM/api/v1/query?query=traces_span_metrics_calls_total" \ + | jq '.data.result | length' 2>/dev/null || echo 0) +if [ "$calls_count" -gt 0 ]; then + ok "Prometheus: traces_span_metrics_calls_total ($calls_count series)" +else + fail "Prometheus: traces_span_metrics_calls_total (0 series)" +fi + +duration_count=$(curl -sf "$PROM/api/v1/query?query=traces_span_metrics_duration_milliseconds_count" \ + | jq '.data.result | length' 2>/dev/null || echo 0) +if [ "$duration_count" -gt 0 ]; then + ok "Prometheus: duration histogram ($duration_count series)" +else + fail "Prometheus: duration histogram (0 series)" +fi + +# Check Grafana +if curl -sf http://localhost:3000/api/health > /dev/null 2>&1; then + ok "Grafana: healthy at localhost:3000" +else + fail "Grafana: not reachable at localhost:3000" +fi + +# --------------------------------------------------------------------------- +# Step 11: Summary +# --------------------------------------------------------------------------- +echo "" +echo "===========================================================" +echo " INTEGRATION TEST RESULTS" +echo "===========================================================" +printf " \033[1;32mPASSED: %d\033[0m\n" "$PASS" +printf " \033[1;31mFAILED: %d\033[0m\n" "$FAIL" +echo "===========================================================" +echo "" +echo " Observability stack is running:" +echo "" +echo " Tempo: http://localhost:3200" +echo " Grafana: http://localhost:3000" +echo " Prometheus: http://localhost:9090" +echo "" +echo " xrpld nodes (6) are running:" +for i in $(seq 1 "$NUM_NODES"); do + RPC_PORT=$((RPC_PORT_BASE + i - 1)) + PEER_PORT=$((PEER_PORT_BASE + i - 1)) + echo " Node $i: RPC=localhost:$RPC_PORT Peer=:$PEER_PORT PID=$(cat "$WORKDIR/node$i/xrpld.pid" 2>/dev/null || echo 'unknown')" +done +echo "" +echo " To tear down:" +echo " bash docker/telemetry/integration-test.sh --cleanup" +echo "" +echo 
"===========================================================" + +if [ "$FAIL" -gt 0 ]; then + exit 1 +fi diff --git a/docker/telemetry/otel-collector-config.yaml b/docker/telemetry/otel-collector-config.yaml index 104f03dd7c..d3b97ae00c 100644 --- a/docker/telemetry/otel-collector-config.yaml +++ b/docker/telemetry/otel-collector-config.yaml @@ -1,9 +1,12 @@ # OpenTelemetry Collector configuration for xrpld development. # -# Pipeline: OTLP receiver -> batch processor -> debug + Tempo. +# Pipelines: +# traces: OTLP receiver -> batch processor -> debug + Tempo + spanmetrics +# metrics: spanmetrics connector -> Prometheus exporter +# # xrpld sends traces via OTLP/HTTP to port 4318. The collector batches -# them and forwards to Tempo via OTLP/gRPC on the Docker network. Tempo -# is queryable via Grafana Explore using TraceQL. +# them, forwards to Tempo, and derives RED metrics via the spanmetrics +# connector, which Prometheus scrapes on port 8889. receivers: otlp: @@ -18,6 +21,21 @@ processors: timeout: 1s send_batch_size: 100 +connectors: + spanmetrics: + # Expose service.instance.id (node public key) as a Prometheus label so + # Grafana dashboards can filter metrics by individual node. 
+ resource_metrics_key_attributes: + - service.instance.id + histogram: + explicit: + buckets: [1ms, 5ms, 10ms, 25ms, 50ms, 100ms, 250ms, 500ms, 1s, 5s] + dimensions: + - name: xrpl.rpc.command + - name: xrpl.rpc.status + - name: xrpl.consensus.mode + - name: xrpl.tx.local + exporters: debug: verbosity: detailed @@ -25,6 +43,8 @@ exporters: endpoint: tempo:4317 tls: insecure: true + prometheus: + endpoint: 0.0.0.0:8889 extensions: health_check: @@ -36,4 +56,7 @@ service: traces: receivers: [otlp] processors: [batch] - exporters: [debug, otlp/tempo] + exporters: [debug, otlp/tempo, spanmetrics] + metrics: + receivers: [spanmetrics] + exporters: [prometheus] diff --git a/docker/telemetry/prometheus.yml b/docker/telemetry/prometheus.yml new file mode 100644 index 0000000000..d99d919a55 --- /dev/null +++ b/docker/telemetry/prometheus.yml @@ -0,0 +1,9 @@ +# Prometheus configuration for scraping spanmetrics from OTel Collector. +global: + scrape_interval: 15s + evaluation_interval: 15s + +scrape_configs: + - job_name: otel-collector + static_configs: + - targets: ["otel-collector:8889"] diff --git a/docker/telemetry/xrpld-telemetry.cfg b/docker/telemetry/xrpld-telemetry.cfg new file mode 100644 index 0000000000..2a96dd6ab5 --- /dev/null +++ b/docker/telemetry/xrpld-telemetry.cfg @@ -0,0 +1,60 @@ +# Standalone xrpld configuration with OpenTelemetry enabled. +# +# Usage: +# 1. Start the observability stack: +# docker compose -f docker/telemetry/docker-compose.yml up -d +# 2. Run xrpld in standalone mode: +# ./xrpld --conf docker/telemetry/xrpld-telemetry.cfg -a --start +# 3. Send RPC commands to exercise tracing: +# curl -s http://localhost:5005 -d '{"method":"server_info"}' +# 4. 
View traces in Grafana Explore (Tempo datasource): http://localhost:3000
+
+[server]
+port_rpc_admin_local
+port_ws_admin_local
+
+[port_rpc_admin_local]
+port = 5005
+ip = 127.0.0.1
+admin = 127.0.0.1
+protocol = http
+
+[port_ws_admin_local]
+port = 6006
+ip = 127.0.0.1
+admin = 127.0.0.1
+protocol = ws
+
+[node_db]
+type=NuDB
+path=docker/telemetry/data/nudb
+online_delete=256
+advisory_delete=0
+
+[database_path]
+docker/telemetry/data
+
+[debug_logfile]
+docker/telemetry/data/debug.log
+
+[rpc_startup]
+{ "command": "log_level", "severity": "debug" }
+
+[ssl_verify]
+0
+
+# --- OpenTelemetry tracing ---
+[telemetry]
+enabled=1
+service_instance_id=rippled-standalone
+endpoint=http://localhost:4318/v1/traces
+exporter=otlp_http
+sampling_ratio=1.0
+batch_size=512
+batch_delay_ms=5000
+max_queue_size=2048
+trace_rpc=1
+trace_transactions=1
+trace_consensus=1
+trace_peer=0
+trace_ledger=1
diff --git a/docs/telemetry-runbook.md b/docs/telemetry-runbook.md
new file mode 100644
index 0000000000..8d798e3353
--- /dev/null
+++ b/docs/telemetry-runbook.md
@@ -0,0 +1,232 @@
+# rippled Telemetry Operator Runbook
+
+## Overview
+
+rippled supports OpenTelemetry distributed tracing to provide visibility into RPC requests, transaction processing, and consensus rounds.
+
+## Quick Start
+
+### 1. Start the observability stack
+
+```bash
+docker compose -f docker/telemetry/docker-compose.yml up -d
+```
+
+This starts:
+
+- **OTel Collector** on ports 4317 (gRPC) and 4318 (HTTP)
+- **Tempo** trace backend (query via Grafana Explore)
+- **Prometheus** on http://localhost:9090
+- **Grafana** on http://localhost:3000
+
+### 2. Enable telemetry in rippled
+
+Add to your `xrpld.cfg`:
+
+```ini
+[telemetry]
+enabled=1
+endpoint=http://localhost:4318/v1/traces
+```
+
+### 3. Build with telemetry support
+
+```bash
+conan install . 
--build=missing -o telemetry=True +cmake --preset default -Dtelemetry=ON +cmake --build --preset default +``` + +## Configuration Reference + +| Option | Default | Description | +| -------------------- | --------------------------------- | ----------------------------------------- | +| `enabled` | `0` | Master switch for telemetry | +| `endpoint` | `http://localhost:4318/v1/traces` | OTLP/HTTP endpoint | +| `exporter` | `otlp_http` | Exporter type | +| `sampling_ratio` | `1.0` | Head-based sampling ratio (0.0–1.0) | +| `trace_rpc` | `1` | Enable RPC request tracing | +| `trace_transactions` | `1` | Enable transaction tracing | +| `trace_consensus` | `1` | Enable consensus tracing | +| `trace_peer` | `0` | Enable peer message tracing (high volume) | +| `trace_ledger` | `1` | Enable ledger tracing | +| `batch_size` | `512` | Max spans per batch export | +| `batch_delay_ms` | `5000` | Delay between batch exports | +| `max_queue_size` | `2048` | Max spans queued before dropping | +| `use_tls` | `0` | Use TLS for exporter connection | +| `tls_ca_cert` | (empty) | Path to CA certificate bundle | + +## Span Reference + +All spans instrumented in rippled, grouped by subsystem: + +### RPC Spans (Phase 2) + +| Span Name | Source File | Attributes | Description | +| -------------------- | --------------------- | ------------------------------------------------------- | -------------------------------------------------- | +| `rpc.request` | ServerHandler.cpp:271 | — | Top-level HTTP RPC request | +| `rpc.process` | ServerHandler.cpp:573 | — | RPC processing (child of rpc.request) | +| `rpc.ws_message` | ServerHandler.cpp:384 | — | WebSocket RPC message | +| `rpc.command.` | RPCHandler.cpp:161 | `xrpl.rpc.command`, `xrpl.rpc.version`, `xrpl.rpc.role` | Per-command span (e.g., `rpc.command.server_info`) | + +### Transaction Spans (Phase 3) + +| Span Name | Source File | Attributes | Description | +| ------------ | ------------------- | 
----------------------------------------------- | ------------------------------------- | +| `tx.process` | NetworkOPs.cpp:1227 | `xrpl.tx.hash`, `xrpl.tx.local`, `xrpl.tx.path` | Transaction submission and processing | +| `tx.receive` | PeerImp.cpp:1273 | `xrpl.peer.id` | Transaction received from peer relay | + +### Consensus Spans (Phase 4) + +| Span Name | Source File | Attributes | Description | +| --------------------------- | -------------------- | ----------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------ | +| `consensus.proposal.send` | RCLConsensus.cpp:177 | `xrpl.consensus.round` | Consensus proposal broadcast | +| `consensus.ledger_close` | RCLConsensus.cpp:282 | `xrpl.consensus.ledger.seq`, `xrpl.consensus.mode` | Ledger close event | +| `consensus.accept` | RCLConsensus.cpp:395 | `xrpl.consensus.proposers`, `xrpl.consensus.round_time_ms` | Ledger accepted by consensus | +| `consensus.validation.send` | RCLConsensus.cpp:753 | `xrpl.consensus.ledger.seq`, `xrpl.consensus.proposing` | Validation sent after accept | +| `consensus.accept.apply` | RCLConsensus.cpp:453 | `xrpl.consensus.close_time`, `close_time_correct`, `close_resolution_ms`, `state`, `proposing`, `round_time_ms`, `ledger.seq` | Ledger application with close time details | + +#### Close Time Queries (Tempo TraceQL) + +``` +# Find rounds where validators disagreed on close time +{name="consensus.accept.apply"} | xrpl.consensus.close_time_correct = false + +# Find consensus failures (moved_on) +{name="consensus.accept.apply"} | xrpl.consensus.state = "moved_on" + +# Find slow ledger applications (>5s) +{name="consensus.accept.apply"} | duration > 5s + +# Find specific ledger's consensus details +{name="consensus.accept.apply"} | xrpl.consensus.ledger.seq = 92345678 +``` + +## Prometheus Metrics (Spanmetrics) + +The OTel Collector's spanmetrics connector automatically derives RED 
(Rate, Errors, Duration) metrics from every span. No custom metrics code is needed in rippled. + +### Generated Metric Names + +| Prometheus Metric | Type | Description | +| -------------------------------------------------- | --------- | ---------------------------- | +| `traces_span_metrics_calls_total` | Counter | Total span invocations | +| `traces_span_metrics_duration_milliseconds_bucket` | Histogram | Latency distribution buckets | +| `traces_span_metrics_duration_milliseconds_count` | Histogram | Latency observation count | +| `traces_span_metrics_duration_milliseconds_sum` | Histogram | Cumulative latency | + +### Metric Labels + +Every metric carries these standard labels: + +| Label | Source | Example | +| -------------- | ------------------ | ---------------------------------------- | +| `span_name` | Span name | `rpc.command.server_info` | +| `status_code` | Span status | `STATUS_CODE_UNSET`, `STATUS_CODE_ERROR` | +| `service_name` | Resource attribute | `rippled` | +| `span_kind` | Span kind | `SPAN_KIND_INTERNAL` | + +Additionally, span attributes configured as dimensions in the collector become metric labels (dots → underscores): + +| Span Attribute | Metric Label | Applies To | +| --------------------- | --------------------- | ------------------------------ | +| `xrpl.rpc.command` | `xrpl_rpc_command` | `rpc.command.*` spans | +| `xrpl.rpc.status` | `xrpl_rpc_status` | `rpc.command.*` spans | +| `xrpl.consensus.mode` | `xrpl_consensus_mode` | `consensus.ledger_close` spans | +| `xrpl.tx.local` | `xrpl_tx_local` | `tx.process` spans | + +### Histogram Buckets + +Configured in `otel-collector-config.yaml`: + +``` +1ms, 5ms, 10ms, 25ms, 50ms, 100ms, 250ms, 500ms, 1s, 5s +``` + +## Grafana Dashboards + +Three dashboards are pre-provisioned in `docker/telemetry/grafana/dashboards/`: + +### RPC Performance (`rippled-rpc-perf`) + +| Panel | Type | PromQL | Labels Used | +| --------------------------- | ---------- | 
-------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------- | +| RPC Request Rate by Command | timeseries | `sum by (xrpl_rpc_command) (rate(traces_span_metrics_calls_total{span_name=~"rpc.command.*"}[5m]))` | `xrpl_rpc_command` | +| RPC Latency p95 by Command | timeseries | `histogram_quantile(0.95, sum by (le, xrpl_rpc_command) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=~"rpc.command.*"}[5m])))` | `xrpl_rpc_command` | +| RPC Error Rate | bargauge | Error spans / total spans × 100, grouped by `xrpl_rpc_command` | `xrpl_rpc_command`, `status_code` | +| RPC Latency Heatmap | heatmap | `sum(increase(traces_span_metrics_duration_milliseconds_bucket{span_name=~"rpc.command.*"}[5m])) by (le)` | `le` (bucket boundaries) | + +### Transaction Overview (`rippled-transactions`) + +| Panel | Type | PromQL | Labels Used | +| --------------------------------- | ---------- | -------------------------------------------------------------------------------------------- | --------------- | +| Transaction Processing Rate | timeseries | `rate(traces_span_metrics_calls_total{span_name="tx.process"}[5m])` and `tx.receive` | `span_name` | +| Transaction Processing Latency | timeseries | `histogram_quantile(0.95 / 0.50, ... {span_name="tx.process"})` | — | +| Transaction Path Distribution | piechart | `sum by (xrpl_tx_local) (rate(traces_span_metrics_calls_total{span_name="tx.process"}[5m]))` | `xrpl_tx_local` | +| Transaction Receive vs Suppressed | timeseries | `rate(traces_span_metrics_calls_total{span_name="tx.receive"}[5m])` | — | + +### Consensus Health (`rippled-consensus`) + +| Panel | Type | PromQL | Labels Used | +| ----------------------------- | ---------- | ---------------------------------------------------------------------------------- | ----------- | +| Consensus Round Duration | timeseries | `histogram_quantile(0.95 / 0.50, ... 
{span_name="consensus.accept"})` | — | +| Consensus Proposals Sent Rate | timeseries | `rate(traces_span_metrics_calls_total{span_name="consensus.proposal.send"}[5m])` | — | +| Ledger Close Duration | timeseries | `histogram_quantile(0.95, ... {span_name="consensus.ledger_close"})` | — | +| Validation Send Rate | stat | `rate(traces_span_metrics_calls_total{span_name="consensus.validation.send"}[5m])` | — | +| Ledger Apply Duration | timeseries | `histogram_quantile(0.95 / 0.50, ... {span_name="consensus.accept.apply"})` | — | +| Close Time Agreement | timeseries | `rate(traces_span_metrics_calls_total{span_name="consensus.accept.apply"}[5m])` | — | + +### Span → Metric → Dashboard Summary + +| Span Name | Prometheus Metric Filter | Grafana Dashboard | +| --------------------------- | ----------------------------------------- | --------------------------------------------- | +| `rpc.request` | `{span_name="rpc.request"}` | — (available but not paneled) | +| `rpc.process` | `{span_name="rpc.process"}` | — (available but not paneled) | +| `rpc.command.*` | `{span_name=~"rpc.command.*"}` | RPC Performance (all 4 panels) | +| `tx.process` | `{span_name="tx.process"}` | Transaction Overview (3 panels) | +| `tx.receive` | `{span_name="tx.receive"}` | Transaction Overview (2 panels) | +| `consensus.accept` | `{span_name="consensus.accept"}` | Consensus Health (Round Duration) | +| `consensus.proposal.send` | `{span_name="consensus.proposal.send"}` | Consensus Health (Proposals Rate) | +| `consensus.ledger_close` | `{span_name="consensus.ledger_close"}` | Consensus Health (Close Duration) | +| `consensus.validation.send` | `{span_name="consensus.validation.send"}` | Consensus Health (Validation Rate) | +| `consensus.accept.apply` | `{span_name="consensus.accept.apply"}` | Consensus Health (Apply Duration, Close Time) | + +## Troubleshooting + +### No traces appearing in Jaeger + +1. Check rippled logs for `Telemetry starting` message +2. 
Verify `enabled=1` in the `[telemetry]` config section +3. Test collector connectivity: `curl -v http://localhost:4318/v1/traces` +4. Check collector logs: `docker compose logs otel-collector` + +### High memory usage + +- Reduce `sampling_ratio` (e.g., `0.1` for 10% sampling) +- Reduce `max_queue_size` and `batch_size` +- Disable high-volume trace categories: `trace_peer=0` + +### Collector connection failures + +- Verify endpoint URL matches collector address +- Check firewall rules for ports 4317/4318 +- If using TLS, verify certificate path with `tls_ca_cert` + +## Performance Tuning + +| Scenario | Recommendation | +| ------------------------ | ------------------------------------------------- | +| Production mainnet | `sampling_ratio=0.01`, `trace_peer=0` | +| Testnet/devnet | `sampling_ratio=1.0` (full tracing) | +| Debugging specific issue | `sampling_ratio=1.0` temporarily | +| High-throughput node | Increase `batch_size=1024`, `max_queue_size=4096` | + +## Disabling Telemetry + +Set `enabled=0` in config (runtime disable) or build without the flag: + +```bash +cmake --preset default -Dtelemetry=OFF +``` + +When telemetry is compiled out, all trace macros expand to no-ops with zero overhead. diff --git a/include/xrpl/proto/xrpl.proto b/include/xrpl/proto/xrpl.proto index c2632d8ea9..acf2541695 100644 --- a/include/xrpl/proto/xrpl.proto +++ b/include/xrpl/proto/xrpl.proto @@ -91,6 +91,10 @@ message TraceContext { optional bytes trace_id = 1; // 16-byte trace identifier optional bytes span_id = 2; // 8-byte parent span identifier optional uint32 trace_flags = 3; // bit 0 = sampled + // TODO: trace_state is reserved for W3C tracestate vendor-specific + // key-value pairs but is not yet read or written by + // TraceContextPropagator. Wire it when cross-vendor trace + // propagation is needed. 
optional string trace_state = 4; // W3C tracestate header value } diff --git a/include/xrpl/telemetry/TraceContextPropagator.h b/include/xrpl/telemetry/TraceContextPropagator.h index b897541267..f5135e349c 100644 --- a/include/xrpl/telemetry/TraceContextPropagator.h +++ b/include/xrpl/telemetry/TraceContextPropagator.h @@ -6,6 +6,13 @@ Protocol Buffer TraceContext messages (P2P cross-node propagation). Only compiled when XRPL_ENABLE_TELEMETRY is defined. + + TODO: These utilities are not yet wired into the P2P message flow. + To enable cross-node distributed traces, call injectToProtobuf() in + PeerImp when sending TMTransaction/TMProposeSet messages, and call + extractFromProtobuf() in the corresponding message handlers to + reconstruct the parent span context before starting a child span. + This was deferred to validate single-node tracing performance first. */ #ifdef XRPL_ENABLE_TELEMETRY diff --git a/src/libxrpl/telemetry/Telemetry.cpp b/src/libxrpl/telemetry/Telemetry.cpp index fb70fd66db..9e0ee54ce5 100644 --- a/src/libxrpl/telemetry/Telemetry.cpp +++ b/src/libxrpl/telemetry/Telemetry.cpp @@ -321,6 +321,13 @@ public: // Force flush with timeout to avoid blocking indefinitely // when the OTLP endpoint is unreachable. sdkProvider_->ForceFlush(std::chrono::milliseconds(5000)); + // TODO: sdkProvider_ is not thread-safe. This reset() races with + // getTracer() if any thread is still calling startSpan(). + // Currently safe because Application::stop() shuts down + // serverHandler_, overlay_, and jobQueue_ before calling + // telemetry_->stop() — so no callers should remain. If the + // shutdown order ever changes, add an std::atomic stopped_ + // flag checked in getTracer() to make this robust. 
sdkProvider_.reset(); trace_api::Provider::SetTracerProvider( opentelemetry::nostd::shared_ptr( diff --git a/src/xrpld/app/consensus/RCLConsensus.cpp b/src/xrpld/app/consensus/RCLConsensus.cpp index 0197c2ffcd..39133bc060 100644 --- a/src/xrpld/app/consensus/RCLConsensus.cpp +++ b/src/xrpld/app/consensus/RCLConsensus.cpp @@ -284,7 +284,7 @@ RCLConsensus::Adaptor::onClose( span.setAttribute( telemetry::cons_span::attr::ledgerSeq, static_cast(ledger.ledger_->header().seq + 1)); - span.setAttribute(telemetry::cons_span::attr::mode, to_string(mode).c_str()); + span.setAttribute(telemetry::cons_span::attr::mode, toDisplayString(mode).c_str()); bool const wrongLCL = mode == ConsensusMode::wrongLedger; bool const proposing = mode == ConsensusMode::proposing; @@ -906,8 +906,8 @@ RCLConsensus::Adaptor::onModeChange(ConsensusMode before, ConsensusMode after) { auto span = telemetry::SpanGuard::span( telemetry::TraceCategory::Consensus, telemetry::seg::consensus, "mode_change"); - span.setAttribute(telemetry::cons_span::attr::modeOld, to_string(before).c_str()); - span.setAttribute(telemetry::cons_span::attr::modeNew, to_string(after).c_str()); + span.setAttribute(telemetry::cons_span::attr::modeOld, toDisplayString(before).c_str()); + span.setAttribute(telemetry::cons_span::attr::modeNew, toDisplayString(after).c_str()); JLOG(j_.info()) << "Consensus mode change before=" << to_string(before) << ", after=" << to_string(after); @@ -1119,7 +1119,7 @@ RCLConsensus::Adaptor::startRoundTracing(RCLCxLedger const& prevLgr) roundSpan_->setAttribute(cons_span::attr::ledgerId, to_string(prevLgr.id()).c_str()); roundSpan_->setAttribute(cons_span::attr::ledgerSeq, static_cast(prevLgr.seq() + 1)); - roundSpan_->setAttribute(cons_span::attr::mode, to_string(mode_.load()).c_str()); + roundSpan_->setAttribute(cons_span::attr::mode, toDisplayString(mode_.load()).c_str()); roundSpan_->setAttribute(cons_span::attr::traceStrategy, strategy.c_str()); 
roundSpan_->setAttribute(cons_span::attr::roundId, static_cast(prevLgr.seq() + 1)); diff --git a/src/xrpld/app/main/Application.cpp b/src/xrpld/app/main/Application.cpp index f5a9dcbc4d..226be27ca0 100644 --- a/src/xrpld/app/main/Application.cpp +++ b/src/xrpld/app/main/Application.cpp @@ -1595,6 +1595,10 @@ ApplicationImp::run() ledgerCleaner_->stop(); m_nodeStore->stop(); perfLog_->stop(); + // Telemetry must stop last among trace-producing components. + // serverHandler_, overlay_, and jobQueue_ are already stopped above, + // so no threads should be calling startSpan() at this point. + // See TODO in TelemetryImpl::stop() re: thread-safety of sdkProvider_. telemetry_->stop(); JLOG(m_journal.info()) << "Done."; diff --git a/src/xrpld/consensus/ConsensusTypes.h b/src/xrpld/consensus/ConsensusTypes.h index 8aba48f34e..664e8d549c 100644 --- a/src/xrpld/consensus/ConsensusTypes.h +++ b/src/xrpld/consensus/ConsensusTypes.h @@ -66,6 +66,26 @@ to_string(ConsensusMode m) } } +/// Title Case display name for telemetry attributes and dashboards. +/// Separate from to_string() which is used in logs and must remain stable. +inline std::string +toDisplayString(ConsensusMode m) +{ + switch (m) + { + case ConsensusMode::proposing: + return "Proposing"; + case ConsensusMode::observing: + return "Observing"; + case ConsensusMode::wrongLedger: + return "Wrong Ledger"; + case ConsensusMode::switchedLedger: + return "Switched Ledger"; + default: + return "Unknown"; + } +} + /** Phases of consensus for a single ledger round. @code