From 87ed778efe33aeb42feb3a399953a5f574f73648 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Tue, 31 Mar 2026 22:27:06 +0100 Subject: [PATCH] refactor(telemetry): migrate integration test and docs from Jaeger to Tempo API Co-Authored-By: Claude Opus 4.6 (1M context) --- .../Phase5_IntegrationTest_taskList.md | 42 ++++++------- docker/telemetry/TESTING.md | 59 ++++++++++--------- docker/telemetry/integration-test.sh | 33 ++++++----- 3 files changed, 70 insertions(+), 64 deletions(-) diff --git a/OpenTelemetryPlan/Phase5_IntegrationTest_taskList.md b/OpenTelemetryPlan/Phase5_IntegrationTest_taskList.md index 4df073e00e..28f45afc75 100644 --- a/OpenTelemetryPlan/Phase5_IntegrationTest_taskList.md +++ b/OpenTelemetryPlan/Phase5_IntegrationTest_taskList.md @@ -2,11 +2,11 @@ > **Goal**: End-to-end verification of the complete telemetry pipeline using a > 6-node consensus network. Proves that RPC, transaction, and consensus spans -> flow through the observability stack (otel-collector, Jaeger, Prometheus, +> flow through the observability stack (otel-collector, Tempo, Prometheus, > Grafana) under realistic conditions. > > **Scope**: Integration test script, manual testing plan, 6-node local network -> setup, Jaeger/Prometheus/Grafana verification. +> setup, Tempo/Prometheus/Grafana verification. 
> > **Branch**: `pratik/otel-phase5-docs-deployment` @@ -14,7 +14,7 @@ | Document | Relevance | | ---------------------------------------------------------------- | ------------------------------------------ | -| [07-observability-backends.md](./07-observability-backends.md) | Jaeger, Grafana, Prometheus setup | +| [07-observability-backends.md](./07-observability-backends.md) | Tempo, Grafana, Prometheus setup | | [05-configuration-reference.md](./05-configuration-reference.md) | Collector config, Docker Compose | | [06-implementation-phases.md](./06-implementation-phases.md) | Phase 5 tasks, definition of done | | [Phase5_taskList.md](./Phase5_taskList.md) | Phase 5 main task list (5.6 = integration) | @@ -25,7 +25,7 @@ **Objective**: Automated bash script that stands up a 6-node xrpld network with telemetry, exercises all span categories, and verifies data in -Jaeger/Prometheus. +Tempo/Prometheus. **What to do**: @@ -43,7 +43,7 @@ Jaeger/Prometheus. - [ ] Script starts without errors - [ ] All 6 nodes reach "proposing" state -- [ ] Observability stack is healthy (otel-collector, Jaeger, Prometheus, Grafana) +- [ ] Observability stack is healthy (otel-collector, Tempo, Prometheus, Grafana) --- @@ -55,7 +55,7 @@ Jaeger/Prometheus. - Send `server_info`, `server_state`, `ledger` RPCs to node1 (port 5005) - Wait for batch export (5s) -- Query Jaeger API for: +- Query Tempo API for: - `rpc.request` spans (ServerHandler::onRequest) - `rpc.process` spans (ServerHandler::processRequest) - `rpc.command.server_info` spans (callMethod) @@ -65,9 +65,9 @@ Jaeger/Prometheus. **Verification**: -- [ ] Jaeger shows `rpc.request` traces -- [ ] Jaeger shows `rpc.process` traces -- [ ] Jaeger shows `rpc.command.*` traces with correct attributes +- [ ] Tempo shows `rpc.request` traces +- [ ] Tempo shows `rpc.process` traces +- [ ] Tempo shows `rpc.command.*` traces with correct attributes --- @@ -80,7 +80,7 @@ Jaeger/Prometheus. 
- Get genesis account sequence via `account_info` RPC - Submit Payment transaction using genesis seed (`snoPBrXtMeMyMHUVTgbuqAfg1SUTb`) - Wait for consensus inclusion (10s) -- Query Jaeger API for: +- Query Tempo API for: - `tx.process` spans (NetworkOPsImp::processTransaction) on submitting node - `tx.receive` spans (PeerImp::handleTransaction) on peer nodes - Verify `xrpl.tx.hash` attribute on `tx.process` spans @@ -88,8 +88,8 @@ Jaeger/Prometheus. **Verification**: -- [ ] Jaeger shows `tx.process` traces with `xrpl.tx.hash` -- [ ] Jaeger shows `tx.receive` traces with `xrpl.peer.id` +- [ ] Tempo shows `tx.process` traces with `xrpl.tx.hash` +- [ ] Tempo shows `tx.receive` traces with `xrpl.peer.id` --- @@ -100,7 +100,7 @@ Jaeger/Prometheus. **What to do**: - Consensus runs automatically in 6-node network -- Query Jaeger API for: +- Query Tempo API for: - `consensus.proposal.send` (Adaptor::propose) - `consensus.ledger_close` (Adaptor::onClose) - `consensus.accept` (Adaptor::onAccept) @@ -112,10 +112,10 @@ Jaeger/Prometheus. **Verification**: -- [ ] Jaeger shows `consensus.ledger_close` traces with `xrpl.consensus.mode` -- [ ] Jaeger shows `consensus.accept` traces with `xrpl.consensus.proposers` -- [ ] Jaeger shows `consensus.proposal.send` traces -- [ ] Jaeger shows `consensus.validation.send` traces +- [ ] Tempo shows `consensus.ledger_close` traces with `xrpl.consensus.mode` +- [ ] Tempo shows `consensus.accept` traces with `xrpl.consensus.proposers` +- [ ] Tempo shows `consensus.proposal.send` traces +- [ ] Tempo shows `consensus.validation.send` traces --- @@ -148,7 +148,7 @@ Jaeger/Prometheus. 
- Single-node standalone test (quick verification) - 6-node consensus test (full verification) - Expected span catalog (all 12 span names with attributes) - - Verification queries (Jaeger API, Prometheus API) + - Verification queries (Tempo API, Prometheus API) - Troubleshooting guide **Key new file**: `docker/telemetry/TESTING.md` @@ -171,14 +171,14 @@ Jaeger/Prometheus. - Debug any failures - Leave stack running for manual verification - Share URLs: - - Jaeger: `http://localhost:16686` + - Tempo: `http://localhost:3200` - Grafana: `http://localhost:3000` - Prometheus: `http://localhost:9090` **Verification**: - [ ] Script completes with all checks passing -- [ ] Jaeger UI shows rippled service with all expected span names +- [ ] Grafana Explore (Tempo datasource) shows rippled service with all expected span names - [ ] Grafana dashboards load and show data --- @@ -215,7 +215,7 @@ Jaeger/Prometheus. **Exit Criteria**: - [ ] All 6 xrpld nodes reach "proposing" state -- [ ] All 11 expected span names visible in Jaeger +- [ ] All 11 expected span names visible in Tempo - [ ] Spanmetrics available in Prometheus - [ ] Grafana dashboards show data - [ ] Manual testing plan document complete diff --git a/docker/telemetry/TESTING.md b/docker/telemetry/TESTING.md index 4d9853e8f7..be36f91d32 100644 --- a/docker/telemetry/TESTING.md +++ b/docker/telemetry/TESTING.md @@ -2,7 +2,7 @@ This document describes how to verify the rippled OpenTelemetry telemetry pipeline end-to-end, from span generation through the observability stack -(otel-collector, Jaeger, Prometheus, Grafana). +(otel-collector, Tempo, Prometheus, Grafana). 
--- @@ -49,8 +49,8 @@ Wait for services to be ready: # otel-collector health curl -sf http://localhost:13133/ && echo "collector ready" -# Jaeger UI -curl -sf http://localhost:16686/ > /dev/null && echo "jaeger ready" +# Tempo readiness +curl -sf http://localhost:3200/ready > /dev/null && echo "tempo ready" ``` ### Step 2: Start xrpld in standalone mode @@ -111,32 +111,36 @@ Close the ledger again to finalize: curl -s http://localhost:5005 -d '{"method":"ledger_accept"}' ``` -### Step 5: Verify traces in Jaeger +### Step 5: Verify traces in Tempo Wait 5 seconds for the batch export, then: ```bash -JAEGER="http://localhost:16686" +TEMPO="http://localhost:3200" # Check rippled service is registered -curl -s "$JAEGER/api/services" | jq '.data' +curl -s "$TEMPO/api/v2/search/tag/resource.service.name/values" | jq '.tagValues[].value' # Check RPC spans -curl -s "$JAEGER/api/traces?service=rippled&operation=rpc.request&limit=5&lookback=1h" \ - | jq '.data | length' +curl -s -G "$TEMPO/api/search" \ + --data-urlencode 'q={resource.service.name="rippled" && name="rpc.request"}' \ + --data-urlencode 'limit=5' | jq '.traces | length' -curl -s "$JAEGER/api/traces?service=rippled&operation=rpc.process&limit=5&lookback=1h" \ - | jq '.data | length' +curl -s -G "$TEMPO/api/search" \ + --data-urlencode 'q={resource.service.name="rippled" && name="rpc.process"}' \ + --data-urlencode 'limit=5' | jq '.traces | length' -curl -s "$JAEGER/api/traces?service=rippled&operation=rpc.command.server_info&limit=5&lookback=1h" \ - | jq '.data | length' +curl -s -G "$TEMPO/api/search" \ + --data-urlencode 'q={resource.service.name="rippled" && name="rpc.command.server_info"}' \ + --data-urlencode 'limit=5' | jq '.traces | length' # Check transaction spans -curl -s "$JAEGER/api/traces?service=rippled&operation=tx.process&limit=5&lookback=1h" \ - | jq '.data | length' +curl -s -G "$TEMPO/api/search" \ + --data-urlencode 'q={resource.service.name="rippled" && name="tx.process"}' \ + --data-urlencode 'limit=5' 
| jq '.traces | length' ``` -Or open the Jaeger UI: http://localhost:16686 +Or open Grafana Explore with Tempo datasource: http://localhost:3000 ### Step 6: Teardown @@ -189,7 +193,7 @@ The script will: 4. Start all 6 nodes 5. Wait for consensus ("proposing" state) 6. Exercise RPC, submit transactions -7. Verify all span categories in Jaeger +7. Verify all span categories in Tempo 8. Verify spanmetrics in Prometheus 9. Print results and leave the stack running @@ -362,7 +366,7 @@ curl -s http://localhost:5005 -d '{ Wait 15 seconds for consensus and batch export. -#### Step 8: Verify in Jaeger +#### Step 8: Verify in Tempo See the "Verification Queries" section below. @@ -390,18 +394,15 @@ All 12 production span names instrumented across Phases 2-4: ## Verification Queries -### Jaeger API +### Tempo API -Base URL: `http://localhost:16686` +Base URL: `http://localhost:3200` ```bash -JAEGER="http://localhost:16686" +TEMPO="http://localhost:3200" # List all services -curl -s "$JAEGER/api/services" | jq '.data' - -# List operations for rippled -curl -s "$JAEGER/api/services/rippled/operations" | jq '.data' +curl -s "$TEMPO/api/v2/search/tag/resource.service.name/values" | jq '.tagValues[].value' # Query traces by operation for op in "rpc.request" "rpc.process" \ @@ -410,8 +411,10 @@ for op in "rpc.request" "rpc.process" \ "consensus.proposal.send" "consensus.ledger_close" \ "consensus.accept" "consensus.accept.apply" \ "consensus.validation.send"; do - count=$(curl -s "$JAEGER/api/traces?service=rippled&operation=$op&limit=5&lookback=1h" \ - | jq '.data | length') + count=$(curl -s -G "$TEMPO/api/search" \ + --data-urlencode "q={resource.service.name=\"rippled\" && name=\"$op\"}" \ + --data-urlencode "limit=5" \ + | jq '.traces | length') printf "%-35s %s traces\n" "$op" "$count" done ``` @@ -448,14 +451,14 @@ Pre-configured dashboards: Pre-configured datasources: -- **Jaeger**: Trace data at `http://jaeger:16686` +- **Tempo**: Trace data at `http://tempo:3200` - 
**Prometheus**: Metrics at `http://prometheus:9090` --- ## Troubleshooting -### No traces in Jaeger +### No traces in Tempo 1. Check otel-collector logs: ```bash diff --git a/docker/telemetry/integration-test.sh b/docker/telemetry/integration-test.sh index 86b423fd03..1a48aa324a 100755 --- a/docker/telemetry/integration-test.sh +++ b/docker/telemetry/integration-test.sh @@ -3,7 +3,7 @@ # # Launches a 6-node xrpld consensus network with telemetry enabled, # exercises RPC / transaction / consensus code paths, then verifies -# that the expected spans and metrics appear in Jaeger and Prometheus. +# that the expected spans and metrics appear in Tempo and Prometheus. # # Usage: # bash docker/telemetry/integration-test.sh @@ -14,7 +14,7 @@ # - curl, jq # # The script leaves the observability stack and xrpld nodes running -# so you can manually inspect Jaeger (localhost:16686) and Grafana +# so you can manually inspect Tempo (localhost:3200) and Grafana # (localhost:3000). Run with --cleanup to tear down instead. set -euo pipefail @@ -35,7 +35,7 @@ CONSENSUS_TIMEOUT=120 GENESIS_ACCOUNT="rHb9CJAWyB4rj91VRWn96DkukG4bwdtyTh" GENESIS_SEED="snoPBrXtMeMyMHUVTgbuqAfg1SUTb" DEST_ACCOUNT="" # Generated dynamically via wallet_propose -JAEGER="http://localhost:16686" +TEMPO="http://localhost:3200" PROM="http://localhost:9090" # Counters for pass/fail @@ -53,8 +53,10 @@ die() { printf "\033[1;31m[ERROR]\033[0m %s\n" "$*" >&2; exit 1; } check_span() { local op="$1" local count - count=$(curl -sf "$JAEGER/api/traces?service=rippled&operation=$op&limit=5&lookback=1h" \ - | jq '.data | length' 2>/dev/null || echo 0) + count=$(curl -sf -G "$TEMPO/api/search" \ + --data-urlencode "q={resource.service.name=\"rippled\" && name=\"$op\"}" \ + --data-urlencode "limit=5" \ + | jq '.traces | length' 2>/dev/null || echo 0) if [ "$count" -gt 0 ]; then ok "$op ($count traces)" else @@ -141,14 +143,14 @@ for attempt in $(seq 1 30); do sleep 1 done -log "Waiting for Jaeger to be ready..." 
+log "Waiting for Tempo to be ready..." for attempt in $(seq 1 30); do - if curl -sf "$JAEGER/" >/dev/null 2>&1; then - log "Jaeger ready (attempt $attempt)." + if curl -sf "$TEMPO/ready" >/dev/null 2>&1; then + log "Tempo ready (attempt $attempt)." break fi if [ "$attempt" -eq 30 ]; then - die "Jaeger not ready after 30s" + die "Tempo not ready after 30s" fi sleep 1 done @@ -458,16 +460,17 @@ log "Waiting 15s for consensus round + batch export..." sleep 15 # --------------------------------------------------------------------------- -# Step 9: Verify Jaeger traces +# Step 9: Verify Tempo traces # --------------------------------------------------------------------------- -log "Verifying spans in Jaeger..." +log "Verifying spans in Tempo..." # Check service registration -services=$(curl -sf "$JAEGER/api/services" | jq -r '.data[]' 2>/dev/null || echo "") +services=$(curl -sf "$TEMPO/api/v2/search/tag/resource.service.name/values" \ + | jq -r '.tagValues[].value' 2>/dev/null || echo "") if echo "$services" | grep -q "rippled"; then - ok "Service 'rippled' registered in Jaeger" + ok "Service 'rippled' registered in Tempo" else - fail "Service 'rippled' NOT found in Jaeger (found: $services)" + fail "Service 'rippled' NOT found in Tempo (found: $services)" fi log "" @@ -534,7 +537,7 @@ echo "===========================================================" echo "" echo " Observability stack is running:" echo "" -echo " Jaeger UI: http://localhost:16686" +echo " Tempo: http://localhost:3200" echo " Grafana: http://localhost:3000" echo " Prometheus: http://localhost:9090" echo ""