From cb9fce689066763a2cd1fe58c23ffb96a1fa1f99 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Fri, 5 Jun 2026 17:08:58 +0100 Subject: [PATCH] fix(telemetry): align Phase 10 workload harness with current OTel recording surface + fix CI The Phase 10 validation harness had drifted from the code's recording surface and the telemetry-validation CI job was failing before it could build. CI fix (telemetry-validation.yml): - Replace nonexistent local action ./.github/actions/print-env with the remote XRPLF/actions/print-build-env (the build-xrpld job failed in 56s on this). - Sync prepare-runner and upload-artifact action SHAs to the canonical workflow. Recording-surface reconciliation (docker/telemetry/workload/): - Migrate span attributes from dotted xrpl.<category>.<name> to the bare/underscore form introduced by the 2026-05-13 span-attr naming redesign (tx_hash, peer_id, ledger_seq, consensus_mode, consensus_round, full_validation, quorum, ...). Dotted xrpl.ledger.hash is retained only on peer.validation.receive (shared constant), while consensus.validation.send uses bare ledger_hash. - Fix attribute placement: tx.apply carries tx_count/tx_failed (not ledger_seq); ledger.build carries ledger_seq/close_* (not tx_count/tx_failed). - Replace the phantom rpc.request span with the real WS root rpc.ws_message; drop the never-emitted duration_ms; rebuild the parent-child map accordingly. - Add the new spans the code emits: apply-pipeline stage spans (tx.preflight/preclaim/transactor with stage/tx_type/ter_result), txq.*, consensus sub-spans (round/establish/update_positions/check/phase.open), ledger.acquire, grpc.*, pathfind.*. Conditional spans are marked optional so they are skipped (not failed) when the workload does not exercise them. 
- validate_telemetry.py: service.name and Loki job label rippled -> xrpld; fix PARITY_SPAN_ATTRS (rename the 4 real attrs, drop the 3 that are metrics not span attrs); add optional-span handling that skips missing optional spans while still validating attributes when present. - expected_metrics.json: rippled_ -> xrpld_ on all beast::insight/overlay metrics, xrpld_job_count, the 15 on-disk xrpld-* dashboard UIDs, and the real bare spanmetrics dimension labels. - regression-metrics.json + baseline-timings.json: rpc.request -> rpc.ws_message. Metrics pipeline fix: - Switch node [insight] config from server=statsd/prefix=rippled to server=otel + /v1/metrics endpoint + prefix=xrpld across run-full-validation.sh, xrpld-validator.cfg.template, benchmark.sh and the workload compose. The collector has no StatsD receiver, so system metrics only reach Prometheus over OTLP. Synthetic load for new spans: - Add ripple_path_find to the RPC load generator (drives pathfind.* spans). - Add a high-TPS txq-burst workload phase to force fee escalation (drives txq.*). All facts verified against the *SpanNames.h headers and a live xrpld node + collector (Tempo service.name=xrpld, tx.preflight attrs [stage,ter_result,tx_type], 279 xrpld_ Prometheus metrics and zero rippled_). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/telemetry-validation.yml | 10 +- OpenTelemetryPlan/Phase10_taskList.md | 11 +- docker/telemetry/docker-compose.workload.yaml | 5 +- docker/telemetry/workload/README.md | 10 +- .../workload/baselines/baseline-timings.json | 6 +- docker/telemetry/workload/benchmark.sh | 6 +- .../telemetry/workload/expected_metrics.json | 97 +++-- docker/telemetry/workload/expected_spans.json | 376 ++++++++++++++---- .../workload/regression-metrics.json | 2 +- .../telemetry/workload/rpc_load_generator.py | 14 +- .../telemetry/workload/run-full-validation.sh | 10 +- .../telemetry/workload/validate_telemetry.py | 109 +++-- .../telemetry/workload/workload-profiles.json | 10 + .../workload/xrpld-validator.cfg.template | 15 +- 14 files changed, 503 insertions(+), 178 deletions(-) diff --git a/.github/workflows/telemetry-validation.yml b/.github/workflows/telemetry-validation.yml index def72009a3..8b4bdcf952 100644 --- a/.github/workflows/telemetry-validation.yml +++ b/.github/workflows/telemetry-validation.yml @@ -88,12 +88,12 @@ jobs: uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Prepare runner - uses: XRPLF/actions/prepare-runner@2cbf481018d930656e9276fcc20dc0e3a0be5b6d + uses: XRPLF/actions/prepare-runner@90f11ee655d1687824fb8793db770477d52afbab with: enable_ccache: ${{ github.repository_owner == 'XRPLF' }} - name: Print build environment - uses: ./.github/actions/print-env + uses: XRPLF/actions/print-build-env@59dec886e4afb05a1724443af08baccbc045b574 - name: Get number of processors uses: XRPLF/actions/get-nproc@cf0433aa74563aead044a1e395610c96d65a37cf @@ -136,7 +136,7 @@ jobs: run: ccache --show-stats -vv - name: Upload xrpld binary - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: xrpld-telemetry path: ${{ env.BUILD_DIR }}/xrpld @@ -193,7 +193,7 @@ jobs: - 
name: Upload validation reports if: always() - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: telemetry-validation-reports path: /tmp/xrpld-validation/reports/ @@ -201,7 +201,7 @@ jobs: - name: Upload node logs if: failure() - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: xrpld-node-logs path: /tmp/xrpld-validation/node*/debug.log diff --git a/OpenTelemetryPlan/Phase10_taskList.md b/OpenTelemetryPlan/Phase10_taskList.md index a718f9e777..7022652b9e 100644 --- a/OpenTelemetryPlan/Phase10_taskList.md +++ b/OpenTelemetryPlan/Phase10_taskList.md @@ -22,7 +22,7 @@ Before Phases 1-9 can be considered production-ready, we need proof that: -1. All 16 spans fire with correct attributes under real transaction workloads +1. All required spans fire with correct attributes under real transaction workloads 2. All 255+ StatsD metrics + ~50 Phase 9 metrics appear in Prometheus with non-zero values 3. Log-trace correlation (Phase 8) produces clickable trace_id links in Loki 4. 
All 10 Grafana dashboards render meaningful data (no empty panels) @@ -122,9 +122,12 @@ Before Phases 1-9 can be considered production-ready, we need proof that: - Create `docker/telemetry/workload/validate_telemetry.py`: **Span validation** (queries Tempo API): - - Assert all 16 span names appear in traces - - Assert each span has its required attributes (22 total attributes across spans) - - Assert parent-child relationships are correct (`rpc.request` → `rpc.process` → `rpc.command.*`) + - Assert all required span names appear in traces (conditional spans — `grpc.*`, + `ledger.acquire`, `txq.*`, `consensus.mode_change` — are marked `optional` and + skipped when not exercised by the workload) + - Assert each span has its required attributes (bare/underscore keys per the + 2026-05-13 span-attr naming redesign; dotted `xrpl.*` reserved for resource attrs) + - Assert parent-child relationships are correct (`rpc.ws_message` → `rpc.process` → `rpc.command.*`) - Assert span durations are reasonable (> 0, < 60s) **Metric validation** (queries Prometheus API): diff --git a/docker/telemetry/docker-compose.workload.yaml b/docker/telemetry/docker-compose.workload.yaml index d63f6155c8..cd58fa34d5 100644 --- a/docker/telemetry/docker-compose.workload.yaml +++ b/docker/telemetry/docker-compose.workload.yaml @@ -2,7 +2,7 @@ # # Runs a 5-node validator cluster with full OTel telemetry stack: # - 5 rippled validator nodes (consensus network) -# - OTel Collector (traces + StatsD metrics) +# - OTel Collector (traces + native OTLP metrics) # - Tempo (trace backend + search API) # - Prometheus (metrics) # - Loki (log aggregation for log-trace correlation) @@ -34,8 +34,7 @@ services: command: ["--config=/etc/otel-collector-config.yaml"] ports: - "4317:4317" # OTLP gRPC - - "4318:4318" # OTLP HTTP - - "8125:8125/udp" # StatsD UDP (beast::insight metrics) + - "4318:4318" # OTLP HTTP (traces + beast::insight metrics) - "8889:8889" # Prometheus metrics endpoint - "13133:13133" # Health 
check volumes: diff --git a/docker/telemetry/workload/README.md b/docker/telemetry/workload/README.md index 5f28cf42e1..eb926af6b9 100644 --- a/docker/telemetry/workload/README.md +++ b/docker/telemetry/workload/README.md @@ -288,10 +288,10 @@ The validation report (`validation-report.json`) is structured as: }, "checks": [ { - "name": "span.rpc.request", + "name": "span.rpc.ws_message", "category": "span", "passed": true, - "message": "rpc.request: 15 traces found", + "message": "rpc.ws_message: 15 traces found", "details": { "trace_count": 15 } } ] @@ -347,10 +347,10 @@ required attributes, and the `config_flag` that must be enabled: ```json { - "name": "rpc.request", + "name": "rpc.command.*", "category": "rpc", - "parent": null, - "required_attributes": ["rpc.method", "rpc.grpc.status_code"], + "parent": "rpc.process", + "required_attributes": ["command", "version", "rpc_role", "rpc_status"], "config_flag": "trace_rpc" } ``` diff --git a/docker/telemetry/workload/baselines/baseline-timings.json b/docker/telemetry/workload/baselines/baseline-timings.json index 6b898fd95d..be0d4ddbd4 100644 --- a/docker/telemetry/workload/baselines/baseline-timings.json +++ b/docker/telemetry/workload/baselines/baseline-timings.json @@ -90,15 +90,15 @@ "unit": "ms", "value": null }, - "span.rpc.request.p50": { + "span.rpc.ws_message.p50": { "unit": "ms", "value": null }, - "span.rpc.request.p95": { + "span.rpc.ws_message.p95": { "unit": "ms", "value": null }, - "span.rpc.request.p99": { + "span.rpc.ws_message.p99": { "unit": "ms", "value": null }, diff --git a/docker/telemetry/workload/benchmark.sh b/docker/telemetry/workload/benchmark.sh index ac419637cc..1ef73755c9 100755 --- a/docker/telemetry/workload/benchmark.sh +++ b/docker/telemetry/workload/benchmark.sh @@ -156,9 +156,9 @@ trace_peer=1 trace_ledger=1 [insight] -server=statsd -address=127.0.0.1:8125 -prefix=rippled" +server=otel +endpoint=http://localhost:4318/v1/metrics +prefix=xrpld" else telemetry_section=" [telemetry] 
diff --git a/docker/telemetry/workload/expected_metrics.json b/docker/telemetry/workload/expected_metrics.json index 0d5f0aa9ec..2758dc32ac 100644 --- a/docker/telemetry/workload/expected_metrics.json +++ b/docker/telemetry/workload/expected_metrics.json @@ -1,5 +1,5 @@ { - "description": "Expected metric inventory for rippled telemetry validation. Sourced from 09-data-collection-reference.md.", + "description": "Expected metric inventory for xrpld telemetry validation. Metric names use the xrpld_ prefix (the [insight] prefix and OTel resource service name). Sourced from the live Grafana dashboards and MetricsRegistry.cpp.", "spanmetrics": { "description": "SpanMetrics-derived RED metrics from the OTel Collector spanmetrics connector.", "metrics": [ @@ -15,40 +15,48 @@ "span_kind" ], "dimension_labels": [ - "xrpl_rpc_command", - "xrpl_rpc_status", - "xrpl_consensus_mode", - "xrpl_tx_local", - "xrpl_peer_proposal_trusted", - "xrpl_peer_validation_trusted" - ] + "command", + "rpc_status", + "consensus_mode", + "local", + "proposal_trusted", + "validation_trusted", + "tx_type", + "ter_result", + "stage", + "txq_status", + "close_time_correct", + "consensus_state", + "suppressed" + ], + "_dimension_labels_note": "Bare label names as configured in otel-collector-config.yaml spanmetrics dimensions. Informational only (not asserted by the validator)." 
}, "statsd_gauges": { - "description": "beast::insight gauges emitted via StatsD UDP.", + "description": "beast::insight gauges exported via OTLP/HTTP to the collector (server=otel).", "metrics": [ - "rippled_LedgerMaster_Validated_Ledger_Age", - "rippled_LedgerMaster_Published_Ledger_Age", - "rippled_State_Accounting_Full_duration", - "rippled_Peer_Finder_Active_Inbound_Peers", - "rippled_Peer_Finder_Active_Outbound_Peers", - "rippled_jobq_job_count" + "xrpld_LedgerMaster_Validated_Ledger_Age", + "xrpld_LedgerMaster_Published_Ledger_Age", + "xrpld_State_Accounting_Full_duration", + "xrpld_Peer_Finder_Active_Inbound_Peers", + "xrpld_Peer_Finder_Active_Outbound_Peers", + "xrpld_job_count" ] }, "statsd_counters": { - "description": "beast::insight counters emitted via StatsD UDP. The OTel Prometheus exporter appends _total to monotonic counters.", - "metrics": ["rippled_rpc_requests_total", "rippled_ledger_fetches_total"] + "description": "beast::insight counters exported via OTLP/HTTP. The OTel Prometheus exporter appends _total to monotonic counters.", + "metrics": ["xrpld_rpc_requests_total", "xrpld_ledger_fetches_total"] }, "statsd_histograms": { - "description": "beast::insight timers/histograms emitted via StatsD UDP.", - "metrics": ["rippled_rpc_time", "rippled_rpc_size"] + "description": "beast::insight timers/histograms exported via OTLP/HTTP.", + "metrics": ["xrpld_rpc_time", "xrpld_rpc_size"] }, "overlay_traffic": { "description": "Overlay traffic metrics (subset — full list has 45+ categories).", "metrics": [ - "rippled_total_Bytes_In", - "rippled_total_Bytes_Out", - "rippled_total_Messages_In", - "rippled_total_Messages_Out" + "xrpld_total_Bytes_In", + "xrpld_total_Bytes_Out", + "xrpld_total_Messages_In", + "xrpld_total_Messages_Out" ] }, "phase9_nodestore": { @@ -76,39 +84,39 @@ "metrics": ["xrpld_load_factor_metrics"] }, "parity_validation_agreement": { - "description": "External dashboard parity: validation agreement percentages (push_metrics.py).", + 
"description": "External dashboard parity: validation agreement percentages (MetricsRegistry).", "metrics": [ "xrpld_validation_agreement{metric=\"agreement_pct_1h\"}", "xrpld_validation_agreement{metric=\"agreement_pct_24h\"}" ] }, "parity_validator_health": { - "description": "External dashboard parity: validator health indicators (push_metrics.py).", + "description": "External dashboard parity: validator health indicators (MetricsRegistry).", "metrics": [ "xrpld_validator_health{metric=\"amendment_blocked\"}", "xrpld_validator_health{metric=\"unl_expiry_days\"}" ] }, "parity_peer_quality": { - "description": "External dashboard parity: peer quality metrics (push_metrics.py).", + "description": "External dashboard parity: peer quality metrics (MetricsRegistry).", "metrics": [ "xrpld_peer_quality{metric=\"peer_latency_p90_ms\"}", "xrpld_peer_quality{metric=\"peers_insane_count\"}" ] }, "parity_ledger_economy": { - "description": "External dashboard parity: ledger economy metrics (push_metrics.py).", + "description": "External dashboard parity: ledger economy metrics (MetricsRegistry).", "metrics": [ "xrpld_ledger_economy{metric=\"base_fee_xrp\"}", "xrpld_ledger_economy{metric=\"transaction_rate\"}" ] }, "parity_state_tracking": { - "description": "External dashboard parity: server state tracking (push_metrics.py).", + "description": "External dashboard parity: server state tracking (MetricsRegistry).", "metrics": ["xrpld_state_tracking{metric=\"state_value\"}"] }, "parity_counters": { - "description": "External dashboard parity: monotonic counters (push_metrics.py).", + "description": "External dashboard parity: monotonic counters (MetricsRegistry).", "metrics": [ "xrpld_ledgers_closed_total", "xrpld_validations_sent_total", @@ -116,24 +124,27 @@ ] }, "parity_storage": { - "description": "External dashboard parity: storage detail metrics (push_metrics.py).", + "description": "External dashboard parity: storage detail metrics (MetricsRegistry).", "metrics": 
["xrpld_storage_detail{metric=\"nudb_bytes\"}"] }, "grafana_dashboards": { - "description": "All 13 Grafana dashboards that must render data.", + "description": "All Grafana dashboards that must render data (UIDs as provisioned on disk under docker/telemetry/grafana/dashboards/).", "uids": [ - "rippled-rpc-perf", - "rippled-transactions", - "rippled-consensus", - "rippled-ledger-ops", - "rippled-peer-net", - "rippled-system-node-health", - "rippled-system-network", - "rippled-system-rpc", - "rippled-system-overlay-detail", - "rippled-system-ledger-sync", - "rippled-validator-health", - "rippled-peer-quality" + "xrpld-rpc-perf", + "xrpld-rpc-perf-otel", + "xrpld-transactions", + "xrpld-consensus", + "xrpld-ledger-ops", + "xrpld-peer-net", + "xrpld-peer-quality", + "xrpld-fee-market", + "xrpld-job-queue", + "xrpld-validator-health", + "xrpld-system-node-health", + "xrpld-system-network", + "xrpld-system-rpc", + "xrpld-system-overlay-detail", + "xrpld-system-ledger-sync" ] } } diff --git a/docker/telemetry/workload/expected_spans.json b/docker/telemetry/workload/expected_spans.json index 7c00960dd0..c8f3cc8246 100644 --- a/docker/telemetry/workload/expected_spans.json +++ b/docker/telemetry/workload/expected_spans.json @@ -1,24 +1,18 @@ { - "description": "Expected span inventory for rippled telemetry validation. Sourced from 09-data-collection-reference.md.", + "description": "Expected span inventory for xrpld telemetry validation. Attribute keys follow the 2026-05-13 span-attr naming redesign (bare/underscore form; dotted xrpl.* reserved for resource attributes). Sourced from the *SpanNames.h headers. Spans marked \"optional\": true are conditional — they only fire under traffic the harness may not produce (e.g. 
gRPC client, missing-ledger fetch, mode transitions) and are not failed when absent.", "spans": [ - { - "name": "rpc.request", - "category": "rpc", - "parent": null, - "required_attributes": [], - "config_flag": "trace_rpc" - }, - { - "name": "rpc.process", - "category": "rpc", - "parent": "rpc.request", - "required_attributes": [], - "config_flag": "trace_rpc" - }, { "name": "rpc.ws_message", "category": "rpc", "parent": null, + "required_attributes": ["command"], + "config_flag": "trace_rpc", + "note": "WebSocket RPC root span. The load generator uses WS, so this is the RPC entry span (not rpc.http_request, which needs an HTTP/JSON-RPC client)." + }, + { + "name": "rpc.process", + "category": "rpc", + "parent": "rpc.ws_message", "required_attributes": [], "config_flag": "trace_rpc" }, @@ -26,126 +20,307 @@ "name": "rpc.command.*", "category": "rpc", "parent": "rpc.process", - "required_attributes": [ - "command", - "version", - "rpc_role", - "rpc_status", - "duration_ms" - ], + "required_attributes": ["command", "version", "rpc_role", "rpc_status"], "config_flag": "trace_rpc", "note": "Wildcard — matches rpc.command.server_info, rpc.command.ledger, etc." }, + { + "name": "rpc.http_request", + "category": "rpc", + "parent": null, + "required_attributes": ["request_payload_size"], + "config_flag": "trace_rpc", + "optional": true, + "note": "HTTP/JSON-RPC root span. The harness load generator is WebSocket-only, so this does not fire." 
+ }, { "name": "tx.process", "category": "transaction", "parent": null, - "required_attributes": ["xrpl.tx.hash", "local", "path"], + "required_attributes": ["tx_hash", "local", "path"], "config_flag": "trace_transactions" }, { "name": "tx.receive", "category": "transaction", "parent": null, - "required_attributes": [ - "xrpl.peer.id", - "xrpl.tx.hash", - "suppressed", - "tx_status" - ], - "config_flag": "trace_transactions" + "required_attributes": ["tx_hash", "peer_id", "suppressed", "tx_status"], + "config_flag": "trace_transactions", + "note": "Cross-node span: parent context propagated from the sender's tx.process via protobuf. Also carries tx_type and peer_version." }, { "name": "tx.apply", "category": "transaction", "parent": "ledger.build", - "required_attributes": ["xrpl.ledger.seq", "tx_count", "tx_failed"], - "config_flag": "trace_transactions" + "required_attributes": ["tx_count", "tx_failed"], + "config_flag": "trace_transactions", + "note": "Apply-step span inside BuildLedger. Carries tx_count/tx_failed (ledger_seq lives on the parent ledger.build span)." + }, + { + "name": "tx.preflight", + "category": "transaction", + "parent": null, + "required_attributes": ["stage", "tx_type", "ter_result"], + "config_flag": "trace_transactions", + "note": "Apply-pipeline stage span (stage=preflight). Shares a deterministic trace_id (txID[0:16]) with tx.preclaim/tx.transactor." + }, + { + "name": "tx.preclaim", + "category": "transaction", + "parent": null, + "required_attributes": ["stage", "tx_type", "ter_result"], + "config_flag": "trace_transactions", + "note": "Apply-pipeline stage span (stage=preclaim)." + }, + { + "name": "tx.transactor", + "category": "transaction", + "parent": null, + "required_attributes": ["stage", "tx_type"], + "config_flag": "trace_transactions", + "note": "Apply-pipeline stage span (stage=apply). Also carries applied." 
+ }, + { + "name": "txq.enqueue", + "category": "transaction", + "parent": "tx.process", + "required_attributes": [ + "tx_hash", + "tx_type", + "txq_status", + "fee_level_paid", + "required_fee_level" + ], + "config_flag": "trace_transactions", + "optional": true, + "note": "Only fires when a tx is queued (fee below open-ledger level). Requires fee escalation — driven by the txq-burst workload phase." + }, + { + "name": "txq.apply_direct", + "category": "transaction", + "parent": "txq.enqueue", + "required_attributes": [], + "config_flag": "trace_transactions", + "optional": true, + "note": "Child of txq.enqueue when the tx applies directly without queueing." + }, + { + "name": "txq.batch_clear", + "category": "transaction", + "parent": "txq.enqueue", + "required_attributes": ["num_cleared"], + "config_flag": "trace_transactions", + "optional": true + }, + { + "name": "txq.accept", + "category": "transaction", + "parent": null, + "required_attributes": ["queue_size", "ledger_changed"], + "config_flag": "trace_transactions", + "optional": true, + "note": "Ledger-close accept loop. Fires on the consensus thread; only meaningful when the queue is non-empty." + }, + { + "name": "txq.accept.tx", + "category": "transaction", + "parent": "txq.accept", + "required_attributes": [ + "tx_hash", + "ter_code", + "retries_remaining", + "txq_status" + ], + "config_flag": "trace_transactions", + "optional": true + }, + { + "name": "txq.cleanup", + "category": "transaction", + "parent": null, + "required_attributes": ["ledger_seq", "expired_count"], + "config_flag": "trace_transactions", + "optional": true + }, + { + "name": "consensus.round", + "category": "consensus", + "parent": null, + "required_attributes": [ + "consensus_ledger_id", + "ledger_seq", + "consensus_mode", + "consensus_round_id", + "consensus_phase" + ], + "config_flag": "trace_consensus", + "note": "Root consensus span created per round. 
Also carries trace_strategy, previous_ledger_seq, previous_proposers, previous_round_time_ms." + }, + { + "name": "consensus.phase.open", + "category": "consensus", + "parent": "consensus.round", + "required_attributes": [], + "config_flag": "trace_consensus" }, { "name": "consensus.proposal.send", "category": "consensus", - "parent": null, - "required_attributes": ["xrpl.consensus.round"], - "config_flag": "trace_consensus" + "parent": "consensus.round", + "required_attributes": ["consensus_round"], + "config_flag": "trace_consensus", + "note": "Also carries is_bow_out." }, { "name": "consensus.ledger_close", "category": "consensus", - "parent": null, - "required_attributes": ["xrpl.ledger.seq", "xrpl.consensus.mode"], + "parent": "consensus.round", + "required_attributes": ["ledger_seq", "consensus_mode"], + "config_flag": "trace_consensus", + "note": "Also carries tx_count_open, close_time_resolution_ms." + }, + { + "name": "consensus.establish", + "category": "consensus", + "parent": "consensus.round", + "required_attributes": [ + "converge_percent", + "establish_count", + "proposers", + "disputes_count" + ], + "config_flag": "trace_consensus" + }, + { + "name": "consensus.update_positions", + "category": "consensus", + "parent": "consensus.round", + "required_attributes": [ + "converge_percent", + "proposers", + "disputes_count" + ], + "config_flag": "trace_consensus" + }, + { + "name": "consensus.check", + "category": "consensus", + "parent": "consensus.round", + "required_attributes": [ + "agree_count", + "disagree_count", + "threshold_percent", + "consensus_result" + ], "config_flag": "trace_consensus" }, { "name": "consensus.accept", "category": "consensus", - "parent": null, - "required_attributes": [ - "proposers", - "validation_quorum", - "proposers_validated" - ], + "parent": "consensus.round", + "required_attributes": ["proposers", "round_time_ms", "quorum"], "config_flag": "trace_consensus" }, + { + "name": "consensus.accept.apply", + "category": 
"consensus", + "parent": "consensus.accept", + "required_attributes": [ + "ledger_seq", + "close_time", + "parent_close_time", + "close_time_self", + "close_time_vote_bins", + "resolution_direction" + ], + "config_flag": "trace_consensus", + "note": "Also carries close_time_correct, close_resolution_ms, consensus_state, proposing, round_time_ms, tx_count." + }, { "name": "consensus.validation.send", "category": "consensus", "parent": null, "required_attributes": [ - "xrpl.ledger.seq", + "ledger_seq", "proposing", - "xrpl.ledger.hash", - "validation_full" + "ledger_hash", + "full_validation" ], - "config_flag": "trace_consensus" + "config_flag": "trace_consensus", + "note": "follows-from consensus.accept. ledger_hash is BARE here (consensus-owned attr), unlike peer.validation.receive which uses the shared dotted xrpl.ledger.hash. Also carries validation_sign_time." }, { - "name": "consensus.accept.apply", + "name": "consensus.proposal.receive", "category": "consensus", "parent": null, - "required_attributes": [ - "close_time", - "xrpl.ledger.seq", - "parent_close_time", - "close_time_self", - "close_time_vote_bins", - "resolution_direction" - ], - "config_flag": "trace_consensus" + "required_attributes": [], + "config_flag": "trace_consensus", + "note": "Context-propagated from the sending peer. No required local attributes." + }, + { + "name": "consensus.validation.receive", + "category": "consensus", + "parent": null, + "required_attributes": [], + "config_flag": "trace_consensus", + "note": "Context-propagated from the sending peer. No required local attributes." + }, + { + "name": "consensus.mode_change", + "category": "consensus", + "parent": null, + "required_attributes": ["mode_old", "mode_new"], + "config_flag": "trace_consensus", + "optional": true, + "note": "Only fires on an operating-mode transition; a steady cluster rarely changes mode after warmup." 
}, { "name": "ledger.build", "category": "ledger", "parent": null, "required_attributes": [ - "xrpl.ledger.seq", - "tx_count", - "tx_failed", + "ledger_seq", "close_time", "close_time_correct", "close_resolution_ms" ], - "config_flag": "trace_ledger" + "config_flag": "trace_ledger", + "note": "tx_count/tx_failed live on the child tx.apply span, not here." }, { "name": "ledger.validate", "category": "ledger", "parent": null, - "required_attributes": ["xrpl.ledger.seq", "validations"], + "required_attributes": ["ledger_seq", "validations"], "config_flag": "trace_ledger" }, { "name": "ledger.store", "category": "ledger", "parent": null, - "required_attributes": ["xrpl.ledger.seq"], + "required_attributes": ["ledger_seq"], "config_flag": "trace_ledger" }, + { + "name": "ledger.acquire", + "category": "ledger", + "parent": null, + "required_attributes": [ + "ledger_seq", + "acquire_reason", + "timeouts", + "peer_count", + "outcome" + ], + "config_flag": "trace_ledger", + "optional": true, + "note": "Only fires when a node must fetch a missing ledger (InboundLedger). A healthy local cluster rarely back-fills history." + }, { "name": "peer.proposal.receive", "category": "peer", "parent": null, - "required_attributes": ["xrpl.peer.id", "proposal_trusted"], + "required_attributes": ["peer_id", "proposal_trusted"], "config_flag": "trace_peer" }, { @@ -153,21 +328,65 @@ "category": "peer", "parent": null, "required_attributes": [ - "xrpl.peer.id", + "peer_id", "validation_trusted", "xrpl.ledger.hash", "validation_full" ], - "config_flag": "trace_peer" + "config_flag": "trace_peer", + "note": "Uses the shared dotted xrpl.ledger.hash constant (intentionally dotted, unlike consensus.validation.send)." + }, + { + "name": "pathfind.request", + "category": "pathfind", + "parent": null, + "required_attributes": [ + "pathfind_source_account", + "pathfind_dest_account" + ], + "config_flag": "trace_rpc", + "note": "Fires on ripple_path_find / path_find RPC. 
Driven by the ripple_path_find load in rpc_load_generator.py." + }, + { + "name": "pathfind.compute", + "category": "pathfind", + "parent": "pathfind.request", + "required_attributes": ["pathfind_fast"], + "config_flag": "trace_rpc" + }, + { + "name": "pathfind.discover", + "category": "pathfind", + "parent": "pathfind.compute", + "required_attributes": ["pathfind_search_level", "pathfind_num_paths"], + "config_flag": "trace_rpc" + }, + { + "name": "pathfind.update_all", + "category": "pathfind", + "parent": null, + "required_attributes": ["pathfind_ledger_index", "pathfind_num_requests"], + "config_flag": "trace_rpc", + "optional": true, + "note": "Async recomputation at ledger close; only fires when there are active path_find subscriptions (the one-shot ripple_path_find load does not register one)." + }, + { + "name": "grpc.*", + "category": "grpc", + "parent": null, + "required_attributes": ["method", "grpc_role", "grpc_status"], + "config_flag": "trace_rpc", + "optional": true, + "note": "Wildcard — grpc.<method>. The harness has no gRPC client, so these do not fire. Tracked for completeness." } ], "parent_child_relationships": [ { - "parent": "rpc.request", + "parent": "rpc.ws_message", "child": "rpc.process", - "description": "RPC request contains processing span", + "description": "WebSocket message contains processing span", "skip": true, - "skip_reason": "rpc.request and rpc.process run on different threads (onRequest posts a coroutine to JobQueue for processRequest). Span context is not propagated across the thread boundary. Requires C++ fix to capture and forward the span context through the coroutine lambda." + "skip_reason": "rpc.ws_message and rpc.process run on different threads (the WS handler posts a coroutine to JobQueue for processing). Span context is not propagated across the thread boundary. Requires a C++ fix to capture and forward the span context through the coroutine lambda." 
}, { "parent": "rpc.process", @@ -178,8 +397,23 @@ "parent": "ledger.build", "child": "tx.apply", "description": "Ledger build contains transaction application" + }, + { + "parent": "consensus.round", + "child": "consensus.accept", + "description": "Consensus round contains the accept sub-span" + }, + { + "parent": "consensus.accept", + "child": "consensus.accept.apply", + "description": "Accept contains the ledger-apply sub-span" + }, + { + "parent": "pathfind.request", + "child": "pathfind.compute", + "description": "Pathfind request contains the compute sub-span" } ], - "total_span_types": 17, - "total_unique_attributes": 37 + "total_span_types": 40, + "total_unique_attributes": 58 } diff --git a/docker/telemetry/workload/regression-metrics.json b/docker/telemetry/workload/regression-metrics.json index de969fbd9e..0fd4e29060 100644 --- a/docker/telemetry/workload/regression-metrics.json +++ b/docker/telemetry/workload/regression-metrics.json @@ -6,7 +6,7 @@ "_unit": "ms", "_quantiles": [0.5, 0.95, 0.99], "names": [ - "rpc.request", + "rpc.ws_message", "rpc.process", "tx.process", "tx.apply", diff --git a/docker/telemetry/workload/rpc_load_generator.py b/docker/telemetry/workload/rpc_load_generator.py index 3180de65b1..7b5e22f631 100644 --- a/docker/telemetry/workload/rpc_load_generator.py +++ b/docker/telemetry/workload/rpc_load_generator.py @@ -11,6 +11,7 @@ Command distribution (default weights): 15% Explorer: ledger, ledger_data 10% TX lookups: tx, account_tx 5% DEX queries: book_offers, amm_info + 3% Pathfinding: ripple_path_find Usage: python3 rpc_load_generator.py --endpoints ws://localhost:6006 --rate 50 --duration 120 @@ -61,6 +62,10 @@ DEFAULT_WEIGHTS: dict[str, int] = { # 5% DEX queries "book_offers": 3, "amm_info": 2, + # Pathfinding — exercises the pathfind.request/compute/discover spans. + # ripple_path_find is the synchronous (one-shot) variant that fits this + # fire-one-request WS client; path_find is a streaming subscription. 
+ "ripple_path_find": 3, } # Well-known genesis account for queries that require an account parameter. @@ -162,7 +167,7 @@ def build_rpc_request(command: str) -> dict[str, Any]: req["limit"] = 5 elif command == "tx": # Use a dummy hash — returns "txnNotFound" error but still exercises - # the full RPC span pipeline (rpc.request -> rpc.process -> rpc.command.tx). + # the full RPC span pipeline (rpc.ws_message -> rpc.process -> rpc.command.tx). req["transaction"] = "0" * 64 req["binary"] = False elif command == "account_tx": @@ -184,6 +189,13 @@ def build_rpc_request(command: str) -> dict[str, Any]: "currency": "USD", "issuer": GENESIS_ACCOUNT, } + elif command == "ripple_path_find": + # Self-to-self XRP path search. It returns no usable paths, but the + # server still runs the full pathfinding pipeline (pathfind.request -> + # pathfind.compute -> pathfind.discover), which is what we trace. + req["source_account"] = GENESIS_ACCOUNT + req["destination_account"] = GENESIS_ACCOUNT + req["destination_amount"] = "1000000" # 1 XRP in drops return req diff --git a/docker/telemetry/workload/run-full-validation.sh b/docker/telemetry/workload/run-full-validation.sh index 5953a15df3..458bf70983 100755 --- a/docker/telemetry/workload/run-full-validation.sh +++ b/docker/telemetry/workload/run-full-validation.sh @@ -296,9 +296,13 @@ trace_peer=1 trace_ledger=1 [insight] -server=statsd -address=127.0.0.1:8125 -prefix=rippled +# Native OTel metrics via OTLP/HTTP. The collector has no StatsD receiver +# (metrics pipeline is [otlp, spanmetrics]), so beast::insight must export +# over OTLP for system metrics to reach Prometheus. prefix=xrpld matches the +# OTel resource service name and the xrpld_* names the dashboards query. 
+server=otel +endpoint=http://localhost:4318/v1/metrics +prefix=xrpld [rpc_startup] { "command": "log_level", "severity": "warning" } diff --git a/docker/telemetry/workload/validate_telemetry.py b/docker/telemetry/workload/validate_telemetry.py index 32f8eaff4e..f1cad503dc 100644 --- a/docker/telemetry/workload/validate_telemetry.py +++ b/docker/telemetry/workload/validate_telemetry.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Telemetry Validation Suite for rippled. +"""Telemetry Validation Suite for xrpld. Validates that the full telemetry stack is emitting expected data after a workload run. Queries Tempo (spans), Prometheus (metrics), Loki (logs), @@ -63,7 +63,7 @@ class CheckResult: """Result of a single validation check. Attributes: - name: Check identifier (e.g., "span.rpc.request"). + name: Check identifier (e.g., "span.rpc.ws_message"). category: Validation category (span, metric, log, dashboard). passed: Whether the check passed. message: Human-readable description of the result. @@ -243,16 +243,16 @@ async def validate_spans( data = await resp.json() tag_values = data.get("tagValues", []) services = [tv.get("value", "") for tv in tag_values] - has_rippled = "rippled" in services + has_xrpld = "xrpld" in services report.add( CheckResult( name="span.service_registration", category="span", - passed=has_rippled, + passed=has_xrpld, message=( - f"Service 'rippled' registered (found: {services})" - if has_rippled - else f"Service 'rippled' NOT found (found: {services})" + f"Service 'xrpld' registered (found: {services})" + if has_xrpld + else f"Service 'xrpld' NOT found (found: {services})" ), ) ) @@ -267,7 +267,7 @@ async def validate_spans( ) return - # Diagnostic: list all available operations (span names) for the rippled + # Diagnostic: list all available operations (span names) for the xrpld # service. This output appears in CI logs and helps debug missing-span # failures without needing to reproduce the full stack locally. 
try: @@ -285,21 +285,64 @@ async def validate_spans( except Exception as exc: logger.warning("Failed to fetch Tempo operations: %s", exc) + # Concrete probe names for wildcard span entries. Exact-match TraceQL can't + # match a literal "*", so a representative operation name is substituted. + # Wildcards without a known concrete example (e.g. grpc.{method} when no + # gRPC client runs) are skipped when marked optional. + wildcard_probes = {"rpc.command.*": "rpc.command.server_info"} + # Check each expected span. for span_def in expected["spans"]: span_name = span_def["name"] - # For wildcard spans (rpc.command.*), search with a concrete example. + is_optional = span_def.get("optional", False) + check_name = f"span.{span_name}" + if "*" in span_name: - operation = "rpc.command.server_info" - check_name = f"span.{span_name}" + operation = wildcard_probes.get(span_name) + if operation is None: + # No concrete probe. Optional wildcards (e.g. grpc.*) are skipped; + # a required one would be a config error worth surfacing. + if is_optional: + logger.info( + "[SKIP] %s: optional wildcard span with no concrete " + "probe (not exercised by the workload)", + check_name, + ) + continue + report.add( + CheckResult( + name=check_name, + category="span", + passed=False, + message=f"{span_name}: required wildcard has no probe name", + ) + ) + continue else: operation = span_name - check_name = f"span.{span_name}" try: - query = '{resource.service.name="rippled" && name="' + operation + '"}' + query = '{resource.service.name="xrpld" && name="' + operation + '"}' traces = await _tempo_search(session, tempo_url, query, limit=5) count = len(traces) + # Optional spans only fire under specific traffic (mode changes, + # missing-ledger fetch, fee escalation). Absence is not a failure — + # mirror the parent-child "skip" handling so CI stays green. 
+ if count == 0 and is_optional: + logger.info( + "[SKIP] %s: optional span not emitted under this workload", + check_name, + ) + report.add( + CheckResult( + name=check_name, + category="span", + passed=True, + message=f"{span_name}: optional, not emitted (skipped)", + details={"trace_count": 0, "optional": True}, + ) + ) + continue report.add( CheckResult( name=check_name, @@ -407,7 +450,7 @@ async def _validate_parent_child( try: # Query traces for the parent span. - query = '{resource.service.name="rippled" && name="' + parent_name + '"}' + query = '{resource.service.name="xrpld" && name="' + parent_name + '"}' traces = await _tempo_search(session, tempo_url, query, limit=3) if not traces: @@ -488,11 +531,11 @@ async def validate_metrics( ) as resp: label_data = await resp.json() all_metrics = label_data.get("data", []) - # Log rippled-related and Phase 9 metrics for debugging. + # Log xrpld-related and Phase 9 metrics for debugging. relevant = [ m for m in all_metrics - if "rippled" in m.lower() + if "xrpld" in m.lower() or m.startswith( ( "rpc_method", @@ -611,7 +654,7 @@ async def validate_log_trace_correlation( # Check 1: Any logs with trace_id exist. try: params = { - "query": '{job="rippled"} |= "trace_id="', + "query": '{job="xrpld"} |= "trace_id="', "limit": 5, "direction": "backward", } @@ -650,7 +693,7 @@ async def validate_log_trace_correlation( traces = await _tempo_search( session, tempo_url, - '{resource.service.name="rippled"}', + '{resource.service.name="xrpld"}', limit=1, ) @@ -659,7 +702,7 @@ async def validate_log_trace_correlation( if trace_id: # Search Loki for this trace_id. 
loki_params = { - "query": f'{{job="rippled"}} |= "{trace_id}"', + "query": f'{{job="xrpld"}} |= "{trace_id}"', "limit": 5, "direction": "backward", } @@ -795,7 +838,7 @@ async def validate_span_durations( traces = await _tempo_search( session, tempo_url, - '{resource.service.name="rippled"}', + '{resource.service.name="xrpld"}', limit=5, ) @@ -868,15 +911,21 @@ async def validate_span_durations( # Span attributes that external dashboards (validator-health, peer-quality, # system-node-health) depend on. Each entry maps a span name to the # attributes that must be present for external dashboard panels to render. +# Keys follow the 2026-05-13 span-attr naming redesign (bare/underscore form; +# dotted xrpl.* reserved for resource attributes). The amendment_blocked, +# server_state, and proposers_validated values that earlier external-dashboard +# work tracked are NOT span attributes — they exist only as MetricsRegistry +# metrics (xrpld_validator_health{metric="amendment_blocked"}, +# xrpld_state_tracking{metric="state_value"}, etc.), so they are validated by +# PARITY_VALUE_SANITY below rather than as span attributes here. 
PARITY_SPAN_ATTRS: list[dict[str, str]] = [ - {"span": "rpc.command.server_info", "attr": "xrpl.node.amendment_blocked"}, - {"span": "rpc.command.server_info", "attr": "xrpl.node.server_state"}, - {"span": "tx.receive", "attr": "xrpl.peer.version"}, - {"span": "consensus.validation.send", "attr": "xrpl.validation.ledger_hash"}, - {"span": "consensus.validation.send", "attr": "xrpl.validation.full"}, - {"span": "peer.validation.receive", "attr": "xrpl.peer.validation.ledger_hash"}, - {"span": "consensus.accept", "attr": "xrpl.consensus.validation_quorum"}, - {"span": "consensus.accept", "attr": "xrpl.consensus.proposers_validated"}, + {"span": "tx.receive", "attr": "peer_version"}, + {"span": "consensus.validation.send", "attr": "ledger_hash"}, + {"span": "consensus.validation.send", "attr": "full_validation"}, + # peer.validation.receive uses the shared dotted xrpl.ledger.hash constant + # (intentionally dotted, unlike consensus.validation.send's bare ledger_hash). + {"span": "peer.validation.receive", "attr": "xrpl.ledger.hash"}, + {"span": "consensus.accept", "attr": "quorum"}, ] # Value sanity bounds for external-parity metrics. 
Each entry specifies a @@ -934,7 +983,7 @@ async def validate_parity_span_attrs( check_name = f"parity.span_attr.{span_name}.{attr_name}" try: - query = '{resource.service.name="rippled" && name="' + span_name + '"}' + query = '{resource.service.name="xrpld" && name="' + span_name + '"}' traces = await _tempo_search(session, tempo_url, query, limit=5) if not traces: @@ -1124,7 +1173,7 @@ async def run_validation( def parse_args() -> argparse.Namespace: """Parse command-line arguments.""" parser = argparse.ArgumentParser( - description="Telemetry Validation Suite for rippled", + description="Telemetry Validation Suite for xrpld", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: diff --git a/docker/telemetry/workload/workload-profiles.json b/docker/telemetry/workload/workload-profiles.json index 040be9073d..0acd499cf9 100644 --- a/docker/telemetry/workload/workload-profiles.json +++ b/docker/telemetry/workload/workload-profiles.json @@ -37,6 +37,16 @@ "weights": { "Payment": 70, "OfferCreate": 20, "TrustSet": 10 } } }, + { + "name": "txq-burst", + "description": "Single-type Payment burst at high TPS to force open-ledger fee escalation and TxQ queueing, exercising the txq.* spans (txq.enqueue / txq.accept / txq.accept.tx / txq.cleanup)", + "duration_sec": 30, + "rpc": { "rate": 5, "weights": { "fee": 100 } }, + "tx": { + "tps": 60, + "weights": { "Payment": 100 } + } + }, { "name": "mixed-peak", "description": "Realistic peak load — consensus and ledger ops under stress", diff --git a/docker/telemetry/workload/xrpld-validator.cfg.template b/docker/telemetry/workload/xrpld-validator.cfg.template index 5e8352d9b7..9ded7d28a5 100644 --- a/docker/telemetry/workload/xrpld-validator.cfg.template +++ b/docker/telemetry/workload/xrpld-validator.cfg.template @@ -9,8 +9,8 @@ # {{VALIDATION_SEED}} — Validator seed from key generation # {{VALIDATORS_FILE}} — Path to shared validators.txt # {{IPS_FIXED}} — Peer addresses (one per line) -# 
{{OTEL_ENDPOINT}} — OTel Collector OTLP/HTTP endpoint -# {{STATSD_ADDRESS}} — StatsD UDP address (host:port) +# {{OTEL_ENDPOINT}} — OTel Collector OTLP/HTTP traces endpoint +# {{OTEL_METRICS_ENDPOINT}} — OTel Collector OTLP/HTTP metrics endpoint # {{LOG_LEVEL}} — Log level (debug, info, warning, error) [server] @@ -74,11 +74,14 @@ trace_consensus=1 trace_peer=1 trace_ledger=1 -# --- StatsD metrics (beast::insight) --- +# --- Native OTel metrics (beast::insight over OTLP/HTTP) --- +# The collector has no StatsD receiver (metrics pipeline is [otlp, spanmetrics]), +# so beast::insight exports natively over OTLP. prefix=xrpld matches the OTel +# resource service name and the xrpld_* names the dashboards query. [insight] -server=statsd -address={{STATSD_ADDRESS}} -prefix=rippled +server=otel +endpoint={{OTEL_METRICS_ENDPOINT}} +prefix=xrpld [rpc_startup] { "command": "log_level", "severity": "{{LOG_LEVEL}}" }