mirror of
https://github.com/XRPLF/rippled.git
synced 2026-06-06 02:07:07 +00:00
The Phase 10 validation harness had drifted from the code's recording surface and the telemetry-validation CI job was failing before it could build. CI fix (telemetry-validation.yml): - Replace nonexistent local action ./.github/actions/print-env with the remote XRPLF/actions/print-build-env (the build-xrpld job failed in 56s on this). - Sync prepare-runner and upload-artifact action SHAs to the canonical workflow. Recording-surface reconciliation (docker/telemetry/workload/): - Migrate span attributes from dotted xrpl.<domain>.<field> to the bare/underscore form introduced by the 2026-05-13 span-attr naming redesign (tx_hash, peer_id, ledger_seq, consensus_mode, consensus_round, full_validation, quorum, ...). Dotted xrpl.ledger.hash is retained only on peer.validation.receive (shared constant), while consensus.validation.send uses bare ledger_hash. - Fix attribute placement: tx.apply carries tx_count/tx_failed (not ledger_seq); ledger.build carries ledger_seq/close_* (not tx_count/tx_failed). - Replace the phantom rpc.request span with the real WS root rpc.ws_message; drop the never-emitted duration_ms; rebuild the parent-child map accordingly. - Add the new spans the code emits: apply-pipeline stage spans (tx.preflight/preclaim/transactor with stage/tx_type/ter_result), txq.*, consensus sub-spans (round/establish/update_positions/check/phase.open), ledger.acquire, grpc.*, pathfind.*. Conditional spans are marked optional so they are skipped (not failed) when the workload does not exercise them. - validate_telemetry.py: service.name and Loki job label rippled -> xrpld; fix PARITY_SPAN_ATTRS (rename the 4 real attrs, drop the 3 that are metrics not span attrs); add optional-span handling that skips missing optional spans while still validating attributes when present. - expected_metrics.json: rippled_ -> xrpld_ on all beast::insight/overlay metrics, xrpld_job_count, the 15 on-disk xrpld-* dashboard UIDs, and the real bare spanmetrics dimension labels. 
- regression-metrics.json + baseline-timings.json: rpc.request -> rpc.ws_message. Metrics pipeline fix: - Switch node [insight] config from server=statsd/prefix=rippled to server=otel + /v1/metrics endpoint + prefix=xrpld across run-full-validation.sh, xrpld-validator.cfg.template, benchmark.sh and the workload compose. The collector has no StatsD receiver, so system metrics only reach Prometheus over OTLP. Synthetic load for new spans: - Add ripple_path_find to the RPC load generator (drives pathfind.* spans). - Add a high-TPS txq-burst workload phase to force fee escalation (drives txq.*). All facts verified against the *SpanNames.h headers and a live xrpld node + collector (Tempo service.name=xrpld, tx.preflight attrs [stage,ter_result,tx_type], 279 xrpld_ Prometheus metrics and zero rippled_). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
102 lines
2.9 KiB
YAML
102 lines
2.9 KiB
YAML
# Docker Compose workload harness for Phase 10 telemetry validation.
#
# Runs a 5-node validator cluster with full OTel telemetry stack:
#   - 5 xrpld validator nodes (consensus network)
#   - OTel Collector (traces + native OTLP metrics)
#   - Tempo (trace backend + search API)
#   - Prometheus (metrics)
#   - Loki (log aggregation for log-trace correlation)
#   - Grafana (dashboards + trace/log exploration)
#
# Usage:
#   # Start the harness (requires pre-built xrpld image or mount binary):
#   docker compose -f docker/telemetry/docker-compose.workload.yaml up -d
#
#   # Or use the orchestrator:
#   docker/telemetry/workload/run-full-validation.sh
#
# Prerequisites:
#   - xrpld binary built with -DXRPL_ENABLE_TELEMETRY=ON
#   - Validator keys generated via generate-validator-keys.sh
#   - Node configs generated by run-full-validation.sh
#
# Note: No Docker healthchecks are defined here. The orchestrator script
# (run-full-validation.sh) polls each service endpoint directly from the
# host, which avoids issues with missing curl/wget in container images.
|
|
|
|
services:
  # ---------------------------------------------------------------------------
  # Telemetry Backend Stack
  # ---------------------------------------------------------------------------
  #
  # All five services attach to the `workload-net` network and publish their
  # ports to the host, so they can be polled from outside Docker.
  #
  # Images that track a floating tag read it from an environment variable
  # whose default is the previously hard-coded `latest`. Set the variable to
  # pin a reproducible version without editing this file, for example:
  #   GRAFANA_TAG=<version> docker compose -f <this file> up -d

  otel-collector:
    # OTLP ingest (gRPC + HTTP) plus a read-only view of the node log files.
    image: otel/opentelemetry-collector-contrib:${OTEL_COLLECTOR_TAG:-latest}
    command: ["--config=/etc/otel-collector-config.yaml"]
    ports:
      - "4317:4317"  # OTLP gRPC
      - "4318:4318"  # OTLP HTTP (traces + beast::insight metrics)
      - "8889:8889"  # Prometheus metrics endpoint
      - "13133:13133"  # Health check
    volumes:
      - ./otel-collector-config.yaml:/etc/otel-collector-config.yaml:ro
      # Mount the validation workdir so filelog receiver can tail node logs.
      # NOTE(review): the container path keeps the legacy `rippled` name;
      # presumably it must match the filelog `include` glob in
      # otel-collector-config.yaml -- confirm before renaming.
      # NOTE(review): with this short bind syntax Docker creates a missing
      # host directory itself (owned by root); presumably the orchestrator
      # creates /tmp/xrpld-validation first -- confirm.
      - /tmp/xrpld-validation:/var/log/rippled:ro
    depends_on:
      # Start-order only; no healthchecks are defined in this file.
      - tempo
    networks:
      - workload-net

  tempo:
    # Pinned release; trace data lives in the named volume `tempo-data`.
    image: grafana/tempo:2.7.2
    command: ["-config.file=/etc/tempo.yaml"]
    ports:
      - "3200:3200"  # Tempo HTTP API
    volumes:
      - ./tempo.yaml:/etc/tempo.yaml:ro
      - tempo-data:/var/tempo
    networks:
      - workload-net

  prometheus:
    # Started after the collector. NOTE(review): presumably scrapes the
    # collector's 8889 endpoint -- see prometheus.yml to confirm.
    image: prom/prometheus:${PROMETHEUS_TAG:-latest}
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
    depends_on:
      - otel-collector
    networks:
      - workload-net

  loki:
    # Pinned release. No config file is mounted here, so the path passed to
    # -config.file is presumably the one shipped inside the image -- confirm.
    image: grafana/loki:3.4.2
    command: ["-config.file=/etc/loki/local-config.yaml"]
    ports:
      - "3100:3100"  # Loki HTTP API
    networks:
      - workload-net

  grafana:
    # Anonymous visitors get the Admin role, so no login is needed.
    # NOTE(review): port 3000 is published without a host IP, i.e. on all
    # host interfaces -- acceptable for a local/CI harness only; confirm.
    image: grafana/grafana:${GRAFANA_TAG:-latest}
    environment:
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
    ports:
      - "3000:3000"
    volumes:
      - ./grafana/provisioning:/etc/grafana/provisioning:ro
      - ./grafana/dashboards:/var/lib/grafana/dashboards:ro
    depends_on:
      - tempo
      - prometheus
      - loki
    networks:
      - workload-net
|
|
|
|
# Named volumes. `tempo-data` is mounted at /var/tempo by the tempo service;
# the empty mapping means no driver or options are overridden.
volumes:
  tempo-data: {}
|
|
|
|
# Single network that every service in this file joins.
networks:
  workload-net:
    driver: bridge
|