mirror of
https://github.com/XRPLF/rippled.git
synced 2026-03-15 09:12:25 +00:00
Add comprehensive workload harness for end-to-end validation of the
Phases 1-9 telemetry stack:
Task 10.1 — Multi-node test harness:
- docker-compose.workload.yaml with full OTel stack (Collector, Jaeger,
Tempo, Prometheus, Loki, Grafana)
- generate-validator-keys.sh for automated key generation
- xrpld-validator.cfg.template for node configuration
Task 10.2 — RPC load generator:
- rpc_load_generator.py with WebSocket client, configurable rates,
realistic command distribution (40% health, 30% wallet, 15% explorer,
10% tx lookups, 5% DEX), W3C traceparent injection
Task 10.3 — Transaction submitter:
- tx_submitter.py with 10 transaction types (Payment, OfferCreate,
OfferCancel, TrustSet, NFTokenMint, NFTokenCreateOffer, EscrowCreate,
EscrowFinish, AMMCreate, AMMDeposit), auto-funded test accounts
Task 10.4 — Telemetry validation suite:
- validate_telemetry.py checking spans (Jaeger), metrics (Prometheus),
log-trace correlation (Loki), dashboards (Grafana)
- expected_spans.json (17 span types, 22 attributes, 3 hierarchies)
- expected_metrics.json (SpanMetrics, StatsD, Phase 9, dashboards)
Task 10.5 — Performance benchmark suite:
- benchmark.sh for baseline vs telemetry comparison
- collect_system_metrics.sh for CPU/memory/latency sampling
- Thresholds: <3% CPU, <5MB memory, <2ms RPC p99, <5% TPS, <1% consensus
Task 10.6 — CI integration:
- telemetry-validation.yml GitHub Actions workflow
- run-full-validation.sh orchestrator script
- Manual trigger + telemetry branch auto-trigger
Task 10.7 — Documentation:
- workload/README.md with quick start and tool reference
- Updated telemetry-runbook.md with validation and benchmark sections
- Updated 09-data-collection-reference.md with validation inventory
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
138 lines
3.6 KiB
YAML
138 lines
3.6 KiB
YAML
# Docker Compose workload harness for Phase 10 telemetry validation.
|
|
#
|
|
# Runs a 5-node validator cluster with full OTel telemetry stack:
|
|
# - 5 rippled validator nodes (consensus network)
|
|
# - OTel Collector (traces + StatsD metrics)
|
|
# - Jaeger (trace search UI)
|
|
# - Tempo (production trace backend)
|
|
# - Prometheus (metrics)
|
|
# - Loki (log aggregation for log-trace correlation)
|
|
# - Grafana (dashboards + trace/log exploration)
|
|
#
|
|
# Usage:
|
|
# # Start the harness (requires a pre-built xrpld image or a mounted binary):
|
|
# docker compose -f docker/telemetry/docker-compose.workload.yaml up -d
|
|
#
|
|
# # Or use the orchestrator:
|
|
# docker/telemetry/workload/run-full-validation.sh
|
|
#
|
|
# Prerequisites:
|
|
# - xrpld binary built with -DXRPL_ENABLE_TELEMETRY=ON
|
|
# - Validator keys generated via generate-validator-keys.sh
|
|
# - Node configs generated by run-full-validation.sh
|
|
|
|
# Compose file format version declared for this harness.
# NOTE(review): recent Docker Compose releases reportedly ignore the top-level
# `version` key and print an "obsolete" warning — confirm whether it can be
# dropped for the Compose versions this project supports.
version: "3.8"
|
|
|
|
services:
|
|
# ---------------------------------------------------------------------------
|
|
# Telemetry Backend Stack
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# OpenTelemetry Collector (contrib distribution).
# Listens for OTLP (gRPC + HTTP) and StatsD, and exposes a Prometheus
# metrics endpoint plus a health-check port. Its configuration is
# bind-mounted read-only from ../otel-collector-config.yaml, and it is
# started after the jaeger and tempo services.
otel-collector:
  image: otel/opentelemetry-collector-contrib:latest
  networks:
    - workload-net
  depends_on:
    - jaeger
    - tempo
  volumes:
    - '../otel-collector-config.yaml:/etc/otel-collector-config.yaml:ro'
  command:
    - '--config=/etc/otel-collector-config.yaml'
  ports:
    # OTLP gRPC
    - '4317:4317'
    # OTLP HTTP
    - '4318:4318'
    # StatsD UDP (beast::insight metrics)
    - '8125:8125/udp'
    # Prometheus metrics endpoint
    - '8889:8889'
    # Health check
    - '13133:13133'
  healthcheck:
    # Probes the health-check port from inside the container:
    # every 5s, 3s timeout, up to 10 attempts.
    # NOTE(review): the upstream collector-contrib image is, as far as I
    # know, built FROM scratch and ships neither curl nor a shell. If so,
    # this probe can never succeed and the service stays "unhealthy" —
    # verify, and consider probing port 13133 from the host instead.
    test:
      - 'CMD'
      - 'curl'
      - '-f'
      - 'http://localhost:13133/'
    interval: 5s
    timeout: 3s
    retries: 10
|
|
|
|
# Jaeger all-in-one: trace search UI with OTLP ingestion enabled via
# COLLECTOR_OTLP_ENABLED.
jaeger:
  image: jaegertracing/all-in-one:latest
  environment:
    - COLLECTOR_OTLP_ENABLED=true
  ports:
    - "16686:16686" # Jaeger UI
    - "14250:14250" # gRPC
  networks:
    - workload-net
  healthcheck:
    # The probe runs INSIDE the container. The all-in-one image is a minimal
    # Alpine-based image that is not expected to include curl, so use the
    # BusyBox `wget` applet instead. `-O /dev/null` issues a plain GET and
    # discards the body; wget exits non-zero on connection or HTTP errors.
    test: ["CMD", "wget", "-q", "-O", "/dev/null", "http://localhost:16686/"]
    interval: 5s
    timeout: 3s
    retries: 10
|
|
|
|
# Grafana Tempo trace backend (pinned to 2.7.2).
# Reads its configuration from the read-only bind mount of ../tempo.yaml
# and keeps its data in the named volume `tempo-data`.
tempo:
  image: grafana/tempo:2.7.2
  networks:
    - workload-net
  volumes:
    - '../tempo.yaml:/etc/tempo.yaml:ro'
    - 'tempo-data:/var/tempo'
  command:
    - '-config.file=/etc/tempo.yaml'
  ports:
    # Tempo HTTP API
    - '3200:3200'
|
|
|
|
# Prometheus metrics store. Scrape configuration is bind-mounted read-only
# from ../prometheus.yml.
prometheus:
  image: prom/prometheus:latest
  ports:
    - "9090:9090"
  volumes:
    - ../prometheus.yml:/etc/prometheus/prometheus.yml:ro
  depends_on:
    otel-collector:
      # Only order startup after the collector; do not gate on its health.
      # The collector's healthcheck shells out to curl, which the upstream
      # collector image is not expected to contain, so `service_healthy`
      # could block Prometheus from ever starting. Prometheus retries failed
      # scrapes on its own, so it does not need the target up at launch.
      condition: service_started
  networks:
    - workload-net
  healthcheck:
    # The probe runs INSIDE the container. prom/prometheus is BusyBox-based
    # and has no curl, so use the BusyBox `wget` applet. `-O /dev/null`
    # issues a plain GET and discards the body.
    test: ["CMD", "wget", "-q", "-O", "/dev/null", "http://localhost:9090/-/healthy"]
    interval: 5s
    timeout: 3s
    retries: 10
|
|
|
|
# Loki log aggregation (pinned to 2.9.4), started with the image's bundled
# /etc/loki/local-config.yaml.
loki:
  image: grafana/loki:2.9.4
  ports:
    - "3100:3100" # Loki HTTP API
  command: ["-config.file=/etc/loki/local-config.yaml"]
  networks:
    - workload-net
  healthcheck:
    # The probe runs INSIDE the container. The 2.9.x Loki image is
    # Alpine-based and is not expected to include curl, so use the BusyBox
    # `wget` applet. `-O /dev/null` issues a plain GET and discards the body;
    # /ready returns a non-2xx status until Loki is ready, which makes wget
    # exit non-zero.
    test: ["CMD", "wget", "-q", "-O", "/dev/null", "http://localhost:3100/ready"]
    interval: 5s
    timeout: 3s
    retries: 10
|
|
|
|
# Grafana UI for dashboards and trace/log exploration.
# Provisioning files and dashboards are bind-mounted read-only from
# ../grafana/, and the service is started after all four backends
# (jaeger, tempo, prometheus, loki).
grafana:
  image: grafana/grafana:latest
  networks:
    - workload-net
  depends_on:
    - jaeger
    - tempo
    - prometheus
    - loki
  environment:
    # Anonymous visitors are let in and given the Admin org role.
    # NOTE(review): together with the published port 3000 this grants
    # unauthenticated admin access to anyone who can reach the host —
    # confirm the harness only ever runs on trusted/local machines.
    GF_AUTH_ANONYMOUS_ENABLED: 'true'
    GF_AUTH_ANONYMOUS_ORG_ROLE: 'Admin'
  volumes:
    - '../grafana/provisioning:/etc/grafana/provisioning:ro'
    - '../grafana/dashboards:/var/lib/grafana/dashboards:ro'
  ports:
    - '3000:3000'
  healthcheck:
    # Polls Grafana's /api/health endpoint from inside the container:
    # every 5s, 3s timeout, up to 10 attempts.
    test:
      - 'CMD'
      - 'curl'
      - '-f'
      - 'http://localhost:3000/api/health'
    interval: 5s
    timeout: 3s
    retries: 10
|
|
|
|
# Named volumes. `tempo-data` backs /var/tempo in the tempo service and is
# declared with default settings (explicit empty mapping rather than a bare
# null value).
volumes:
  tempo-data: {}
|
|
|
|
# Single user-defined bridge network that every service in this file joins.
networks:
  workload-net:
    driver: 'bridge'
|