Files
rippled/docker/telemetry/docker-compose.workload.yaml
2026-03-31 22:32:02 +01:00

116 lines
3.2 KiB
YAML

# Docker Compose workload harness for Phase 10 telemetry validation.
#
# Runs a 5-node validator cluster with a full OTel telemetry stack:
#   - 5 rippled validator nodes (consensus network)
#   - OTel Collector (traces + StatsD metrics)
#   - Jaeger (trace search UI)
#   - Tempo (production trace backend)
#   - Prometheus (metrics)
#   - Loki (log aggregation for log-trace correlation)
#   - Grafana (dashboards + trace/log exploration)
#
# Usage:
#   # Start the harness (requires a pre-built xrpld image or a mounted binary):
#   docker compose -f docker/telemetry/docker-compose.workload.yaml up -d
#
#   # Or use the orchestrator:
#   docker/telemetry/workload/run-full-validation.sh
#
# Prerequisites:
# - xrpld binary built with -DXRPL_ENABLE_TELEMETRY=ON
# - Validator keys generated via generate-validator-keys.sh
# - Node configs generated by run-full-validation.sh
#
# Note: No Docker healthchecks are defined here. The orchestrator script
# (run-full-validation.sh) polls each service endpoint directly from the
# host, which avoids issues with missing curl/wget in container images.
services:
  # ---------------------------------------------------------------------------
  # Telemetry Backend Stack
  # ---------------------------------------------------------------------------
  # All services below join the `workload-net` bridge network and are
  # published on fixed host ports. Relative bind-mount paths (./...) are
  # resolved against the directory containing this compose file.
  #
  # NOTE(review): only telemetry backend services are defined under
  # `services:`; no validator node services appear here. Presumably the
  # nodes are launched separately by run-full-validation.sh -- confirm.
  #
  # NOTE(review): otel-collector, jaeger, prometheus and grafana use the
  # floating `latest` tag while tempo and loki are pinned. Consider pinning
  # all images so validation runs are reproducible.

  # OpenTelemetry Collector: receives OTLP traces and StatsD metrics and
  # exposes a Prometheus metrics endpoint.
  otel-collector:
    image: otel/opentelemetry-collector-contrib:latest
    command: ["--config=/etc/otel-collector-config.yaml"]
    ports:
      - "4317:4317"  # OTLP gRPC
      - "4318:4318"  # OTLP HTTP
      - "8125:8125/udp"  # StatsD UDP (beast::insight metrics)
      - "8889:8889"  # Prometheus metrics endpoint
      - "13133:13133"  # Health check
    volumes:
      - ./otel-collector-config.yaml:/etc/otel-collector-config.yaml:ro
      # Mount the validation workdir so filelog receiver can tail node logs.
      - /tmp/xrpld-validation:/var/log/rippled:ro
    # Start-order only: no healthchecks are defined in this file, so this
    # does not wait for jaeger/tempo to be ready to accept data.
    depends_on:
      - jaeger
      - tempo
    networks:
      - workload-net

  # Jaeger all-in-one: trace search UI.
  jaeger:
    image: jaegertracing/all-in-one:latest
    environment:
      # Presumably lets the collector export to Jaeger over OTLP -- confirm
      # against the exporters in otel-collector-config.yaml.
      - COLLECTOR_OTLP_ENABLED=true
    ports:
      - "16686:16686"  # Jaeger UI
      - "14250:14250"  # gRPC
    networks:
      - workload-net

  # Tempo: trace backend; trace data is kept in the `tempo-data` named volume.
  tempo:
    image: grafana/tempo:2.7.2
    command: ["-config.file=/etc/tempo.yaml"]
    ports:
      - "3200:3200"  # Tempo HTTP API
    volumes:
      - ./tempo.yaml:/etc/tempo.yaml:ro
      - tempo-data:/var/tempo
    networks:
      - workload-net

  # Prometheus: metrics store, configured by the mounted prometheus.yml.
  prometheus:
    image: prom/prometheus:latest
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
    # Start-order only; presumably scrapes the collector's 8889 endpoint --
    # confirm in prometheus.yml.
    depends_on:
      - otel-collector
    networks:
      - workload-net

  # Loki: log aggregation.
  loki:
    image: grafana/loki:3.4.2
    ports:
      - "3100:3100"  # Loki HTTP API
    # No config file is mounted for this service, so the path below must
    # already exist inside the image.
    command: ["-config.file=/etc/loki/local-config.yaml"]
    networks:
      - workload-net

  # Grafana: dashboards and trace/log exploration, provisioned from the
  # mounted ./grafana directories.
  grafana:
    image: grafana/grafana:latest
    environment:
      # Anonymous visitors get the Admin role: acceptable for a local test
      # harness only; do not expose port 3000 beyond the host.
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
    ports:
      - "3000:3000"
    volumes:
      - ./grafana/provisioning:/etc/grafana/provisioning:ro
      - ./grafana/dashboards:/var/lib/grafana/dashboards:ro
    # Start-order only (no readiness wait).
    depends_on:
      - jaeger
      - tempo
      - prometheus
      - loki
    networks:
      - workload-net

# Named volume backing Tempo's /var/tempo directory.
volumes:
  tempo-data:

# Single bridge network shared by every service in this file.
networks:
  workload-net:
    driver: bridge