From 67e061fb3b2f3bb08b478ac8ecda4edfb2c950fb Mon Sep 17 00:00:00 2001
From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com>
Date: Thu, 12 Mar 2026 22:08:47 +0000
Subject: [PATCH] Remove Docker healthchecks from workload compose

The container images (otel-collector-contrib, jaeger, etc.) don't
reliably include curl or wget. The healthcheck failures caused
docker compose up -d to block on the collector's service_healthy
condition, preventing Prometheus and Grafana from starting.

The orchestrator script (run-full-validation.sh) already polls each
service endpoint from the host with its own curl-based readiness loops,
so Docker-level healthchecks are redundant.

Also removed the obsolete `version: "3.8"` key (docker compose v2).

Co-Authored-By: Claude Opus 4.6
---
 docker/telemetry/docker-compose.workload.yaml | 34 +++----------------
 1 file changed, 5 insertions(+), 29 deletions(-)

diff --git a/docker/telemetry/docker-compose.workload.yaml b/docker/telemetry/docker-compose.workload.yaml
index f46de310c5..a80ede4c57 100644
--- a/docker/telemetry/docker-compose.workload.yaml
+++ b/docker/telemetry/docker-compose.workload.yaml
@@ -20,8 +20,10 @@
 # - xrpld binary built with -DXRPL_ENABLE_TELEMETRY=ON
 # - Validator keys generated via generate-validator-keys.sh
 # - Node configs generated by run-full-validation.sh
-
-version: "3.8"
+#
+# Note: No Docker healthchecks are defined here. The orchestrator script
+# (run-full-validation.sh) polls each service endpoint directly from the
+# host, which avoids issues with missing curl/wget in container images.
 
 services:
   # ---------------------------------------------------------------------------
@@ -46,11 +48,6 @@ services:
       - tempo
     networks:
       - workload-net
-    healthcheck:
-      test: ["CMD", "wget", "--spider", "-q", "http://localhost:13133/"]
-      interval: 5s
-      timeout: 3s
-      retries: 10
 
   jaeger:
     image: jaegertracing/all-in-one:latest
@@ -61,11 +58,6 @@ services:
       - "14250:14250" # gRPC
     networks:
       - workload-net
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:16686/"]
-      interval: 5s
-      timeout: 3s
-      retries: 10
 
   tempo:
     image: grafana/tempo:2.7.2
@@ -85,15 +77,9 @@ services:
     volumes:
       - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
     depends_on:
-      otel-collector:
-        condition: service_healthy
+      - otel-collector
     networks:
       - workload-net
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:9090/-/healthy"]
-      interval: 5s
-      timeout: 3s
-      retries: 10
 
   loki:
     image: grafana/loki:3.4.2
@@ -102,11 +88,6 @@ services:
     command: ["-config.file=/etc/loki/local-config.yaml"]
     networks:
       - workload-net
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:3100/ready"]
-      interval: 5s
-      timeout: 3s
-      retries: 10
 
   grafana:
     image: grafana/grafana:latest
@@ -125,11 +106,6 @@ services:
       - loki
     networks:
       - workload-net
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:3000/api/health"]
-      interval: 5s
-      timeout: 3s
-      retries: 10
 
 volumes:
   tempo-data: