From 67e061fb3b2f3bb08b478ac8ecda4edfb2c950fb Mon Sep 17 00:00:00 2001
From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com>
Date: Thu, 12 Mar 2026 22:08:47 +0000
Subject: [PATCH] Remove Docker healthchecks from workload compose

The container images (otel-collector-contrib, jaeger, etc.) don't
reliably include curl or wget, so the healthcheck commands failed.
Those failures caused `docker compose up -d` to block on the
collector's service_healthy condition, preventing Prometheus and
Grafana from starting.

The orchestrator script (run-full-validation.sh) already polls each
service endpoint from the host with its own curl-based readiness
loops, so Docker-level healthchecks are redundant.

Also remove the obsolete top-level `version: "3.8"` key, which
Docker Compose v2 ignores.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 docker/telemetry/docker-compose.workload.yaml | 34 +++----------------
 1 file changed, 5 insertions(+), 29 deletions(-)

diff --git a/docker/telemetry/docker-compose.workload.yaml b/docker/telemetry/docker-compose.workload.yaml
index f46de310c5..a80ede4c57 100644
--- a/docker/telemetry/docker-compose.workload.yaml
+++ b/docker/telemetry/docker-compose.workload.yaml
@@ -20,8 +20,10 @@
 #   - xrpld binary built with -DXRPL_ENABLE_TELEMETRY=ON
 #   - Validator keys generated via generate-validator-keys.sh
 #   - Node configs generated by run-full-validation.sh
-
-version: "3.8"
+#
+# Note: No Docker healthchecks are defined here. The orchestrator script
+# (run-full-validation.sh) polls each service endpoint directly from the
+# host, which avoids issues with missing curl/wget in container images.
 
 services:
   # ---------------------------------------------------------------------------
@@ -46,11 +48,6 @@ services:
       - tempo
     networks:
       - workload-net
-    healthcheck:
-      test: ["CMD", "wget", "--spider", "-q", "http://localhost:13133/"]
-      interval: 5s
-      timeout: 3s
-      retries: 10
 
   jaeger:
     image: jaegertracing/all-in-one:latest
@@ -61,11 +58,6 @@ services:
       - "14250:14250" # gRPC
     networks:
       - workload-net
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:16686/"]
-      interval: 5s
-      timeout: 3s
-      retries: 10
 
   tempo:
     image: grafana/tempo:2.7.2
@@ -85,15 +77,9 @@ services:
     volumes:
       - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
     depends_on:
-      otel-collector:
-        condition: service_healthy
+      - otel-collector
     networks:
       - workload-net
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:9090/-/healthy"]
-      interval: 5s
-      timeout: 3s
-      retries: 10
 
   loki:
     image: grafana/loki:3.4.2
@@ -102,11 +88,6 @@ services:
     command: ["-config.file=/etc/loki/local-config.yaml"]
     networks:
       - workload-net
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:3100/ready"]
-      interval: 5s
-      timeout: 3s
-      retries: 10
 
   grafana:
     image: grafana/grafana:latest
@@ -125,11 +106,6 @@ services:
       - loki
     networks:
       - workload-net
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:3000/api/health"]
-      interval: 5s
-      timeout: 3s
-      retries: 10
 
 volumes:
   tempo-data: