#!/usr/bin/env bash
# run-full-validation.sh — Orchestrates the full telemetry validation pipeline.
#
# Sequence:
#   1. Start the observability stack (OTel Collector, Tempo, Prometheus, Loki, Grafana)
#   2. Start a multi-node rippled cluster with full telemetry enabled
#   3. Wait for consensus
#   4. Run workload orchestrator (RPC load, TX submission, propagation wait)
#   5. Run the telemetry validation suite
#   6. Capture OTel timings and compare against committed baseline
#   7. (Optional) Run the performance overhead benchmark
#
# Usage:
#   ./run-full-validation.sh --xrpld /path/to/xrpld
#   ./run-full-validation.sh --xrpld /path/to/xrpld --with-benchmark
#   ./run-full-validation.sh --xrpld /path/to/xrpld --skip-regression
#   ./run-full-validation.sh --cleanup
#
# Exit codes:
#   0 — All validation checks and the regression gate passed
#   1 — Validation checks failed OR the regression gate detected a regression
#   2 — Infrastructure error (cluster/stack failed to start, timing capture failed)

set -euo pipefail

# ---------------------------------------------------------------------------
# Colored output helpers
# ---------------------------------------------------------------------------
# log/ok/warn/fail print one colour-tagged "[VALIDATE]" line to stdout.
# die prints the same kind of line to stderr and terminates with status 2,
# the "infrastructure error" code documented in the header.
log() { printf "\033[1;34m[VALIDATE]\033[0m %s\n" "$*"; }
ok() { printf "\033[1;32m[VALIDATE]\033[0m %s\n" "$*"; }
warn() { printf "\033[1;33m[VALIDATE]\033[0m %s\n" "$*"; }
fail() { printf "\033[1;31m[VALIDATE]\033[0m %s\n" "$*"; }
die() {
    printf "\033[1;31m[VALIDATE]\033[0m %s\n" "$*" >&2
    exit 2
}

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Paths are resolved relative to this script so it can be run from any cwd.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TELEMETRY_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
REPO_ROOT="$(cd "$TELEMETRY_DIR/../.." && pwd)"
COMPOSE_FILE="$TELEMETRY_DIR/docker-compose.workload.yaml"
WORKDIR="/tmp/xrpld-validation"
# XRPLD can be preset through the environment; --xrpld overrides it.
XRPLD="${XRPLD:-$REPO_ROOT/.build/xrpld}"
NUM_NODES=5
# Node i (1-based) listens on <base> + i - 1 for each of the three port kinds.
RPC_PORT_BASE=5005
WS_PORT_BASE=6006
PEER_PORT_BASE=51235
RPC_RATE=50
RPC_DURATION=120
TX_TPS=5
TX_DURATION=120
WITH_BENCHMARK=false
SKIP_LOKI=false
SKIP_REGRESSION=false
WORKLOAD_PROFILE="full-validation"
REPORT_DIR="$WORKDIR/reports"
# Rate window handed to Prometheus `rate()` when capturing timings. Keep
# this close to the active workload duration so histogram buckets cover
# the measurement window; longer windows dilute short-lived regressions.
REGRESSION_WINDOW="${REGRESSION_WINDOW:-3m}"
BASELINE_FILE="${BASELINE_FILE:-$SCRIPT_DIR/baselines/baseline-timings.json}"
THRESHOLDS_FILE="${THRESHOLDS_FILE:-$SCRIPT_DIR/regression-thresholds.json}"
METRICS_FILE="${METRICS_FILE:-$SCRIPT_DIR/regression-metrics.json}"
GENESIS_ACCOUNT="rHb9CJAWyB4rj91VRWn96DkukG4bwdtyTh"
GENESIS_SEED="snoPBrXtMeMyMHUVTgbuqAfg1SUTb"

# ---------------------------------------------------------------------------
# Argument parsing
# ---------------------------------------------------------------------------
# usage — print the option summary and exit 0.
usage() {
    echo "Usage: $0 [OPTIONS]"
    echo ""
    echo "Options:"
    echo " --xrpld PATH Path to xrpld binary"
    echo " --nodes NUM Number of validator nodes (default: 5)"
    echo " --rpc-rate RPS RPC load rate (default: 50)"
    echo " --rpc-duration SECS RPC load duration (default: 120)"
    echo " --tx-tps TPS Transaction submit rate (default: 5)"
    echo " --tx-duration SECS Transaction submit duration (default: 120)"
    echo " --profile NAME Workload profile (default: full-validation)"
    echo " --with-benchmark Also run performance overhead benchmark (telemetry off vs on)"
    echo " --skip-loki Skip Loki log-trace correlation checks"
    echo " --skip-regression Skip the OTel-baseline regression gate"
    echo " --cleanup Tear down everything and exit"
    echo " -h, --help Show this help"
    exit 0
}

# need_value OPTION REMAINING_ARGC
#   Abort with a readable message when OPTION is the last word on the command
#   line. Without this check, reading "$2" under `set -u` stops the script
#   with a bare "unbound variable" error that does not name the option.
need_value() {
    [ "$2" -ge 2 ] || die "Option $1 requires a value"
}

while [ $# -gt 0 ]; do
    case "$1" in
    --xrpld)
        need_value "$1" "$#"
        XRPLD="$2"
        shift 2
        ;;
    --nodes)
        need_value "$1" "$#"
        NUM_NODES="$2"
        shift 2
        ;;
    --rpc-rate)
        need_value "$1" "$#"
        RPC_RATE="$2"
        shift 2
        ;;
    --rpc-duration)
        need_value "$1" "$#"
        RPC_DURATION="$2"
        shift 2
        ;;
    --tx-tps)
        need_value "$1" "$#"
        TX_TPS="$2"
        shift 2
        ;;
    --tx-duration)
        need_value "$1" "$#"
        TX_DURATION="$2"
        shift 2
        ;;
    --profile)
        need_value "$1" "$#"
        WORKLOAD_PROFILE="$2"
        shift 2
        ;;
    --with-benchmark)
        WITH_BENCHMARK=true
        shift
        ;;
    --skip-loki)
        SKIP_LOKI=true
        shift
        ;;
    --skip-regression)
        SKIP_REGRESSION=true
        shift
        ;;
    --cleanup)
        # Cleanup mode: acts immediately, so options after --cleanup are ignored.
        log "Cleaning up..."
        # pkill -f matches against full command lines; every step is
        # best-effort so a partially torn-down environment still cleans up.
        pkill -f "$WORKDIR" 2>/dev/null || true
        docker compose -f "$COMPOSE_FILE" down 2>/dev/null || true
        # ${WORKDIR:?} refuses to expand to an empty path in front of rm -rf.
        rm -rf -- "${WORKDIR:?}"
        ok "Cleanup complete."
        exit 0
        ;;
    -h | --help)
        usage
        ;;
    *)
        die "Unknown option: $1"
        ;;
    esac
done

# NUM_NODES feeds `seq` and the port arithmetic later on, so reject anything
# that is not a positive integer before any side effects happen.
case "$NUM_NODES" in
'' | *[!0-9]*) die "--nodes must be a positive integer, got: $NUM_NODES" ;;
esac
[ "$NUM_NODES" -ge 1 ] || die "--nodes must be a positive integer, got: $NUM_NODES"

# ---------------------------------------------------------------------------
# Prerequisites
# ---------------------------------------------------------------------------
log "Checking prerequisites..."
[ -x "$XRPLD" ] || die "xrpld binary not found: $XRPLD"
command -v docker >/dev/null 2>&1 || die "docker not found"
docker compose version >/dev/null 2>&1 || die "docker compose (v2) not found"
command -v python3 >/dev/null 2>&1 || die "python3 not found"
command -v curl >/dev/null 2>&1 || die "curl not found"
command -v jq >/dev/null 2>&1 || die "jq not found"
[ -f "$COMPOSE_FILE" ] || die "docker-compose.workload.yaml not found"

# Install Python dependencies. Try pip3 first, then pip; if both fail only
# warn, because the packages may already be installed.
log "Installing Python dependencies..."
pip3 install -q -r "$SCRIPT_DIR/requirements.txt" 2>/dev/null ||
    pip install -q -r "$SCRIPT_DIR/requirements.txt" 2>/dev/null ||
    warn "Could not install Python dependencies — they may already be present"

ok "Prerequisites verified."

# ---------------------------------------------------------------------------
# Cleanup previous run
# ---------------------------------------------------------------------------
log "Cleaning up previous run..."
# Stop processes left over from an earlier run (pkill -f matches against the
# full command line), give them a moment to exit, then recreate the work tree.
pkill -f "$WORKDIR" 2>/dev/null || true
sleep 2
# ${WORKDIR:?} refuses to expand to an empty path in front of rm -rf.
rm -rf -- "${WORKDIR:?}"
mkdir -p "$WORKDIR" "$REPORT_DIR"

# ---------------------------------------------------------------------------
# Step 1: Start observability stack
# ---------------------------------------------------------------------------
log "Step 1: Starting observability stack..."
docker compose -f "$COMPOSE_FILE" up -d

# wait_for_healthy NAME URL
#   Poll URL once per second, at most 30 times, until curl -f reports a
#   successful HTTP response. Calls die (exit 2) if the 30th attempt fails.
wait_for_healthy() {
    local name="$1" url="$2" attempt
    log "Waiting for $name..."
    for attempt in $(seq 1 30); do
        if curl -sf "$url" >/dev/null 2>&1; then
            ok "$name ready (attempt $attempt)"
            return 0
        fi
        [ "$attempt" -eq 30 ] && die "$name not ready after 30s"
        sleep 1
    done
}

# The collector check accepts ANY HTTP status: it only needs proof that
# something is answering on the OTLP/HTTP port.
log "Waiting for OTel Collector..."
for attempt in $(seq 1 30); do
    # On a failed connection curl still prints "000" for %{http_code} and
    # exits non-zero. Assigning the fallback here (rather than echoing it
    # inside the substitution) keeps the value at exactly "000"; appending
    # a second "000" would make the comparison below succeed while the
    # collector is still down.
    status=$(curl -so /dev/null -w '%{http_code}' http://localhost:4318/ 2>/dev/null) || status=000
    if [ "$status" != "000" ]; then
        ok "OTel Collector ready (attempt $attempt)"
        break
    fi
    [ "$attempt" -eq 30 ] && die "OTel Collector not ready after 30s"
    sleep 1
done

wait_for_healthy "Tempo" "http://localhost:3200/ready"
wait_for_healthy "Prometheus" "http://localhost:9090/-/healthy"

# ---------------------------------------------------------------------------
# Step 2: Generate validator keys and start cluster
# ---------------------------------------------------------------------------
log "Step 2: Starting $NUM_NODES-node validator cluster..."

bash "$SCRIPT_DIR/generate-validator-keys.sh" "$XRPLD" "$NUM_NODES" "$WORKDIR"

for i in $(seq 1 "$NUM_NODES"); do
    NODE_DIR="$WORKDIR/node$i"
    mkdir -p "$NODE_DIR/nudb" "$NODE_DIR/db"

    # Node i uses <base> + i - 1 for each port kind.
    RPC_PORT=$((RPC_PORT_BASE + i - 1))
    WS_PORT=$((WS_PORT_BASE + i - 1))
    PEER_PORT=$((PEER_PORT_BASE + i - 1))
    # validator-keys.json is indexed from 0, hence i - 1.
    SEED=$(jq -r ".[$((i - 1))].seed" "$WORKDIR/validator-keys.json")

    # Build ips_fixed: the peer address of every node except this one.
    # NOTE(review): entries are joined with a single space, as in the reviewed
    # source; confirm this is the separator the node config expects.
    IPS_FIXED=""
    for j in $(seq 1 "$NUM_NODES"); do
        if [ "$j" -ne "$i" ]; then
            IPS_FIXED="${IPS_FIXED}127.0.0.1 $((PEER_PORT_BASE + j - 1)) "
        fi
    done

    # NOTE(review): the next statement looks truncated in the reviewed source.
    # It presumably was a here-document that writes xrpld.cfg (using RPC_PORT,
    # WS_PORT, PEER_PORT, SEED and IPS_FIXED) followed by the xrpld launch
    # command redirected to stdout.log. It is reproduced unchanged here;
    # restore the full text from version control before relying on this step.
    cat >"$NODE_DIR/xrpld.cfg" <"$NODE_DIR/stdout.log" 2>&1 &
    echo "$!" >"$NODE_DIR/xrpld.pid"
    log " Node $i: RPC=$RPC_PORT WS=$WS_PORT Peer=$PEER_PORT PID=$!"
done

# ---------------------------------------------------------------------------
# Step 3: Wait for consensus
# ---------------------------------------------------------------------------
log "Step 3: Waiting for consensus..."
# Poll every node's server_info once per second for up to 120 attempts and
# count how many report server_state "proposing". A timeout only warns; the
# run continues with whatever nodes are ready.
for attempt in $(seq 1 120); do
    ready=0
    for i in $(seq 1 "$NUM_NODES"); do
        port=$((RPC_PORT_BASE + i - 1))
        # An unreachable node or unparsable reply yields an empty state.
        state=$(curl -sf "http://localhost:$port" \
            -d '{"method":"server_info"}' 2>/dev/null |
            jq -r '.result.info.server_state' 2>/dev/null || echo "")
        if [ "$state" = "proposing" ]; then
            ready=$((ready + 1))
        fi
    done
    if [ "$ready" -ge "$NUM_NODES" ]; then
        ok "All $NUM_NODES nodes proposing (attempt $attempt)"
        break
    fi
    if [ "$attempt" -eq 120 ]; then
        warn "Consensus timeout — $ready/$NUM_NODES nodes ready"
    fi
    # \r rewrites the same terminal line on every poll.
    printf "\r %d/%d nodes proposing..." "$ready" "$NUM_NODES"
    sleep 1
done
echo ""

# Wait for first validated ledger (asked of the first node only).
log "Waiting for validated ledger..."
for attempt in $(seq 1 60); do
    val_seq=$(curl -sf "http://localhost:$RPC_PORT_BASE" \
        -d '{"method":"server_info"}' 2>/dev/null |
        jq -r '.result.info.validated_ledger.seq // 0' 2>/dev/null || echo 0)
    # 2>/dev/null hides the "integer expression expected" noise if val_seq
    # is ever non-numeric; the test then simply fails and polling continues.
    if [ "$val_seq" -gt 2 ] 2>/dev/null; then
        ok "Validated ledger: seq $val_seq"
        break
    fi
    [ "$attempt" -eq 60 ] && warn "No validated ledger after 60s"
    sleep 1
done

# ---------------------------------------------------------------------------
# Step 4: Run workload orchestrator
# ---------------------------------------------------------------------------
log "Step 4: Running workload orchestrator (profile: $WORKLOAD_PROFILE)..."
# One WebSocket URL per node. An array keeps every URL a separate argument
# without depending on unquoted word splitting.
WS_ENDPOINTS=()
for i in $(seq 1 "$NUM_NODES"); do
    WS_ENDPOINTS+=("ws://localhost:$((WS_PORT_BASE + i - 1))")
done

# A non-zero orchestrator exit is only a warning: the validation suite below
# decides whether the run passes.
# The ${arr[@]+"${arr[@]}"} form expands to nothing for an empty array, which
# avoids an "unbound variable" error under `set -u` on older bash releases.
python3 "$SCRIPT_DIR/workload_orchestrator.py" \
    --profile "$WORKLOAD_PROFILE" \
    --endpoints ${WS_ENDPOINTS[@]+"${WS_ENDPOINTS[@]}"} \
    --report "$REPORT_DIR/workload-report.json" \
    --report-dir "$REPORT_DIR" ||
    warn "Workload orchestrator returned non-zero exit"

ok "Workload orchestration complete."

# ---------------------------------------------------------------------------
# Step 5: Run telemetry validation suite
# ---------------------------------------------------------------------------
log "Step 5: Running telemetry validation suite..."

# Built as an array so the report path stays one argument even if it ever
# contains whitespace or glob characters.
VALIDATION_ARGS=(--report "$REPORT_DIR/validation-report.json")
if [ "$SKIP_LOKI" = true ]; then
    VALIDATION_ARGS+=(--skip-loki)
fi

# `|| VALIDATION_EXIT=$?` records the status without tripping `set -e`, so
# the remaining steps and the summary still run after a failed validation.
VALIDATION_EXIT=0
python3 "$SCRIPT_DIR/validate_telemetry.py" "${VALIDATION_ARGS[@]}" || VALIDATION_EXIT=$?

if [ "$VALIDATION_EXIT" -eq 0 ]; then
    ok "All telemetry validation checks passed!"
else
    fail "Some telemetry validation checks failed (exit $VALIDATION_EXIT)"
fi

# ---------------------------------------------------------------------------
# Step 6: Capture OTel timings and run the regression comparison
# ---------------------------------------------------------------------------
# Unless --skip-regression is given, this step captures timings first (so CI
# has an artifact from which to bootstrap/refresh the committed baseline).
# The comparator then either:
#   - prints the paste-me JSON when the baseline is a placeholder, or
#   - enforces thresholds and fails the run on regression.
# --skip-regression skips both the capture and the comparison (e.g. for
# ad-hoc local exploration).
TIMINGS_FILE="$REPORT_DIR/timings.json"
REGRESSION_REPORT="$REPORT_DIR/regression-report.json"
REGRESSION_EXIT=0

if [ "$SKIP_REGRESSION" != true ]; then
    log "Step 6: Capturing OTel timings from Prometheus..."
    if python3 "$SCRIPT_DIR/capture_timings.py" \
        --prometheus "http://localhost:9090" \
        --metrics "$METRICS_FILE" \
        --output "$TIMINGS_FILE" \
        --window "$REGRESSION_WINDOW" \
        --profile "$WORKLOAD_PROFILE"; then
        ok "Timings captured: $TIMINGS_FILE"
    else
        fail "Failed to capture timings — skipping regression comparison."
        # Without timings there is nothing to compare: record exit code 2 and
        # reuse the skip flag so the comparison below is bypassed.
        REGRESSION_EXIT=2
        SKIP_REGRESSION=true
    fi
fi

if [ "$SKIP_REGRESSION" != true ]; then
    log "Comparing against baseline $BASELINE_FILE..."
    python3 "$SCRIPT_DIR/compare_to_baseline.py" \
        --timings "$TIMINGS_FILE" \
        --baseline "$BASELINE_FILE" \
        --thresholds "$THRESHOLDS_FILE" \
        --report "$REGRESSION_REPORT" || REGRESSION_EXIT=$?
    if [ "$REGRESSION_EXIT" -eq 0 ]; then
        ok "Regression gate passed (or baseline placeholder — paste JSON printed above)."
    elif [ "$REGRESSION_EXIT" -eq 1 ]; then
        fail "Regression detected — see $REGRESSION_REPORT"
    else
        fail "Regression comparator internal error (exit $REGRESSION_EXIT)"
    fi
else
    warn "Regression gate skipped."
fi

# ---------------------------------------------------------------------------
# Step 7: (Optional) Run overhead benchmark
# ---------------------------------------------------------------------------
if [ "$WITH_BENCHMARK" = true ]; then
    log "Step 7: Running performance benchmark..."
    # A benchmark failure is reported but does not affect the exit status.
    bash "$SCRIPT_DIR/benchmark.sh" \
        --xrpld "$XRPLD" \
        --duration 120 \
        --nodes 3 \
        --output "$REPORT_DIR" ||
        warn "Benchmark returned non-zero exit"
fi

# ---------------------------------------------------------------------------
# Summary
# ---------------------------------------------------------------------------
echo ""
echo "==========================================================="
echo " FULL VALIDATION RESULTS"
echo "==========================================================="
echo ""
echo " Reports directory: $REPORT_DIR"
echo ""
ls -la "$REPORT_DIR/" 2>/dev/null || true
echo ""
echo " Observability stack is running:"
echo " Tempo: http://localhost:3200"
echo " Grafana: http://localhost:3000"
echo " Prometheus: http://localhost:9090"
echo ""
echo " xrpld nodes ($NUM_NODES) are running:"
for i in $(seq 1 "$NUM_NODES"); do
    rpc=$((RPC_PORT_BASE + i - 1))
    ws=$((WS_PORT_BASE + i - 1))
    # The pid file is written when the node is started; fall back to
    # "unknown" if it cannot be read.
    pid=$(cat "$WORKDIR/node$i/xrpld.pid" 2>/dev/null || echo 'unknown')
    echo " Node $i: RPC=$rpc WS=$ws PID=$pid"
done
echo ""
echo " To tear down:"
echo " $0 --cleanup"
echo ""
echo "==========================================================="

# Fail the run if EITHER validation or the regression gate failed. The
# validation status takes precedence when both are non-zero. The numeric
# `-ne` / `-eq` tests are safe here because exit statuses are always integers.
FINAL_EXIT=0
if [ "$VALIDATION_EXIT" -ne 0 ]; then
    FINAL_EXIT="$VALIDATION_EXIT"
fi
if [ "$REGRESSION_EXIT" -ne 0 ] && [ "$FINAL_EXIT" -eq 0 ]; then
    FINAL_EXIT="$REGRESSION_EXIT"
fi
exit "$FINAL_EXIT"