Files
rippled/docker/telemetry/workload/run-full-validation.sh
Pratik Mankawde 577d1f8a21 fix: address review findings in regression gate
- capture_timings.py: fail when captured/total ratio < 50%
  (--min-capture-ratio). Prevents silent pass on unreachable Prometheus.
- run-full-validation.sh: set REGRESSION_EXIT=2 on capture failure so
  the final exit code reflects it. Update exit code docs in header.
- compare_to_baseline.py: extract _skip_delta helper to bring
  compute_delta under 80 lines. Fix 0.0-as-falsy bug in abs_bound
  resolution (use explicit None check instead of `or`). Remove dead
  variable override_prefix_key.
- prom_queries.py: extract _build_simple_entries and _build_job_entries
  to bring build_query_plan under 80 lines. Fix module docstring return
  type example. Use aiohttp.ClientTimeout instead of bare int.
- telemetry-validation.yml: add set -euo pipefail to regression summary
  step; guard jq calls with -e flag and fallback; fail on missing
  baseline file; emit ::warning annotation when timings.json missing.
- baselines/README.md: document the placeholder field.
2026-04-24 19:36:15 +01:00

464 lines
16 KiB
Bash
Executable File

#!/usr/bin/env bash
# run-full-validation.sh — Orchestrates the full telemetry validation pipeline.
#
# Sequence:
# 1. Start the observability stack (OTel Collector, Tempo, Prometheus, Loki, Grafana)
# 2. Start a multi-node rippled cluster with full telemetry enabled
# 3. Wait for consensus
# 4. Run workload orchestrator (RPC load, TX submission, propagation wait)
# 5. Run the telemetry validation suite
# 6. Capture OTel timings and compare against committed baseline
# 7. (Optional) Run the performance overhead benchmark
#
# Usage:
# ./run-full-validation.sh --xrpld /path/to/xrpld
# ./run-full-validation.sh --xrpld /path/to/xrpld --with-benchmark
# ./run-full-validation.sh --xrpld /path/to/xrpld --skip-regression
# ./run-full-validation.sh --cleanup
#
# Exit codes:
# 0 — All validation checks and the regression gate passed
# 1 — Validation checks failed OR the regression gate detected a regression
# 2 — Infrastructure error (cluster/stack failed to start, timing capture failed)
# Strict mode: exit on an unhandled command failure (-e), treat expansion of
# an unset variable as an error (-u), and make a pipeline fail when any of
# its stages fails (pipefail).
set -euo pipefail
# ---------------------------------------------------------------------------
# Colored output helpers
# ---------------------------------------------------------------------------
# Shared formatter behind the helpers below: prints one "[VALIDATE]"-tagged
# line to stdout, with the tag drawn in the given bold ANSI colour.
#   $1   - ANSI colour number (34 blue, 32 green, 33 yellow, 31 red)
#   $2.. - message words, joined the same way "$*" joins them
_validate_line() {
  local colour=$1
  shift
  printf "\033[1;%sm[VALIDATE]\033[0m %s\n" "$colour" "$*"
}

# Progress message (blue tag).
log() { _validate_line 34 "$@"; }

# Success message (green tag).
ok() { _validate_line 32 "$@"; }

# Non-fatal warning (yellow tag).
warn() { _validate_line 33 "$@"; }

# Failure message (red tag); does not stop the script.
fail() { _validate_line 31 "$@"; }

# Fatal error: red-tagged message on stderr, then exit with status 2.
die() {
  _validate_line 31 "$@" >&2
  exit 2
}
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# --- Paths, resolved from this script's own location ----------------------
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
TELEMETRY_DIR=$(cd "$SCRIPT_DIR/.." && pwd)
REPO_ROOT=$(cd "$TELEMETRY_DIR/../.." && pwd)
COMPOSE_FILE="$TELEMETRY_DIR/docker-compose.workload.yaml"

# --- Scratch area for node data and generated reports ---------------------
WORKDIR="/tmp/xrpld-validation"
REPORT_DIR="$WORKDIR/reports"

# --- Binary under test: an XRPLD value from the environment wins ----------
: "${XRPLD:=$REPO_ROOT/.build/xrpld}"

# --- Cluster shape: node N (1-based) uses <base> + N - 1 for each port ----
NUM_NODES=5
RPC_PORT_BASE=5005
WS_PORT_BASE=6006
PEER_PORT_BASE=51235

# --- Workload knobs (each has a matching command-line option) -------------
WORKLOAD_PROFILE="full-validation"
RPC_RATE=50
RPC_DURATION=120
TX_TPS=5
TX_DURATION=120

# --- Feature toggles, flipped by command-line flags -----------------------
WITH_BENCHMARK=false
SKIP_LOKI=false
SKIP_REGRESSION=false

# --- Regression gate inputs, each overridable from the environment --------
# REGRESSION_WINDOW is the range handed to Prometheus `rate()` when timings
# are captured. Keep it close to the active workload duration so histogram
# buckets cover the measurement window; longer windows dilute short-lived
# regressions.
: "${REGRESSION_WINDOW:=3m}"
: "${BASELINE_FILE:=$SCRIPT_DIR/baselines/baseline-timings.json}"
: "${THRESHOLDS_FILE:=$SCRIPT_DIR/regression-thresholds.json}"
: "${METRICS_FILE:=$SCRIPT_DIR/regression-metrics.json}"

# --- Genesis account credentials -------------------------------------------
# NOTE(review): neither value is referenced by any later step of this script
# as reviewed — confirm whether they are still needed.
GENESIS_ACCOUNT="rHb9CJAWyB4rj91VRWn96DkukG4bwdtyTh"
GENESIS_SEED="snoPBrXtMeMyMHUVTgbuqAfg1SUTb"
# ---------------------------------------------------------------------------
# Argument parsing
# ---------------------------------------------------------------------------
# Print the command-line help text to stdout, then exit with status 0.
usage() {
  printf '%s\n' \
    "Usage: $0 [OPTIONS]" \
    "" \
    "Options:" \
    " --xrpld PATH Path to xrpld binary" \
    " --nodes NUM Number of validator nodes (default: 5)" \
    " --rpc-rate RPS RPC load rate (default: 50)" \
    " --rpc-duration SECS RPC load duration (default: 120)" \
    " --tx-tps TPS Transaction submit rate (default: 5)" \
    " --tx-duration SECS Transaction submit duration (default: 120)" \
    " --profile NAME Workload profile (default: full-validation)" \
    " --with-benchmark Also run performance overhead benchmark (telemetry off vs on)" \
    " --skip-loki Skip Loki log-trace correlation checks" \
    " --skip-regression Skip the OTel-baseline regression gate" \
    " --cleanup Tear down everything and exit" \
    " -h, --help Show this help"
  exit 0
}
# Abort with a usage error unless a value follows the option being parsed.
# Without this check a trailing value-taking option (e.g. `--nodes` as the
# last argument) expands an unset $2 and, under `set -u`, stops the script
# with a raw "unbound variable" message.
#   $1 - option name, used in the error message
#   $2 - number of arguments still to be parsed, including the option itself
require_value() {
  [ "$2" -ge 2 ] || die "Option $1 requires a value"
}

# Parse the command-line options into the configuration globals.
#   --cleanup tears everything down and exits 0 as soon as it is reached;
#   -h/--help prints the usage text and exits 0;
#   a missing value, an invalid --nodes value, or an unknown option aborts
#   through die.
# NOTE(review): RPC_RATE, RPC_DURATION, TX_TPS and TX_DURATION are accepted
# here but no later step of this script references them — confirm whether
# they should be forwarded to the workload orchestrator.
parse_args() {
  while [ $# -gt 0 ]; do
    case "$1" in
      --xrpld)
        require_value "$1" "$#"
        XRPLD="$2"
        shift 2
        ;;
      --nodes)
        require_value "$1" "$#"
        # The node count drives every per-node loop and port computation,
        # so reject anything that is not a positive decimal integer.
        case "$2" in
          '' | *[!0-9]* | 0*) die "--nodes expects a positive integer, got: $2" ;;
        esac
        NUM_NODES="$2"
        shift 2
        ;;
      --rpc-rate)
        require_value "$1" "$#"
        RPC_RATE="$2"
        shift 2
        ;;
      --rpc-duration)
        require_value "$1" "$#"
        RPC_DURATION="$2"
        shift 2
        ;;
      --tx-tps)
        require_value "$1" "$#"
        TX_TPS="$2"
        shift 2
        ;;
      --tx-duration)
        require_value "$1" "$#"
        TX_DURATION="$2"
        shift 2
        ;;
      --profile)
        require_value "$1" "$#"
        WORKLOAD_PROFILE="$2"
        shift 2
        ;;
      --with-benchmark)
        WITH_BENCHMARK=true
        shift
        ;;
      --skip-loki)
        SKIP_LOKI=true
        shift
        ;;
      --skip-regression)
        SKIP_REGRESSION=true
        shift
        ;;
      --cleanup)
        # Cleanup mode: stop processes whose command line mentions the work
        # directory, bring the compose stack down, and delete the work
        # directory. Each step is best-effort.
        log "Cleaning up..."
        pkill -f "$WORKDIR" 2>/dev/null || true
        docker compose -f "$COMPOSE_FILE" down 2>/dev/null || true
        rm -rf "$WORKDIR"
        ok "Cleanup complete."
        exit 0
        ;;
      -h | --help)
        usage
        ;;
      *)
        die "Unknown option: $1"
        ;;
    esac
  done
}

parse_args "$@"
# ---------------------------------------------------------------------------
# Prerequisites
# ---------------------------------------------------------------------------
log "Checking prerequisites..."

# The binary under test must exist and be executable.
if [ ! -x "$XRPLD" ]; then
  die "xrpld binary not found: $XRPLD"
fi

# Docker itself, then the compose v2 subcommand.
if ! command -v docker >/dev/null 2>&1; then
  die "docker not found"
fi
if ! docker compose version >/dev/null 2>&1; then
  die "docker compose (v2) not found"
fi

# Remaining command-line tools used by later steps.
for required_tool in python3 curl jq; do
  if ! command -v "$required_tool" >/dev/null 2>&1; then
    die "$required_tool not found"
  fi
done

if [ ! -f "$COMPOSE_FILE" ]; then
  die "docker-compose.workload.yaml not found"
fi

# Install Python dependencies: try pip3, then pip; if both fail only warn,
# since the packages may already be installed.
log "Installing Python dependencies..."
requirements_file="$SCRIPT_DIR/requirements.txt"
if ! pip3 install -q -r "$requirements_file" 2>/dev/null; then
  if ! pip install -q -r "$requirements_file" 2>/dev/null; then
    warn "Could not install Python dependencies — they may already be present"
  fi
fi
ok "Prerequisites verified."
# ---------------------------------------------------------------------------
# Cleanup previous run
# ---------------------------------------------------------------------------
log "Cleaning up previous run..."
# Signal any process whose command line mentions the work directory. pkill
# exits non-zero when nothing matches, which must not abort the script.
if ! pkill -f "$WORKDIR" 2>/dev/null; then
  :
fi
# Presumably gives signalled processes time to exit before their data
# directory is removed.
sleep 2
rm -rf "$WORKDIR"
# Recreate an empty work directory together with its reports directory.
mkdir -p "$WORKDIR" "$REPORT_DIR"
# ---------------------------------------------------------------------------
# Step 1: Start observability stack
# ---------------------------------------------------------------------------
# Print the HTTP status code returned by the URL in $1, or 000 when no HTTP
# response was received.
#
# curl's --write-out already prints "000" for a failed transfer, so the
# fallback must REPLACE the captured value. Appending one, as in
# `$(curl ... -w '%{http_code}' || echo 000)`, yields "000000", which
# compares unequal to "000" and makes a dead endpoint look alive.
probe_http_code() {
  local code
  code=$(curl -so /dev/null -w '%{http_code}' "$1" 2>/dev/null) || code=000
  printf '%s\n' "${code:-000}"
}

# Poll a service once per second, for at most 30 attempts, and abort through
# die if it never becomes ready.
#   $1 - service name used in the log messages
#   $2 - URL to probe
#   $3 - readiness rule (optional):
#          any-response  any HTTP status counts as ready (the endpoint only
#                        has to answer)
#          success       (default) `curl -f` must succeed
wait_for_service() {
  local name=$1 url=$2 rule=${3:-success}
  local attempt ready
  log "Waiting for $name..."
  for ((attempt = 1; attempt <= 30; attempt++)); do
    ready=false
    if [ "$rule" = "any-response" ]; then
      if [ "$(probe_http_code "$url")" != "000" ]; then
        ready=true
      fi
    elif curl -sf "$url" >/dev/null 2>&1; then
      ready=true
    fi
    if [ "$ready" = true ]; then
      ok "$name ready (attempt $attempt)"
      return 0
    fi
    if [ "$attempt" -eq 30 ]; then
      die "$name not ready after 30s"
    fi
    sleep 1
  done
}

log "Step 1: Starting observability stack..."
docker compose -f "$COMPOSE_FILE" up -d
wait_for_service "OTel Collector" "http://localhost:4318/" any-response
wait_for_service "Tempo" "http://localhost:3200/ready"
wait_for_service "Prometheus" "http://localhost:9090/-/healthy"
# ---------------------------------------------------------------------------
# Step 2: Generate validator keys and start cluster
# ---------------------------------------------------------------------------
log "Step 2: Starting $NUM_NODES-node validator cluster..."
# Generates the validator key material under $WORKDIR; this step reads
# validator-keys.json and points every node at validators.txt from there.
bash "$SCRIPT_DIR/generate-validator-keys.sh" "$XRPLD" "$NUM_NODES" "$WORKDIR"

for ((i = 1; i <= NUM_NODES; i++)); do
  NODE_DIR="$WORKDIR/node$i"
  mkdir -p "$NODE_DIR/nudb" "$NODE_DIR/db"

  # Node i (1-based) gets base + i - 1 for each of its three ports.
  RPC_PORT=$((RPC_PORT_BASE + i - 1))
  WS_PORT=$((WS_PORT_BASE + i - 1))
  PEER_PORT=$((PEER_PORT_BASE + i - 1))

  # With plain -r, jq prints the literal string "null" and exits 0 when the
  # entry is missing, which would silently configure the node with "null" as
  # its seed. -e makes a null result a non-zero exit so it can be caught.
  SEED=$(jq -er ".[$((i - 1))].seed" "$WORKDIR/validator-keys.json") ||
    die "No validator seed for node $i in $WORKDIR/validator-keys.json"

  # One "host port" line for every other node in the cluster.
  # NOTE(review): the list is written under [ips] below although the variable
  # is named IPS_FIXED — confirm whether [ips_fixed] was intended.
  IPS_FIXED=""
  for ((j = 1; j <= NUM_NODES; j++)); do
    if [ "$j" -ne "$i" ]; then
      IPS_FIXED+="127.0.0.1 $((PEER_PORT_BASE + j - 1))"$'\n'
    fi
  done

  cat >"$NODE_DIR/xrpld.cfg" <<EOCFG
[server]
port_rpc
port_ws
port_peer
[port_rpc]
port = $RPC_PORT
ip = 127.0.0.1
admin = 127.0.0.1
protocol = http
[port_ws]
port = $WS_PORT
ip = 127.0.0.1
admin = 127.0.0.1
protocol = ws
[port_peer]
port = $PEER_PORT
ip = 0.0.0.0
protocol = peer
[node_db]
type=NuDB
path=$NODE_DIR/nudb
online_delete=256
[database_path]
$NODE_DIR/db
[debug_logfile]
$NODE_DIR/debug.log
[validation_seed]
$SEED
[validators_file]
$WORKDIR/validators.txt
[ips]
${IPS_FIXED}
[telemetry]
enabled=1
service_instance_id=validator-${i}
endpoint=http://localhost:4318/v1/traces
exporter=otlp_http
sampling_ratio=1.0
batch_size=512
batch_delay_ms=2000
max_queue_size=2048
trace_rpc=1
trace_transactions=1
trace_consensus=1
trace_peer=1
trace_ledger=1
[insight]
server=statsd
address=127.0.0.1:8125
prefix=rippled
[rpc_startup]
{ "command": "log_level", "severity": "warning" }
[signing_support]
true
[ssl_verify]
0
EOCFG

  # Launch the node in the background; its PID is recorded so the summary
  # at the end of the run can report it.
  "$XRPLD" --conf "$NODE_DIR/xrpld.cfg" --start >"$NODE_DIR/stdout.log" 2>&1 &
  echo $! >"$NODE_DIR/xrpld.pid"
  log " Node $i: RPC=$RPC_PORT WS=$WS_PORT Peer=$PEER_PORT PID=$!"
done
# ---------------------------------------------------------------------------
# Step 3: Wait for consensus
# ---------------------------------------------------------------------------
log "Step 3: Waiting for consensus..."
# Once per second, for up to 120 attempts, ask every node for server_info and
# count how many report the "proposing" state. A timeout only warns; the
# script carries on.
for ((attempt = 1; attempt <= 120; attempt++)); do
  ready=0
  for ((node_index = 0; node_index < NUM_NODES; node_index++)); do
    port=$((RPC_PORT_BASE + node_index))
    # An unreachable node or unparsable reply yields an empty state.
    state=$(curl -sf "http://localhost:$port" \
      -d '{"method":"server_info"}' 2>/dev/null |
      jq -r '.result.info.server_state' 2>/dev/null || echo "")
    case "$state" in
      proposing) ready=$((ready + 1)) ;;
    esac
  done
  if [ "$ready" -ge "$NUM_NODES" ]; then
    ok "All $NUM_NODES nodes proposing (attempt $attempt)"
    break
  fi
  if [ "$attempt" -eq 120 ]; then
    warn "Consensus timeout — $ready/$NUM_NODES nodes ready"
  fi
  # \r rewrites the same terminal line on every attempt.
  printf "\r %d/%d nodes proposing..." "$ready" "$NUM_NODES"
  sleep 1
done
echo ""

# Wait for first validated ledger: poll the first node until its validated
# ledger sequence exceeds 2, for up to 60 attempts; a timeout only warns.
log "Waiting for validated ledger..."
for ((attempt = 1; attempt <= 60; attempt++)); do
  val_seq=$(curl -sf "http://localhost:$RPC_PORT_BASE" \
    -d '{"method":"server_info"}' 2>/dev/null |
    jq -r '.result.info.validated_ledger.seq // 0' 2>/dev/null || echo 0)
  # stderr is discarded so a non-numeric value just counts as "not yet".
  if [ "$val_seq" -gt 2 ] 2>/dev/null; then
    ok "Validated ledger: seq $val_seq"
    break
  fi
  if [ "$attempt" -eq 60 ]; then
    warn "No validated ledger after 60s"
  fi
  sleep 1
done
# ---------------------------------------------------------------------------
# Step 4: Run workload orchestrator
# ---------------------------------------------------------------------------
log "Step 4: Running workload orchestrator (profile: $WORKLOAD_PROFILE)..."
# Collect the --endpoints flag and one WebSocket URL per node in an array so
# each URL reaches the orchestrator as its own argument.
endpoint_args=(--endpoints)
for ((node_index = 0; node_index < NUM_NODES; node_index++)); do
  endpoint_args+=("ws://localhost:$((WS_PORT_BASE + node_index))")
done
# A failing orchestrator is reported but does not stop the pipeline.
if ! python3 "$SCRIPT_DIR/workload_orchestrator.py" \
  --profile "$WORKLOAD_PROFILE" \
  "${endpoint_args[@]}" \
  --report "$REPORT_DIR/workload-report.json" \
  --report-dir "$REPORT_DIR"; then
  warn "Workload orchestrator returned non-zero exit"
fi
ok "Workload orchestration complete."
# ---------------------------------------------------------------------------
# Step 5: Run telemetry validation suite
# ---------------------------------------------------------------------------
log "Step 5: Running telemetry validation suite..."
validation_args=(--report "$REPORT_DIR/validation-report.json")
if [ "$SKIP_LOKI" = true ]; then
  validation_args+=(--skip-loki)
fi
# VALIDATION_EXIT keeps the suite's exit status for the final exit code; a
# failure here is recorded, not fatal, so the later steps still run.
if python3 "$SCRIPT_DIR/validate_telemetry.py" "${validation_args[@]}"; then
  VALIDATION_EXIT=0
  ok "All telemetry validation checks passed!"
else
  VALIDATION_EXIT=$?
  fail "Some telemetry validation checks failed (exit $VALIDATION_EXIT)"
fi
# ---------------------------------------------------------------------------
# Step 6: Capture OTel timings and run the regression comparison
# ---------------------------------------------------------------------------
# This step ALWAYS captures timings (so CI always has an artifact from which
# to bootstrap/refresh the committed baseline). The comparator then either:
# - prints the paste-me JSON when the baseline is a placeholder, or
# - enforces thresholds and fails the run on regression.
# Use --skip-regression to opt out (e.g. for ad-hoc local exploration).
TIMINGS_FILE="$REPORT_DIR/timings.json"
REGRESSION_REPORT="$REPORT_DIR/regression-report.json"
REGRESSION_EXIT=0

# Phase 1: capture timings, unless the gate was skipped on the command line.
# A capture failure records exit status 2 and turns the comparison off.
case "$SKIP_REGRESSION" in
  true) ;;
  *)
    log "Step 6: Capturing OTel timings from Prometheus..."
    capture_args=(
      --prometheus "http://localhost:9090"
      --metrics "$METRICS_FILE"
      --output "$TIMINGS_FILE"
      --window "$REGRESSION_WINDOW"
      --profile "$WORKLOAD_PROFILE"
    )
    if python3 "$SCRIPT_DIR/capture_timings.py" "${capture_args[@]}"; then
      ok "Timings captured: $TIMINGS_FILE"
    else
      fail "Failed to capture timings — skipping regression comparison."
      REGRESSION_EXIT=2
      SKIP_REGRESSION=true
    fi
    ;;
esac

# Phase 2: compare the captured timings against the baseline and keep the
# comparator's exit status for the final exit code.
if [ "$SKIP_REGRESSION" = true ]; then
  warn "Regression gate skipped."
else
  log "Comparing against baseline $BASELINE_FILE..."
  compare_args=(
    --timings "$TIMINGS_FILE"
    --baseline "$BASELINE_FILE"
    --thresholds "$THRESHOLDS_FILE"
    --report "$REGRESSION_REPORT"
  )
  if python3 "$SCRIPT_DIR/compare_to_baseline.py" "${compare_args[@]}"; then
    :
  else
    REGRESSION_EXIT=$?
  fi
  case "$REGRESSION_EXIT" in
    0) ok "Regression gate passed (or baseline placeholder — paste JSON printed above)." ;;
    1) fail "Regression detected — see $REGRESSION_REPORT" ;;
    *) fail "Regression comparator internal error (exit $REGRESSION_EXIT)" ;;
  esac
fi
# ---------------------------------------------------------------------------
# Step 7: (Optional) Run overhead benchmark
# ---------------------------------------------------------------------------
# Runs only with --with-benchmark; a failing benchmark is reported as a
# warning and does not affect the final exit code.
if [ "$WITH_BENCHMARK" = true ]; then
  log "Step 7: Running performance benchmark..."
  benchmark_args=(
    --xrpld "$XRPLD"
    --duration 120
    --nodes 3
    --output "$REPORT_DIR"
  )
  if ! bash "$SCRIPT_DIR/benchmark.sh" "${benchmark_args[@]}"; then
    warn "Benchmark returned non-zero exit"
  fi
fi
# ---------------------------------------------------------------------------
# Summary
# ---------------------------------------------------------------------------
# Human-readable summary: where the reports are, which services and nodes
# were left running, and how to tear everything down.
summary_rule="==========================================================="
printf '%s\n' \
  "" \
  "$summary_rule" \
  " FULL VALIDATION RESULTS" \
  "$summary_rule" \
  "" \
  " Reports directory: $REPORT_DIR" \
  ""
# Listing the reports is best-effort; a missing directory is not an error.
ls -la "$REPORT_DIR/" 2>/dev/null || true
printf '%s\n' \
  "" \
  " Observability stack is running:" \
  " Tempo: http://localhost:3200" \
  " Grafana: http://localhost:3000" \
  " Prometheus: http://localhost:9090" \
  "" \
  " xrpld nodes ($NUM_NODES) are running:"
for ((i = 1; i <= NUM_NODES; i++)); do
  rpc=$((RPC_PORT_BASE + i - 1))
  ws=$((WS_PORT_BASE + i - 1))
  # Fall back to "unknown" when the node's PID file cannot be read.
  pid=$(cat "$WORKDIR/node$i/xrpld.pid" 2>/dev/null || echo 'unknown')
  printf '%s\n' " Node $i: RPC=$rpc WS=$ws PID=$pid"
done
printf '%s\n' \
  "" \
  " To tear down:" \
  " $0 --cleanup" \
  "" \
  "$summary_rule"
# The run fails if EITHER the validation suite or the regression gate failed.
# A non-zero validation status takes precedence; otherwise the regression
# status (possibly 0) becomes the script's exit code. Both variables hold
# numeric exit statuses, so the integer test below is safe.
FINAL_EXIT="$VALIDATION_EXIT"
if [ "$FINAL_EXIT" -eq 0 ]; then
  FINAL_EXIT="$REGRESSION_EXIT"
fi
exit "$FINAL_EXIT"