mirror of
https://github.com/XRPLF/rippled.git
synced 2026-06-03 08:46:46 +00:00
- capture_timings.py: fail when captured/total ratio < 50% (--min-capture-ratio). Prevents silent pass on unreachable Prometheus. - run-full-validation.sh: set REGRESSION_EXIT=2 on capture failure so the final exit code reflects it. Update exit code docs in header. - compare_to_baseline.py: extract _skip_delta helper to bring compute_delta under 80 lines. Fix 0.0-as-falsy bug in abs_bound resolution (use explicit None check instead of `or`). Remove dead variable override_prefix_key. - prom_queries.py: extract _build_simple_entries and _build_job_entries to bring build_query_plan under 80 lines. Fix module docstring return type example. Use aiohttp.ClientTimeout instead of bare int. - telemetry-validation.yml: add set -euo pipefail to regression summary step; guard jq calls with -e flag and fallback; fail on missing baseline file; emit ::warning annotation when timings.json missing. - baselines/README.md: document the placeholder field.
464 lines
16 KiB
Bash
Executable File
464 lines
16 KiB
Bash
Executable File
#!/usr/bin/env bash
# run-full-validation.sh — Orchestrates the full telemetry validation pipeline.
#
# Sequence:
#   1. Start the observability stack (OTel Collector, Tempo, Prometheus, Loki, Grafana)
#   2. Start a multi-node rippled cluster with full telemetry enabled
#   3. Wait for consensus
#   4. Run workload orchestrator (RPC load, TX submission, propagation wait)
#   5. Run the telemetry validation suite
#   6. Capture OTel timings and compare against committed baseline
#   7. (Optional) Run the performance overhead benchmark
#
# Usage:
#   ./run-full-validation.sh --xrpld /path/to/xrpld
#   ./run-full-validation.sh --xrpld /path/to/xrpld --with-benchmark
#   ./run-full-validation.sh --xrpld /path/to/xrpld --skip-regression
#   ./run-full-validation.sh --cleanup
#
# Exit codes:
#   0 — All validation checks and the regression gate passed
#   1 — Validation checks failed OR the regression gate detected a regression
#   2 — Infrastructure error (cluster/stack failed to start, timing capture failed)

# Strict mode: abort on unhandled errors, on unset variables, and when any
# stage of a pipeline fails.
set -euo pipefail
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Colored output helpers
|
|
# ---------------------------------------------------------------------------
|
|
# Print an informational line on stdout, tagged with a bold blue [VALIDATE].
# All arguments are joined with single spaces into one message.
log() {
  local message="$*"
  printf '\033[1;34m%s\033[0m %s\n' '[VALIDATE]' "$message"
}
|
|
# Print a success line on stdout, tagged with a bold green [VALIDATE].
# All arguments are joined with single spaces into one message.
ok() {
  local message="$*"
  printf '\033[1;32m%s\033[0m %s\n' '[VALIDATE]' "$message"
}
|
|
# Print a warning line on stdout, tagged with a bold yellow [VALIDATE].
# All arguments are joined with single spaces into one message.
warn() {
  local message="$*"
  printf '\033[1;33m%s\033[0m %s\n' '[VALIDATE]' "$message"
}
|
|
# Print a failure line on stdout, tagged with a bold red [VALIDATE].
# Unlike die, this only reports: it does not exit.
fail() {
  local message="$*"
  printf '\033[1;31m%s\033[0m %s\n' '[VALIDATE]' "$message"
}
|
|
# Print a bold red [VALIDATE] error line on stderr and terminate with exit
# status 2.
die() {
  local message="$*"
  printf '\033[1;31m%s\033[0m %s\n' '[VALIDATE]' "$message" >&2
  exit 2
}
|
|
|
|
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Paths are resolved relative to this script's own location so the script can
# be invoked from any working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TELEMETRY_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
REPO_ROOT="$(cd "$TELEMETRY_DIR/../.." && pwd)"
COMPOSE_FILE="$TELEMETRY_DIR/docker-compose.workload.yaml"
# Fixed scratch directory; it is wiped at the start of every run and by
# --cleanup.
WORKDIR="/tmp/xrpld-validation"

# Binary under test; the environment or --xrpld overrides the default.
XRPLD="${XRPLD:-$REPO_ROOT/.build/xrpld}"
NUM_NODES=5
# Node i (1-based) uses <base> + i - 1 for each of the three port families.
RPC_PORT_BASE=5005
WS_PORT_BASE=6006
PEER_PORT_BASE=51235
# NOTE(review): RPC_RATE, RPC_DURATION, TX_TPS and TX_DURATION are settable via
# command-line options but are not read anywhere else in this script — confirm
# whether they should be forwarded to the workload orchestrator.
RPC_RATE=50
RPC_DURATION=120
TX_TPS=5
TX_DURATION=120
WITH_BENCHMARK=false
SKIP_LOKI=false
SKIP_REGRESSION=false
WORKLOAD_PROFILE="full-validation"
REPORT_DIR="$WORKDIR/reports"
# Rate window handed to Prometheus `rate()` when capturing timings. Keep
# this close to the active workload duration so histogram buckets cover
# the measurement window; longer windows dilute short-lived regressions.
REGRESSION_WINDOW="${REGRESSION_WINDOW:-3m}"
BASELINE_FILE="${BASELINE_FILE:-$SCRIPT_DIR/baselines/baseline-timings.json}"
THRESHOLDS_FILE="${THRESHOLDS_FILE:-$SCRIPT_DIR/regression-thresholds.json}"
METRICS_FILE="${METRICS_FILE:-$SCRIPT_DIR/regression-metrics.json}"

# NOTE(review): the genesis credentials below are not referenced elsewhere in
# this script — confirm whether a consumer outside this file relies on them.
GENESIS_ACCOUNT="rHb9CJAWyB4rj91VRWn96DkukG4bwdtyTh"
GENESIS_SEED="snoPBrXtMeMyMHUVTgbuqAfg1SUTb"
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Argument parsing
|
|
# ---------------------------------------------------------------------------
|
|
# Print the command-line help on stdout and exit with status 0.
# The text is emitted from a single here-document; $0 expands to the name the
# script was invoked with.
usage() {
  cat <<EOF
Usage: $0 [OPTIONS]

Options:
 --xrpld PATH Path to xrpld binary
 --nodes NUM Number of validator nodes (default: 5)
 --rpc-rate RPS RPC load rate (default: 50)
 --rpc-duration SECS RPC load duration (default: 120)
 --tx-tps TPS Transaction submit rate (default: 5)
 --tx-duration SECS Transaction submit duration (default: 120)
 --profile NAME Workload profile (default: full-validation)
 --with-benchmark Also run performance overhead benchmark (telemetry off vs on)
 --skip-loki Skip Loki log-trace correlation checks
 --skip-regression Skip the OTel-baseline regression gate
 --cleanup Tear down everything and exit
 -h, --help Show this help
EOF
  exit 0
}
|
|
|
|
# Abort with a readable message when a value-taking option is the last word
# on the command line. Invoke as `_require_value "$@"` so that $1 is the
# option and $2 (if present) is its value. Without this guard, `set -u` turns
# the missing "$2" into a bare "unbound variable" error.
_require_value() {
  [ "$#" -ge 2 ] || die "Option $1 requires a value"
}

# Consume options left to right; each arm shifts away what it used.
while [ "$#" -gt 0 ]; do
  case "$1" in
    --xrpld)
      _require_value "$@"
      XRPLD="$2"
      shift 2
      ;;
    --nodes)
      _require_value "$@"
      NUM_NODES="$2"
      shift 2
      ;;
    --rpc-rate)
      _require_value "$@"
      RPC_RATE="$2"
      shift 2
      ;;
    --rpc-duration)
      _require_value "$@"
      RPC_DURATION="$2"
      shift 2
      ;;
    --tx-tps)
      _require_value "$@"
      TX_TPS="$2"
      shift 2
      ;;
    --tx-duration)
      _require_value "$@"
      TX_DURATION="$2"
      shift 2
      ;;
    --profile)
      _require_value "$@"
      WORKLOAD_PROFILE="$2"
      shift 2
      ;;
    --with-benchmark)
      WITH_BENCHMARK=true
      shift
      ;;
    --skip-loki)
      SKIP_LOKI=true
      shift
      ;;
    --skip-regression)
      SKIP_REGRESSION=true
      shift
      ;;
    --cleanup)
      # Cleanup mode: stop processes whose command line mentions the work
      # directory, bring the compose stack down, delete the work directory,
      # then exit immediately. Each teardown step is best-effort.
      log "Cleaning up..."
      pkill -f "$WORKDIR" 2>/dev/null || true
      docker compose -f "$COMPOSE_FILE" down 2>/dev/null || true
      # ${WORKDIR:?} refuses to run rm -rf with an empty path.
      rm -rf -- "${WORKDIR:?}"
      ok "Cleanup complete."
      exit 0
      ;;
    -h | --help)
      usage
      ;;
    *)
      die "Unknown option: $1"
      ;;
  esac
done

# NUM_NODES feeds loops and port arithmetic later on, so reject anything that
# is not a positive integer up front.
case "$NUM_NODES" in
  '' | *[!0-9]* | 0) die "--nodes expects a positive integer, got: $NUM_NODES" ;;
esac
|
|
|
|
# ---------------------------------------------------------------------------
# Prerequisites
# ---------------------------------------------------------------------------
# Every missing prerequisite is fatal (die exits with status 2).
log "Checking prerequisites..."
[ -x "$XRPLD" ] || die "xrpld binary not found: $XRPLD"
command -v docker >/dev/null 2>&1 || die "docker not found"
docker compose version >/dev/null 2>&1 || die "docker compose (v2) not found"
for required_tool in python3 curl jq; do
  command -v "$required_tool" >/dev/null 2>&1 || die "$required_tool not found"
done
[ -f "$COMPOSE_FILE" ] || die "docker-compose.workload.yaml not found"

# Install Python dependencies. Best-effort: try pip3, then pip, and only warn
# if both fail, since the packages may already be installed.
log "Installing Python dependencies..."
if ! pip3 install -q -r "$SCRIPT_DIR/requirements.txt" 2>/dev/null &&
  ! pip install -q -r "$SCRIPT_DIR/requirements.txt" 2>/dev/null; then
  warn "Could not install Python dependencies — they may already be present"
fi

ok "Prerequisites verified."
|
|
|
|
# ---------------------------------------------------------------------------
# Cleanup previous run
# ---------------------------------------------------------------------------
log "Cleaning up previous run..."
# Best-effort: stop leftover processes whose command line mentions the work
# directory; pkill returning non-zero (nothing matched) is not an error.
pkill -f "$WORKDIR" 2>/dev/null || true
# Short pause before the directory is removed, so signalled processes can exit.
sleep 2
# ${WORKDIR:?} refuses to run rm -rf with an empty path.
rm -rf -- "${WORKDIR:?}"
mkdir -p "$WORKDIR" "$REPORT_DIR"
|
|
|
|
# ---------------------------------------------------------------------------
# Step 1: Start observability stack
# ---------------------------------------------------------------------------
log "Step 1: Starting observability stack..."
docker compose -f "$COMPOSE_FILE" up -d

# Succeed when the URL in $1 answers with any HTTP status at all.
# curl still writes "000" for %{http_code} when no response was received, so
# the fallback must NOT append a second "000" to the captured output (that
# would give "000000", which compares unequal to "000" and looks like success).
_responds_at_all() {
  local http_code
  http_code=$(curl -so /dev/null -w '%{http_code}' "$1" 2>/dev/null) || true
  [ -n "$http_code" ] && [ "$http_code" != "000" ]
}

# Succeed when the URL in $1 answers without an HTTP error (curl -f).
_responds_ok() {
  curl -sf "$1" >/dev/null 2>&1
}

# Poll a service once per second, for at most 30 attempts.
#   $1 - display name used in log messages
#   $2 - probe function to call with the URL
#   $3 - URL to probe
# Returns 0 as soon as the probe succeeds; calls die (exit 2) after the 30th
# failed attempt.
_await_service() {
  local name=$1 probe=$2 url=$3
  local attempt
  log "Waiting for $name..."
  for ((attempt = 1; attempt <= 30; attempt++)); do
    if "$probe" "$url"; then
      ok "$name ready (attempt $attempt)"
      return 0
    fi
    if [ "$attempt" -eq 30 ]; then
      die "$name not ready after 30s"
    fi
    sleep 1
  done
}

_await_service "OTel Collector" _responds_at_all "http://localhost:4318/"
_await_service "Tempo" _responds_ok "http://localhost:3200/ready"
_await_service "Prometheus" _responds_ok "http://localhost:9090/-/healthy"
|
|
|
|
# ---------------------------------------------------------------------------
# Step 2: Generate validator keys and start cluster
# ---------------------------------------------------------------------------
log "Step 2: Starting $NUM_NODES-node validator cluster..."

bash "$SCRIPT_DIR/generate-validator-keys.sh" "$XRPLD" "$NUM_NODES" "$WORKDIR"

# For every node: create its directories, derive its ports, write its config
# file, and launch it in the background with its PID recorded on disk.
for ((i = 1; i <= NUM_NODES; i++)); do
  NODE_DIR="$WORKDIR/node$i"
  mkdir -p "$NODE_DIR/nudb" "$NODE_DIR/db"

  RPC_PORT=$((RPC_PORT_BASE + i - 1))
  WS_PORT=$((WS_PORT_BASE + i - 1))
  PEER_PORT=$((PEER_PORT_BASE + i - 1))
  # validator-keys.json is read as a JSON array indexed from 0.
  SEED=$(jq -r ".[$((i - 1))].seed" "$WORKDIR/validator-keys.json")
  # jq -r prints the literal string "null" for a missing entry; stop here
  # instead of writing that into the node's config.
  if [ -z "$SEED" ] || [ "$SEED" = "null" ]; then
    die "No validator seed for node $i in $WORKDIR/validator-keys.json"
  fi

  # Build the peer list: one "127.0.0.1 <peer port>" line for every OTHER node.
  # NOTE(review): the variable is called IPS_FIXED but it is written into the
  # [ips] section below — confirm which section is intended.
  IPS_FIXED=""
  for ((j = 1; j <= NUM_NODES; j++)); do
    if [ "$j" -ne "$i" ]; then
      IPS_FIXED+="127.0.0.1 $((PEER_PORT_BASE + j - 1))"$'\n'
    fi
  done

  # Unquoted delimiter: the shell expands the variables inside the document.
  cat > "$NODE_DIR/xrpld.cfg" <<EOCFG
[server]
port_rpc
port_ws
port_peer

[port_rpc]
port = $RPC_PORT
ip = 127.0.0.1
admin = 127.0.0.1
protocol = http

[port_ws]
port = $WS_PORT
ip = 127.0.0.1
admin = 127.0.0.1
protocol = ws

[port_peer]
port = $PEER_PORT
ip = 0.0.0.0
protocol = peer

[node_db]
type=NuDB
path=$NODE_DIR/nudb
online_delete=256

[database_path]
$NODE_DIR/db

[debug_logfile]
$NODE_DIR/debug.log

[validation_seed]
$SEED

[validators_file]
$WORKDIR/validators.txt

[ips]
${IPS_FIXED}

[telemetry]
enabled=1
service_instance_id=validator-${i}
endpoint=http://localhost:4318/v1/traces
exporter=otlp_http
sampling_ratio=1.0
batch_size=512
batch_delay_ms=2000
max_queue_size=2048
trace_rpc=1
trace_transactions=1
trace_consensus=1
trace_peer=1
trace_ledger=1

[insight]
server=statsd
address=127.0.0.1:8125
prefix=rippled

[rpc_startup]
{ "command": "log_level", "severity": "warning" }

[signing_support]
true

[ssl_verify]
0
EOCFG

  # Launch in the background; stdout and stderr go to the node's stdout.log.
  "$XRPLD" --conf "$NODE_DIR/xrpld.cfg" --start > "$NODE_DIR/stdout.log" 2>&1 &
  node_pid=$!
  echo "$node_pid" > "$NODE_DIR/xrpld.pid"
  log " Node $i: RPC=$RPC_PORT WS=$WS_PORT Peer=$PEER_PORT PID=$node_pid"
done
|
|
|
|
# ---------------------------------------------------------------------------
# Step 3: Wait for consensus
# ---------------------------------------------------------------------------
# Poll every node's server_info once per second, for up to 120 rounds, until
# all of them report server_state "proposing". Running out of rounds only
# produces a warning; the script carries on regardless.
log "Step 3: Waiting for consensus..."
for ((attempt = 1; attempt <= 120; attempt++)); do
  ready=0
  for ((i = 1; i <= NUM_NODES; i++)); do
    port=$((RPC_PORT_BASE + i - 1))
    # Any curl or jq failure collapses to an empty state string.
    state=$(curl -sf "http://localhost:$port" \
      -d '{"method":"server_info"}' 2>/dev/null \
      | jq -r '.result.info.server_state' 2>/dev/null || echo "")
    if [ "$state" = "proposing" ]; then
      ready=$((ready + 1))
    fi
  done
  if [ "$ready" -ge "$NUM_NODES" ]; then
    ok "All $NUM_NODES nodes proposing (attempt $attempt)"
    break
  fi
  if [ "$attempt" -eq 120 ]; then
    warn "Consensus timeout — $ready/$NUM_NODES nodes ready"
  fi
  # \r rewrites the same terminal line on every round.
  printf "\r %d/%d nodes proposing..." "$ready" "$NUM_NODES"
  sleep 1
done
echo ""

# Wait for first validated ledger. Only the first node is queried, once per
# second for up to 60 rounds; running out of rounds is a warning, not an error.
log "Waiting for validated ledger..."
for ((attempt = 1; attempt <= 60; attempt++)); do
  # A missing field or any curl/jq failure is treated as sequence 0.
  val_seq=$(curl -sf "http://localhost:$RPC_PORT_BASE" \
    -d '{"method":"server_info"}' 2>/dev/null \
    | jq -r '.result.info.validated_ledger.seq // 0' 2>/dev/null || echo 0)
  # stderr is silenced so a non-numeric value just fails the test quietly.
  if [ "$val_seq" -gt 2 ] 2>/dev/null; then
    ok "Validated ledger: seq $val_seq"
    break
  fi
  if [ "$attempt" -eq 60 ]; then
    warn "No validated ledger after 60s"
  fi
  sleep 1
done
|
|
|
|
# ---------------------------------------------------------------------------
# Step 4: Run workload orchestrator
# ---------------------------------------------------------------------------
log "Step 4: Running workload orchestrator (profile: $WORKLOAD_PROFILE)..."

# One WebSocket URL per node, kept in an array so each URL reaches the
# orchestrator as exactly one argument without relying on word splitting.
WS_ENDPOINTS=()
for ((i = 1; i <= NUM_NODES; i++)); do
  WS_ENDPOINTS+=("ws://localhost:$((WS_PORT_BASE + i - 1))")
done

# A non-zero exit from the orchestrator is reported but does not stop the run.
python3 "$SCRIPT_DIR/workload_orchestrator.py" \
  --profile "$WORKLOAD_PROFILE" \
  --endpoints "${WS_ENDPOINTS[@]}" \
  --report "$REPORT_DIR/workload-report.json" \
  --report-dir "$REPORT_DIR" ||
  warn "Workload orchestrator returned non-zero exit"

ok "Workload orchestration complete."
|
|
|
|
# ---------------------------------------------------------------------------
# Step 5: Run telemetry validation suite
# ---------------------------------------------------------------------------
log "Step 5: Running telemetry validation suite..."

# Arguments are collected in an array so the report path is passed as a single
# word even if it ever contains whitespace.
VALIDATION_ARGS=(--report "$REPORT_DIR/validation-report.json")
if [ "$SKIP_LOKI" = true ]; then
  VALIDATION_ARGS+=(--skip-loki)
fi

# Capture the suite's exit status without tripping `set -e`; it is used for
# the final exit code at the end of the script.
VALIDATION_EXIT=0
python3 "$SCRIPT_DIR/validate_telemetry.py" "${VALIDATION_ARGS[@]}" || VALIDATION_EXIT=$?

if [ "$VALIDATION_EXIT" -eq 0 ]; then
  ok "All telemetry validation checks passed!"
else
  fail "Some telemetry validation checks failed (exit $VALIDATION_EXIT)"
fi
|
|
|
|
# ---------------------------------------------------------------------------
# Step 6: Capture OTel timings and run the regression comparison
# ---------------------------------------------------------------------------
# Unless --skip-regression is given, this step captures timings (so CI has an
# artifact from which to bootstrap/refresh the committed baseline). The
# comparator then either:
#   - prints the paste-me JSON when the baseline is a placeholder, or
#   - enforces thresholds and fails the run on regression.
# Use --skip-regression to opt out of both capture and comparison (e.g. for
# ad-hoc local exploration).
TIMINGS_FILE="$REPORT_DIR/timings.json"
REGRESSION_REPORT="$REPORT_DIR/regression-report.json"
REGRESSION_EXIT=0

if [ "$SKIP_REGRESSION" != true ]; then
  log "Step 6: Capturing OTel timings from Prometheus..."
  capture_cmd=(
    python3 "$SCRIPT_DIR/capture_timings.py"
    --prometheus "http://localhost:9090"
    --metrics "$METRICS_FILE"
    --output "$TIMINGS_FILE"
    --window "$REGRESSION_WINDOW"
    --profile "$WORKLOAD_PROFILE"
  )
  if "${capture_cmd[@]}"; then
    ok "Timings captured: $TIMINGS_FILE"
  else
    fail "Failed to capture timings — skipping regression comparison."
    # A failed capture is recorded as status 2 and disables the comparison
    # below, since there is nothing to compare.
    REGRESSION_EXIT=2
    SKIP_REGRESSION=true
  fi
fi

if [ "$SKIP_REGRESSION" != true ]; then
  log "Comparing against baseline $BASELINE_FILE..."
  python3 "$SCRIPT_DIR/compare_to_baseline.py" \
    --timings "$TIMINGS_FILE" \
    --baseline "$BASELINE_FILE" \
    --thresholds "$THRESHOLDS_FILE" \
    --report "$REGRESSION_REPORT" || REGRESSION_EXIT=$?
  case "$REGRESSION_EXIT" in
    0) ok "Regression gate passed (or baseline placeholder — paste JSON printed above)." ;;
    1) fail "Regression detected — see $REGRESSION_REPORT" ;;
    *) fail "Regression comparator internal error (exit $REGRESSION_EXIT)" ;;
  esac
else
  warn "Regression gate skipped."
fi
|
|
|
|
# ---------------------------------------------------------------------------
# Step 7: (Optional) Run overhead benchmark
# ---------------------------------------------------------------------------
# Runs only with --with-benchmark. The benchmark uses its own fixed duration
# and node count rather than this script's settings, and a non-zero exit from
# it is reported as a warning only.
if [ "$WITH_BENCHMARK" = true ]; then
  log "Step 7: Running performance benchmark..."
  benchmark_cmd=(
    bash "$SCRIPT_DIR/benchmark.sh"
    --xrpld "$XRPLD"
    --duration 120
    --nodes 3
    --output "$REPORT_DIR"
  )
  "${benchmark_cmd[@]}" || warn "Benchmark returned non-zero exit"
fi
|
|
|
|
# ---------------------------------------------------------------------------
# Summary
# ---------------------------------------------------------------------------
# Print where the reports are, which services were started, and how to tear
# everything down. Nothing is stopped here: the stack and the nodes are left
# running until the script is invoked with --cleanup.
cat <<EOF

===========================================================
 FULL VALIDATION RESULTS
===========================================================

 Reports directory: $REPORT_DIR

EOF
# Listing the reports is informational only; a failure here is ignored.
ls -la "$REPORT_DIR/" 2>/dev/null || true
cat <<EOF

 Observability stack is running:
 Tempo: http://localhost:3200
 Grafana: http://localhost:3000
 Prometheus: http://localhost:9090

 xrpld nodes ($NUM_NODES) are running:
EOF
for ((i = 1; i <= NUM_NODES; i++)); do
  rpc=$((RPC_PORT_BASE + i - 1))
  ws=$((WS_PORT_BASE + i - 1))
  # Fall back to the word "unknown" when the PID file cannot be read.
  pid=$(cat "$WORKDIR/node$i/xrpld.pid" 2>/dev/null || echo 'unknown')
  echo " Node $i: RPC=$rpc WS=$ws PID=$pid"
done
cat <<EOF

 To tear down:
 $0 --cleanup

===========================================================
EOF
|
|
|
|
# Fail the run if EITHER validation or the regression gate failed. The first
# non-zero status wins, checked in this order: validation suite, then
# regression gate. A validation failure therefore takes precedence over a
# regression-gate status, and the script exits 0 only when both are 0.
if [ "$VALIDATION_EXIT" -ne 0 ]; then
  exit "$VALIDATION_EXIT"
fi
exit "$REGRESSION_EXIT"
|