mirror of
https://github.com/XRPLF/rippled.git
synced 2026-04-29 15:37:57 +00:00
Add comprehensive workload harness for end-to-end validation of the
Phases 1-9 telemetry stack:
Task 10.1 — Multi-node test harness:
- docker-compose.workload.yaml with full OTel stack (Collector, Jaeger,
Tempo, Prometheus, Loki, Grafana)
- generate-validator-keys.sh for automated key generation
- xrpld-validator.cfg.template for node configuration
Task 10.2 — RPC load generator:
- rpc_load_generator.py with WebSocket client, configurable rates,
realistic command distribution (40% health, 30% wallet, 15% explorer,
10% tx lookups, 5% DEX), W3C traceparent injection
Task 10.3 — Transaction submitter:
- tx_submitter.py with 10 transaction types (Payment, OfferCreate,
OfferCancel, TrustSet, NFTokenMint, NFTokenCreateOffer, EscrowCreate,
EscrowFinish, AMMCreate, AMMDeposit), auto-funded test accounts
Task 10.4 — Telemetry validation suite:
- validate_telemetry.py checking spans (Jaeger), metrics (Prometheus),
log-trace correlation (Loki), dashboards (Grafana)
- expected_spans.json (17 span types, 22 attributes, 3 hierarchies)
- expected_metrics.json (SpanMetrics, StatsD, Phase 9, dashboards)
Task 10.5 — Performance benchmark suite:
- benchmark.sh for baseline vs telemetry comparison
- collect_system_metrics.sh for CPU/memory/latency sampling
- Thresholds: <3% CPU, <5MB memory, <2ms RPC p99, <5% TPS, <1% consensus
Task 10.6 — CI integration:
- telemetry-validation.yml GitHub Actions workflow
- run-full-validation.sh orchestrator script
- Manual trigger + telemetry branch auto-trigger
Task 10.7 — Documentation:
- workload/README.md with quick start and tool reference
- Updated telemetry-runbook.md with validation and benchmark sections
- Updated 09-data-collection-reference.md with validation inventory
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
414 lines
13 KiB
Bash
Executable File
414 lines
13 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
# run-full-validation.sh — Orchestrates the full telemetry validation pipeline.
#
# Sequence:
#   1. Start the observability stack (OTel Collector, Jaeger, Tempo, Prometheus, Loki, Grafana)
#   2. Start a multi-node rippled cluster with full telemetry enabled
#   3. Wait for consensus
#   4. Run the RPC load generator
#   5. Run the transaction submitter
#   6. Wait for telemetry data to propagate
#   7. Run the telemetry validation suite
#   8. (Optional) Run the performance benchmark
#
# Usage:
#   ./run-full-validation.sh --xrpld /path/to/xrpld
#   ./run-full-validation.sh --xrpld /path/to/xrpld --with-benchmark
#   ./run-full-validation.sh --cleanup
#
# Exit codes:
#   0 — All validation checks passed
#   1 — One or more validation checks failed
#   2 — Infrastructure error (cluster/stack failed to start)
|
|
|
|
set -euo pipefail
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Colored output helpers
|
|
# ---------------------------------------------------------------------------
|
|
# Print an informational message on stdout, prefixed with a bold blue
# [VALIDATE] tag. All arguments are joined into one message.
log() {
  printf '\033[1;34m[VALIDATE]\033[0m %s\n' "$*"
}
|
|
# Print a success message on stdout, prefixed with a bold green
# [VALIDATE] tag. All arguments are joined into one message.
ok() {
  printf '\033[1;32m[VALIDATE]\033[0m %s\n' "$*"
}
|
|
# Print a warning on stdout, prefixed with a bold yellow [VALIDATE] tag.
# Does not alter control flow; callers continue after the warning.
warn() {
  printf '\033[1;33m[VALIDATE]\033[0m %s\n' "$*"
}
|
|
# Print a failure message on stdout, prefixed with a bold red [VALIDATE]
# tag. Unlike die(), this only reports; it does not exit.
fail() {
  printf '\033[1;31m[VALIDATE]\033[0m %s\n' "$*"
}
|
|
# Print a bold red [VALIDATE] message on stderr and terminate the script
# with exit status 2.
die() {
  printf '\033[1;31m[VALIDATE]\033[0m %s\n' "$*" >&2
  exit 2
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Configuration
|
|
# ---------------------------------------------------------------------------
|
|
# Directory layout, resolved from this script's own location so the script
# can be invoked from any working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TELEMETRY_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
REPO_ROOT="$(cd "$TELEMETRY_DIR/../.." && pwd)"
COMPOSE_FILE="$TELEMETRY_DIR/docker-compose.workload.yaml"

# Scratch directory for node data, PID files and reports. The path is fixed
# (not mktemp) because --cleanup and the pre-run cleanup locate and remove a
# previous run by this exact path.
WORKDIR="/tmp/xrpld-validation"

# Binary under test; the XRPLD environment variable or --xrpld overrides it.
XRPLD="${XRPLD:-$REPO_ROOT/.build/xrpld}"

# Cluster size and port layout: node i (1-based) uses <BASE> + i - 1.
NUM_NODES=5
RPC_PORT_BASE=5005
WS_PORT_BASE=6006
PEER_PORT_BASE=51235

# Workload defaults; each has a matching command-line option.
RPC_RATE=50
RPC_DURATION=120
TX_TPS=5
TX_DURATION=120
WITH_BENCHMARK=false
SKIP_LOKI=false
REPORT_DIR="$WORKDIR/reports"

# NOTE(review): these two values are not referenced anywhere else in this
# script and are not exported — confirm whether they are still needed.
GENESIS_ACCOUNT="rHb9CJAWyB4rj91VRWn96DkukG4bwdtyTh"
GENESIS_SEED="snoPBrXtMeMyMHUVTgbuqAfg1SUTb"
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Argument parsing
|
|
# ---------------------------------------------------------------------------
|
|
# Print the command-line help on stdout and exit with status 0.
# The text is emitted from a single heredoc; $0 expands to the name the
# script was invoked as.
usage() {
  cat <<EOF
Usage: $0 [OPTIONS]

Options:
 --xrpld PATH Path to xrpld binary
 --nodes NUM Number of validator nodes (default: 5)
 --rpc-rate RPS RPC load rate (default: 50)
 --rpc-duration SECS RPC load duration (default: 120)
 --tx-tps TPS Transaction submit rate (default: 5)
 --tx-duration SECS Transaction submit duration (default: 120)
 --with-benchmark Also run performance benchmarks
 --skip-loki Skip Loki log-trace correlation checks
 --cleanup Tear down everything and exit
 -h, --help Show this help
EOF
  exit 0
}
|
|
|
|
# Parse command-line options into the global configuration variables
# (XRPLD, NUM_NODES, RPC_RATE, RPC_DURATION, TX_TPS, TX_DURATION,
# WITH_BENCHMARK, SKIP_LOKI).
#
# Arguments: the script's positional parameters.
# Exits:     0 after --cleanup or --help;
#            2 (via die) on an unknown option or an option missing its value.
parse_args() {
  while [ $# -gt 0 ]; do
    case "$1" in
      --xrpld|--nodes|--rpc-rate|--rpc-duration|--tx-tps|--tx-duration)
        # Without this check a trailing value-less option would abort on an
        # unbound "$2" under `set -u`, with no hint about which option.
        [ $# -ge 2 ] || die "Option $1 requires a value"
        case "$1" in
          --xrpld)        XRPLD="$2" ;;
          --nodes)        NUM_NODES="$2" ;;
          --rpc-rate)     RPC_RATE="$2" ;;
          --rpc-duration) RPC_DURATION="$2" ;;
          --tx-tps)       TX_TPS="$2" ;;
          --tx-duration)  TX_DURATION="$2" ;;
        esac
        shift 2
        ;;
      --with-benchmark)
        WITH_BENCHMARK=true
        shift
        ;;
      --skip-loki)
        SKIP_LOKI=true
        shift
        ;;
      --cleanup)
        # Cleanup mode: stop every process whose command line mentions the
        # work directory, bring the compose stack down, delete the work
        # directory, then exit. Each teardown step is best-effort.
        log "Cleaning up..."
        pkill -f "$WORKDIR" 2>/dev/null || true
        docker compose -f "$COMPOSE_FILE" down 2>/dev/null || true
        # ${WORKDIR:?} refuses to run rm -rf with an empty path.
        rm -rf -- "${WORKDIR:?}"
        ok "Cleanup complete."
        exit 0
        ;;
      -h|--help)
        usage
        ;;
      *)
        die "Unknown option: $1"
        ;;
    esac
  done
}

parse_args "$@"
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Prerequisites
|
|
# ---------------------------------------------------------------------------
|
|
# Verify that the binary under test and every external tool used later in
# this script are available; any missing item aborts with exit status 2.
log "Checking prerequisites..."
[ -x "$XRPLD" ] || die "xrpld binary not found: $XRPLD"
command -v docker >/dev/null 2>&1 || die "docker not found"
docker compose version >/dev/null 2>&1 || die "docker compose (v2) not found"
for required_tool in python3 curl jq; do
  command -v "$required_tool" >/dev/null 2>&1 || die "$required_tool not found"
done
[ -f "$COMPOSE_FILE" ] || die "docker-compose.workload.yaml not found"

# Install Python dependencies. Best-effort: try pip3, then pip; if both
# fail only warn, because the packages may already be installed.
log "Installing Python dependencies..."
requirements_file="$SCRIPT_DIR/requirements.txt"
if ! pip3 install -q -r "$requirements_file" 2>/dev/null \
  && ! pip install -q -r "$requirements_file" 2>/dev/null; then
  warn "Could not install Python dependencies — they may already be present"
fi

ok "Prerequisites verified."
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Cleanup previous run
|
|
# ---------------------------------------------------------------------------
|
|
# Remove any leftovers from an earlier run so this one starts from a clean
# work directory.
log "Cleaning up previous run..."
# Best-effort: pkill returns non-zero when nothing matched, which is fine.
pkill -f "$WORKDIR" 2>/dev/null || true
# Give the signalled processes a moment to exit before deleting their files.
sleep 2
# ${WORKDIR:?} refuses to run rm -rf with an empty path.
rm -rf -- "${WORKDIR:?}"
mkdir -p "$WORKDIR" "$REPORT_DIR"
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Step 1: Start observability stack
|
|
# ---------------------------------------------------------------------------
|
|
# wait_for_endpoint NAME URL [MODE]
#
# Poll URL once per second, for at most 30 attempts, until it answers.
#   MODE=healthy (default): ready when `curl -sf` succeeds (HTTP status < 400).
#   MODE=any:               ready as soon as any HTTP response is received,
#                           whatever its status code.
# Exits the script with status 2 (via die) if the endpoint never answers.
wait_for_endpoint() {
  local name=$1 url=$2 mode=${3:-healthy}
  local max_attempts=30 attempt status is_ready

  log "Waiting for $name..."
  for ((attempt = 1; attempt <= max_attempts; attempt++)); do
    is_ready=false
    if [ "$mode" = "any" ]; then
      # curl prints the -w write-out even when the connection fails
      # (http_code is then "000"). Do not append a fallback value inside the
      # substitution: "000" followed by another "000" would no longer compare
      # equal to "000" and a dead endpoint would be reported as ready.
      status=$(curl -s -o /dev/null -w '%{http_code}' "$url" 2>/dev/null) || true
      if [ -n "$status" ] && [ "$status" != "000" ]; then
        is_ready=true
      fi
    elif curl -sf "$url" >/dev/null 2>&1; then
      is_ready=true
    fi

    if [ "$is_ready" = true ]; then
      ok "$name ready (attempt $attempt)"
      return 0
    fi
    sleep 1
  done
  die "$name not ready after ${max_attempts}s"
}

log "Step 1: Starting observability stack..."
# A compose failure is an infrastructure error, so report it through die
# (exit status 2) rather than leaking docker's own exit status.
docker compose -f "$COMPOSE_FILE" up -d || die "Failed to start observability stack"

# The collector's OTLP/HTTP port is probed in "any" mode: any HTTP answer on
# the root path counts as ready.
wait_for_endpoint "OTel Collector" "http://localhost:4318/" any
wait_for_endpoint "Jaeger" "http://localhost:16686/"
wait_for_endpoint "Prometheus" "http://localhost:9090/-/healthy"
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Step 2: Generate validator keys and start cluster
|
|
# ---------------------------------------------------------------------------
|
|
log "Step 2: Starting $NUM_NODES-node validator cluster..."

# The helper script is given the binary, the node count and the work
# directory; the loop below reads $WORKDIR/validator-keys.json and every node
# config points at $WORKDIR/validators.txt, so both are expected to exist
# once it returns. A failure here is an infrastructure error (exit status 2).
bash "$SCRIPT_DIR/generate-validator-keys.sh" "$XRPLD" "$NUM_NODES" "$WORKDIR" \
  || die "Validator key generation failed"

for ((i = 1; i <= NUM_NODES; i++)); do
  NODE_DIR="$WORKDIR/node$i"
  mkdir -p "$NODE_DIR/nudb" "$NODE_DIR/db"

  # Node i uses <BASE> + i - 1 for each of its three ports.
  RPC_PORT=$((RPC_PORT_BASE + i - 1))
  WS_PORT=$((WS_PORT_BASE + i - 1))
  PEER_PORT=$((PEER_PORT_BASE + i - 1))

  # Entry i-1 of the JSON array holds this node's validation seed. jq prints
  # the string "null" for a missing entry; refuse to write that into a config.
  SEED=$(jq -r ".[$((i - 1))].seed" "$WORKDIR/validator-keys.json") \
    || die "Cannot read validator seed for node $i"
  if [ -z "$SEED" ] || [ "$SEED" = "null" ]; then
    die "No validator seed for node $i in $WORKDIR/validator-keys.json"
  fi

  # Build ips_fixed: one "host port" line per other node, each terminated by
  # a newline (so the list itself supplies the line break before the next
  # config section).
  IPS_FIXED=""
  for ((j = 1; j <= NUM_NODES; j++)); do
    if [ "$j" -ne "$i" ]; then
      IPS_FIXED+="127.0.0.1 $((PEER_PORT_BASE + j - 1))"$'\n'
    fi
  done

  # Unquoted heredoc delimiter: shell variables are expanded into the config.
  cat > "$NODE_DIR/xrpld.cfg" <<EOCFG
[server]
port_rpc
port_ws
port_peer

[port_rpc]
port = $RPC_PORT
ip = 127.0.0.1
admin = 127.0.0.1
protocol = http

[port_ws]
port = $WS_PORT
ip = 127.0.0.1
admin = 127.0.0.1
protocol = ws

[port_peer]
port = $PEER_PORT
ip = 0.0.0.0
protocol = peer

[node_db]
type=NuDB
path=$NODE_DIR/nudb
online_delete=256

[database_path]
$NODE_DIR/db

[debug_logfile]
$NODE_DIR/debug.log

[validation_seed]
$SEED

[validators_file]
$WORKDIR/validators.txt

[ips_fixed]
${IPS_FIXED}
[peer_private]
1

[telemetry]
enabled=1
service_instance_id=validator-${i}
endpoint=http://localhost:4318/v1/traces
exporter=otlp_http
sampling_ratio=1.0
batch_size=512
batch_delay_ms=2000
max_queue_size=2048
trace_rpc=1
trace_transactions=1
trace_consensus=1
trace_peer=1
trace_ledger=1

[insight]
server=statsd
address=127.0.0.1:8125
prefix=rippled

[rpc_startup]
{ "command": "log_level", "severity": "warning" }

[ssl_verify]
0
EOCFG

  # Launch the node in the background. Its command line contains $WORKDIR,
  # which is what the `pkill -f "$WORKDIR"` cleanup matches on.
  "$XRPLD" --conf "$NODE_DIR/xrpld.cfg" --start > "$NODE_DIR/stdout.log" 2>&1 &
  printf '%s\n' "$!" > "$NODE_DIR/xrpld.pid"
  log " Node $i: RPC=$RPC_PORT WS=$WS_PORT Peer=$PEER_PORT PID=$!"
done
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Step 3: Wait for consensus
|
|
# ---------------------------------------------------------------------------
|
|
# server_info_field PORT JQ_FILTER
#
# Send a server_info request to the JSON-RPC port on localhost:PORT and print
# the value selected by JQ_FILTER. Prints nothing and returns non-zero when
# the request or the JSON parsing fails (the script runs with pipefail).
server_info_field() {
  curl -sf "http://localhost:$1" -d '{"method":"server_info"}' 2>/dev/null \
    | jq -r "$2" 2>/dev/null
}

log "Step 3: Waiting for consensus..."
consensus_reached=false
progress_open=false
for ((attempt = 1; attempt <= 120; attempt++)); do
  # Count the nodes whose server_state is currently "proposing".
  ready=0
  for ((i = 1; i <= NUM_NODES; i++)); do
    port=$((RPC_PORT_BASE + i - 1))
    state=$(server_info_field "$port" '.result.info.server_state') || state=""
    if [ "$state" = "proposing" ]; then
      ready=$((ready + 1))
    fi
  done

  if [ "$ready" -ge "$NUM_NODES" ]; then
    # End the \r-style progress line first so the message starts on a fresh
    # line instead of being glued to the progress text.
    if [ "$progress_open" = true ]; then
      printf '\n'
    fi
    ok "All $NUM_NODES nodes proposing (attempt $attempt)"
    consensus_reached=true
    break
  fi

  printf "\r %d/%d nodes proposing..." "$ready" "$NUM_NODES"
  progress_open=true
  sleep 1
done

# A consensus timeout is reported but does not abort the run; the remaining
# steps still execute.
if [ "$consensus_reached" != true ]; then
  if [ "$progress_open" = true ]; then
    printf '\n'
  fi
  warn "Consensus timeout — $ready/$NUM_NODES nodes ready"
fi

# Wait for first validated ledger (sequence greater than 2), polling the
# first node only. A timeout is again only a warning.
log "Waiting for validated ledger..."
for ((attempt = 1; attempt <= 60; attempt++)); do
  val_seq=$(server_info_field "$RPC_PORT_BASE" '.result.info.validated_ledger.seq // 0') \
    || val_seq=0
  # 2>/dev/null hides the "integer expression expected" noise if the node
  # returned something non-numeric; the test then simply fails.
  if [ "$val_seq" -gt 2 ] 2>/dev/null; then
    ok "Validated ledger: seq $val_seq"
    break
  fi
  if [ "$attempt" -eq 60 ]; then
    warn "No validated ledger after 60s"
  fi
  sleep 1
done
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Step 4: Run RPC load generator
|
|
# ---------------------------------------------------------------------------
|
|
log "Step 4: Running RPC load generator (${RPC_RATE} RPS for ${RPC_DURATION}s)..."

# One WebSocket URL per node, kept in an array so each URL is passed as its
# own argument without relying on unquoted word splitting.
WS_ENDPOINTS=()
for ((i = 1; i <= NUM_NODES; i++)); do
  WS_ENDPOINTS+=("ws://localhost:$((WS_PORT_BASE + i - 1))")
done

# A non-zero exit from the generator is reported but does not stop the run.
if ! python3 "$SCRIPT_DIR/rpc_load_generator.py" \
  --endpoints "${WS_ENDPOINTS[@]}" \
  --rate "$RPC_RATE" \
  --duration "$RPC_DURATION" \
  --output "$REPORT_DIR/rpc-load-results.json"; then
  warn "RPC load generator returned non-zero exit"
fi

ok "RPC load generation complete."
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Step 5: Run transaction submitter
|
|
# ---------------------------------------------------------------------------
|
|
log "Step 5: Running transaction submitter (${TX_TPS} TPS for ${TX_DURATION}s)..."

# Transactions are submitted through the first node's WebSocket port only.
# A non-zero exit from the submitter is reported but does not stop the run.
if ! python3 "$SCRIPT_DIR/tx_submitter.py" \
  --endpoint "ws://localhost:$WS_PORT_BASE" \
  --tps "$TX_TPS" \
  --duration "$TX_DURATION" \
  --output "$REPORT_DIR/tx-submit-results.json"; then
  warn "Transaction submitter returned non-zero exit"
fi

ok "Transaction submission complete."
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Step 6: Wait for telemetry propagation
|
|
# ---------------------------------------------------------------------------
|
|
# Fixed pause before validation starts. A single variable keeps the logged
# duration and the actual sleep in sync.
propagation_wait_secs=30
log "Step 6: Waiting ${propagation_wait_secs}s for telemetry data to propagate..."
sleep "$propagation_wait_secs"
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Step 7: Run telemetry validation suite
|
|
# ---------------------------------------------------------------------------
|
|
log "Step 7: Running telemetry validation suite..."

# Arguments are collected in an array so the report path stays one argument
# even if it contains whitespace.
VALIDATION_ARGS=(--report "$REPORT_DIR/validation-report.json")
if [ "$SKIP_LOKI" = true ]; then
  VALIDATION_ARGS+=(--skip-loki)
fi

# Capture the validator's exit status without tripping `set -e`; it becomes
# the exit status of the whole script.
VALIDATION_EXIT=0
python3 "$SCRIPT_DIR/validate_telemetry.py" "${VALIDATION_ARGS[@]}" || VALIDATION_EXIT=$?

if [ "$VALIDATION_EXIT" -eq 0 ]; then
  ok "All telemetry validation checks passed!"
else
  fail "Some telemetry validation checks failed (exit $VALIDATION_EXIT)"
fi
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Step 8: (Optional) Run benchmark
|
|
# ---------------------------------------------------------------------------
|
|
# Optional step, enabled with --with-benchmark. The benchmark is invoked with
# its own fixed settings (120 s, 3 nodes) rather than this run's --nodes and
# duration options. A non-zero exit is reported but does not change the
# script's exit status.
if [ "$WITH_BENCHMARK" = true ]; then
  log "Step 8: Running performance benchmark..."
  if ! bash "$SCRIPT_DIR/benchmark.sh" \
    --xrpld "$XRPLD" \
    --duration 120 \
    --nodes 3 \
    --output "$REPORT_DIR"; then
    warn "Benchmark returned non-zero exit"
  fi
fi
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Summary
|
|
# ---------------------------------------------------------------------------
|
|
# Final report: where the reports are, which services and nodes are still
# running, and how to tear everything down. Nothing is stopped here.
cat <<EOF

===========================================================
 FULL VALIDATION RESULTS
===========================================================

 Reports directory: $REPORT_DIR

EOF

# Listing the reports is informational only; never fail the run over it.
ls -la "$REPORT_DIR/" 2>/dev/null || true

cat <<EOF

 Observability stack is running:
 Jaeger UI: http://localhost:16686
 Grafana: http://localhost:3000
 Prometheus: http://localhost:9090

 xrpld nodes ($NUM_NODES) are running:
EOF

for ((i = 1; i <= NUM_NODES; i++)); do
  rpc=$((RPC_PORT_BASE + i - 1))
  ws=$((WS_PORT_BASE + i - 1))
  # Fall back to 'unknown' when the PID file cannot be read.
  pid=$(cat "$WORKDIR/node$i/xrpld.pid" 2>/dev/null) || pid='unknown'
  echo " Node $i: RPC=$rpc WS=$ws PID=$pid"
done

cat <<EOF

 To tear down:
 $0 --cleanup

===========================================================
EOF

# Propagate the validation suite's exit status as the script's result.
exit "$VALIDATION_EXIT"
|