mirror of
https://github.com/XRPLF/rippled.git
synced 2026-04-29 15:37:57 +00:00
- capture_timings.py: fail when captured/total ratio < 50% (--min-capture-ratio). Prevents silent pass on unreachable Prometheus. - run-full-validation.sh: set REGRESSION_EXIT=2 on capture failure so the final exit code reflects it. Update exit code docs in header. - compare_to_baseline.py: extract _skip_delta helper to bring compute_delta under 80 lines. Fix 0.0-as-falsy bug in abs_bound resolution (use explicit None check instead of `or`). Remove dead variable override_prefix_key. - prom_queries.py: extract _build_simple_entries and _build_job_entries to bring build_query_plan under 80 lines. Fix module docstring return type example. Use aiohttp.ClientTimeout instead of bare int. - telemetry-validation.yml: add set -euo pipefail to regression summary step; guard jq calls with -e flag and fallback; fail on missing baseline file; emit ::warning annotation when timings.json missing. - baselines/README.md: document the placeholder field.
306 lines
12 KiB
YAML
306 lines
12 KiB
YAML
# Telemetry Validation CI Workflow
|
|
#
|
|
# Builds rippled with telemetry enabled, runs the multi-node workload
|
|
# harness, validates all telemetry data, and runs performance benchmarks.
|
|
#
|
|
# This is a separate workflow from the main CI. It runs:
|
|
# - On manual dispatch (workflow_dispatch)
|
|
# - On pushes to telemetry-related branches
|
|
#
|
|
# The workflow is intentionally heavyweight (builds rippled, starts Docker
|
|
# services, runs a multi-node cluster) — it validates the full telemetry
|
|
# stack end-to-end rather than individual unit tests.
|
|
#
|
|
# Architecture: two jobs to leverage cached dependencies:
|
|
# 1. build-xrpld — runs on a self-hosted runner inside the same container
|
|
# image the main CI uses (debian-bookworm-gcc-13). This ensures Conan
|
|
# packages are fetched from the XRPLF remote instead of built from
|
|
# source, and ccache hits the remote cache.
|
|
# 2. validate-telemetry — runs on ubuntu-latest (which has Docker) to
|
|
# launch the telemetry stack (OTel collector, Prometheus, Tempo, etc.)
|
|
# and validate the full pipeline end-to-end.
|
|
|
|
name: Telemetry Validation

# Triggers: manual dispatch (with tunable load parameters) and pushes to
# telemetry-related branches that touch telemetry-related paths.
on:
  workflow_dispatch:
    # All inputs are optional; the numeric ones are passed as quoted strings.
    inputs:
      rpc_rate:
        description: 'RPC load rate (requests per second)'
        default: '50'
        required: false
      rpc_duration:
        description: 'RPC load duration (seconds)'
        default: '120'
        required: false
      tx_tps:
        description: 'Transaction submit rate (TPS)'
        default: '5'
        required: false
      tx_duration:
        description: 'Transaction submit duration (seconds)'
        default: '120'
        required: false
      run_benchmark:
        description: 'Run performance benchmarks'
        type: boolean
        default: false
        required: false

  push:
    # Both filters must match: the branch pattern AND at least one path.
    branches:
      - 'pratik/otel-phase*'
      - 'feature/otel-*'
      - 'feature/telemetry-*'
    paths:
      - '.github/workflows/telemetry-validation.yml'
      - 'docker/telemetry/**'
      - 'include/xrpl/basics/Telemetry*.h'
      - 'src/xrpld/app/misc/Telemetry*'
|
|
|
# One run per ref: a newer run on the same ref cancels the one in flight.
concurrency:
  cancel-in-progress: true
  group: 'telemetry-validation-${{ github.ref }}'

# Every `run:` step in this workflow uses bash unless it overrides the shell.
defaults:
  run:
    shell: bash

# Workflow-wide environment; BUILD_DIR is read by both jobs via env.BUILD_DIR.
env:
  BUILD_DIR: 'build'
|
|
|
|
jobs:
|
|
# ── Job 1: Build xrpld in the same container the main CI uses ──────
|
|
# This ensures Conan binary packages are fetched from the XRPLF remote
|
|
# (matching package IDs) and ccache hits the remote compilation cache.
|
|
build-xrpld:
  name: Build xrpld
  # Self-hosted runner selected by label set; the build itself runs inside
  # the container image named below.
  runs-on:
    - self-hosted
    - Linux
    - X64
    - heavy
  container: ghcr.io/xrplf/ci/debian-bookworm:gcc-13-sha-ab4d1f0
  timeout-minutes: 60
  env:
    # ccache settings; values are quoted so they reach the process as the
    # exact strings written here.
    CCACHE_NAMESPACE: 'telemetry-validation'
    CCACHE_REMOTE_ONLY: 'true'
    CCACHE_REMOTE_STORAGE: 'http://cache.dev.ripplex.io:8080|layout=bazel'
    CCACHE_SLOPPINESS: 'include_file_ctime,include_file_mtime'
  steps:
    - name: Checkout repository
      uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

    - name: Prepare runner
      uses: XRPLF/actions/prepare-runner@2cbf481018d930656e9276fcc20dc0e3a0be5b6d
      with:
        # ccache is only enabled when the workflow runs under the XRPLF org.
        enable_ccache: ${{ github.repository_owner == 'XRPLF' }}

    - name: Print build environment
      uses: ./.github/actions/print-env

    - name: Get number of processors
      id: nproc
      uses: XRPLF/actions/get-nproc@cf0433aa74563aead044a1e395610c96d65a37cf
      with:
        # NOTE(review): presumably reserves two cores from the reported
        # count; confirm against the get-nproc action.
        subtract: 2

    - name: Setup Conan
      uses: ./.github/actions/setup-conan

    - name: Build dependencies
      uses: ./.github/actions/build-deps
      with:
        build_type: Release
        build_nproc: ${{ steps.nproc.outputs.nproc }}
        log_verbosity: verbose

    # Configure from inside BUILD_DIR; `..` points CMake at the repo root.
    - name: Configure CMake
      working-directory: ${{ env.BUILD_DIR }}
      run: |
        cmake \
          -G Ninja \
          -DCMAKE_TOOLCHAIN_FILE:FILEPATH=build/generators/conan_toolchain.cmake \
          -DCMAKE_BUILD_TYPE=Release \
          ..

    # Parallelism comes from the nproc step via the BUILD_NPROC variable.
    - name: Build xrpld
      working-directory: ${{ env.BUILD_DIR }}
      env:
        BUILD_NPROC: ${{ steps.nproc.outputs.nproc }}
      run: |
        cmake \
          --build . \
          --config Release \
          --parallel "${BUILD_NPROC}" \
          --target xrpld

    - name: Show ccache statistics
      if: ${{ github.repository_owner == 'XRPLF' }}
      run: ccache --show-stats -vv

    # Hands the binary to the validate-telemetry job; a missing binary is a
    # hard error rather than a warning.
    - name: Upload xrpld binary
      uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
      with:
        name: xrpld-telemetry
        path: ${{ env.BUILD_DIR }}/xrpld
        if-no-files-found: error
        retention-days: 1
|
|
|
|
# ── Job 2: Run telemetry validation on ubuntu-latest (has Docker) ──
|
|
validate-telemetry:
  name: Telemetry Stack Validation
  needs: build-xrpld
  runs-on: ubuntu-latest
  timeout-minutes: 30
  steps:
    - name: Checkout repository
      uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

    - name: Install Python dependencies
      run: pip3 install -r docker/telemetry/workload/requirements.txt

    # Fetches the binary uploaded by the build-xrpld job into BUILD_DIR.
    - name: Download xrpld binary
      uses: actions/download-artifact@95815c38cf2ff2164869cbab79da8d1f422bc89e # v4.2.1
      with:
        name: xrpld-telemetry
        path: ${{ env.BUILD_DIR }}

    - name: Make binaries and scripts executable
      run: |
        chmod +x ${{ env.BUILD_DIR }}/xrpld
        chmod +x docker/telemetry/workload/*.sh

    - name: Run full telemetry validation
      id: validation
      env:
        # Dispatch inputs are empty on push events, hence the fallbacks.
        RPC_RATE: ${{ github.event.inputs.rpc_rate || '50' }}
        RPC_DURATION: ${{ github.event.inputs.rpc_duration || '120' }}
        TX_TPS: ${{ github.event.inputs.tx_tps || '5' }}
        TX_DURATION: ${{ github.event.inputs.tx_duration || '120' }}
        RUN_BENCHMARK: ${{ github.event.inputs.run_benchmark }}
      run: |
        # Build the argument list as an array so each value is passed as
        # exactly one argument (no word splitting or globbing of inputs).
        args=(--xrpld "${{ env.BUILD_DIR }}/xrpld" --skip-loki)
        args+=(--rpc-rate "$RPC_RATE")
        args+=(--rpc-duration "$RPC_DURATION")
        args+=(--tx-tps "$TX_TPS")
        args+=(--tx-duration "$TX_DURATION")
        if [ "$RUN_BENCHMARK" = "true" ]; then
          args+=(--with-benchmark)
        fi
        docker/telemetry/workload/run-full-validation.sh "${args[@]}"
      # continue-on-error allows subsequent steps (artifact upload,
      # summary printing) to run even if validation fails. The final
      # "Check validation result" step re-checks steps.validation.outcome
      # (the pre-continue-on-error result) and fails the job properly.
      continue-on-error: true

    - name: Upload validation reports
      if: always()
      uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
      with:
        name: telemetry-validation-reports
        path: /tmp/xrpld-validation/reports/
        retention-days: 30

    # The validation step uses continue-on-error, so a validation failure
    # does not make failure() true for later steps. Check the step's
    # outcome explicitly so node logs are uploaded in that case too.
    - name: Upload node logs
      if: ${{ failure() || steps.validation.outcome == 'failure' }}
      uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
      with:
        name: xrpld-node-logs
        path: /tmp/xrpld-validation/node*/debug.log
        retention-days: 7

    # Renders the validation report counts (and any failed checks) into the
    # Step Summary. Does nothing when the report file is absent.
    - name: Print validation summary
      if: always()
      run: |
        REPORT="/tmp/xrpld-validation/reports/validation-report.json"
        if [ -f "$REPORT" ]; then
          echo "## Telemetry Validation Results" >> "$GITHUB_STEP_SUMMARY"
          echo "" >> "$GITHUB_STEP_SUMMARY"
          TOTAL=$(jq '.summary.total' "$REPORT")
          PASSED=$(jq '.summary.passed' "$REPORT")
          FAILED=$(jq '.summary.failed' "$REPORT")
          echo "| Metric | Value |" >> "$GITHUB_STEP_SUMMARY"
          echo "|--------|-------|" >> "$GITHUB_STEP_SUMMARY"
          echo "| Total Checks | $TOTAL |" >> "$GITHUB_STEP_SUMMARY"
          echo "| Passed | $PASSED |" >> "$GITHUB_STEP_SUMMARY"
          echo "| Failed | $FAILED |" >> "$GITHUB_STEP_SUMMARY"
          echo "" >> "$GITHUB_STEP_SUMMARY"
          if [ "$FAILED" -gt 0 ]; then
            echo "### Failed Checks" >> "$GITHUB_STEP_SUMMARY"
            echo "" >> "$GITHUB_STEP_SUMMARY"
            jq -r '.checks[] | select(.passed == false) | "- **\(.name)**: \(.message)"' "$REPORT" >> "$GITHUB_STEP_SUMMARY"
          fi
        fi

    # Publishes captured OTel timings + regression report to the Step Summary.
    # When the committed baseline is a placeholder, emits a fenced JSON block
    # that can be copy-pasted directly into baselines/baseline-timings.json.
    # When the baseline is populated, summarises the top regressions so the
    # PR author sees the failure reason without downloading artifacts.
    - name: Print regression summary
      if: always()
      run: |
        set -euo pipefail
        TIMINGS="/tmp/xrpld-validation/reports/timings.json"
        REGRESSION="/tmp/xrpld-validation/reports/regression-report.json"
        BASELINE="docker/telemetry/workload/baselines/baseline-timings.json"

        if [ ! -f "$TIMINGS" ]; then
          echo "## Regression Gate: no timings captured" >> "$GITHUB_STEP_SUMMARY"
          echo "::warning::capture_timings.py did not produce timings.json — regression gate was not evaluated."
          exit 0
        fi

        if [ ! -f "$BASELINE" ]; then
          echo "## Regression Gate: baseline file missing" >> "$GITHUB_STEP_SUMMARY"
          echo "::error::baselines/baseline-timings.json not found in checkout"
          exit 1
        fi

        # No -e here: the expression legitimately evaluates to `false` for
        # a populated baseline, and `jq -e` would turn that into exit
        # status 1 and a bogus parse error. Without -e, jq still exits
        # non-zero when the file is unreadable or not valid JSON.
        IS_PLACEHOLDER=$(jq -r '.placeholder == true or (.metrics | length == 0)' "$BASELINE") || {
          echo "::error::Failed to parse baseline JSON"
          exit 1
        }

        echo "## OTel Timings Regression Gate" >> "$GITHUB_STEP_SUMMARY"
        echo "" >> "$GITHUB_STEP_SUMMARY"

        if [ "$IS_PLACEHOLDER" = "true" ]; then
          echo "### Paste into \`baselines/baseline-timings.json\`" >> "$GITHUB_STEP_SUMMARY"
          echo "" >> "$GITHUB_STEP_SUMMARY"
          echo "The committed baseline is a placeholder. Open a PR replacing" \
            "its contents with the JSON block below to activate the" \
            "regression gate." >> "$GITHUB_STEP_SUMMARY"
          echo "" >> "$GITHUB_STEP_SUMMARY"
          echo '```json' >> "$GITHUB_STEP_SUMMARY"
          cat "$TIMINGS" >> "$GITHUB_STEP_SUMMARY"
          echo '```' >> "$GITHUB_STEP_SUMMARY"
        elif [ -f "$REGRESSION" ]; then
          # -e is appropriate for these numeric fields: a missing (null)
          # value yields a non-zero status and the count falls back to 0.
          REGR_COUNT=$(jq -e '.summary.regressions' "$REGRESSION") || REGR_COUNT=0
          IMPR_COUNT=$(jq -e '.summary.improvements' "$REGRESSION") || IMPR_COUNT=0
          TOTAL=$(jq -e '.summary.total' "$REGRESSION") || TOTAL=0
          echo "| Stat | Count |" >> "$GITHUB_STEP_SUMMARY"
          echo "|------|-------|" >> "$GITHUB_STEP_SUMMARY"
          echo "| Metrics compared | $TOTAL |" >> "$GITHUB_STEP_SUMMARY"
          echo "| Regressions | $REGR_COUNT |" >> "$GITHUB_STEP_SUMMARY"
          echo "| Improvements | $IMPR_COUNT |" >> "$GITHUB_STEP_SUMMARY"
          echo "" >> "$GITHUB_STEP_SUMMARY"
          if [ "$REGR_COUNT" -gt 0 ]; then
            echo "### Regressions" >> "$GITHUB_STEP_SUMMARY"
            echo "" >> "$GITHUB_STEP_SUMMARY"
            echo "| Metric | Baseline | Current | Δ | % | Unit |" >> "$GITHUB_STEP_SUMMARY"
            echo "|--------|---------:|--------:|--:|--:|------|" >> "$GITHUB_STEP_SUMMARY"
            jq -r '.metrics[] | select(.regressed) | "| \(.key) | \(.baseline) | \(.current) | \(.delta) | \(.pct_change)% | \(.unit) |"' \
              "$REGRESSION" >> "$GITHUB_STEP_SUMMARY"
          fi
        fi

    # Best-effort teardown: errors are deliberately ignored.
    - name: Cleanup
      if: always()
      run: |
        docker/telemetry/workload/run-full-validation.sh --cleanup 2>/dev/null || true

    # Turns the masked validation failure back into a failed job once the
    # reporting and cleanup steps have run.
    - name: Check validation result
      if: steps.validation.outcome == 'failure'
      run: |
        echo "Telemetry validation failed. Check the uploaded reports for details."
        exit 1
|