# rippled/.github/workflows/telemetry-validation.yml

# Telemetry Validation CI Workflow
#
# Builds rippled with telemetry enabled, runs the multi-node workload
# harness, validates all telemetry data, and runs performance benchmarks.
#
# This is a separate workflow from the main CI. It runs:
#   - On manual dispatch (workflow_dispatch)
#   - On pushes to telemetry-related branches that touch telemetry paths
#     (see the `paths` filter under `on.push`)
#
# The workflow is intentionally heavyweight (builds rippled, starts Docker
# services, runs a multi-node cluster) — it validates the full telemetry
# stack end-to-end rather than individual unit tests.
#
# Architecture: two jobs to leverage cached dependencies:
#   1. build-xrpld — runs on a self-hosted runner inside the same container
#      image the main CI uses (debian-bookworm-gcc-13). This ensures Conan
#      packages are fetched from the XRPLF remote instead of built from
#      source, and ccache hits the remote cache.
#   2. validate-telemetry — runs on ubuntu-latest (which has Docker) to
#      launch the telemetry stack (OTel collector, Prometheus, Tempo, etc.)
#      and validate the full pipeline end-to-end.

name: Telemetry Validation

# Triggers: a manual dispatch with tunable load parameters, or a push to a
# telemetry branch that changes one of the telemetry-related paths.
on:
  workflow_dispatch:
    inputs:
      # Load parameters. Defaults are quoted so they stay strings.
      rpc_rate:
        description: 'RPC load rate (requests per second)'
        default: '50'
        required: false
      rpc_duration:
        description: 'RPC load duration (seconds)'
        default: '120'
        required: false
      tx_tps:
        description: 'Transaction submit rate (TPS)'
        default: '5'
        required: false
      tx_duration:
        description: 'Transaction submit duration (seconds)'
        default: '120'
        required: false
      # Benchmarks are opt-in.
      run_benchmark:
        description: 'Run performance benchmarks'
        type: boolean
        default: false
        required: false

  push:
    # Both filters apply: the branch must match AND a listed path must change.
    branches:
      - 'pratik/otel-phase*'
      - 'feature/otel-*'
      - 'feature/telemetry-*'
    paths:
      - '.github/workflows/telemetry-validation.yml'
      - 'docker/telemetry/**'
      - 'include/xrpl/basics/Telemetry*.h'
      - 'src/xrpld/app/misc/Telemetry*'

# One concurrency group per ref; a newer run cancels the in-progress one.
concurrency:
  group: 'telemetry-validation-${{ github.ref }}'
  cancel-in-progress: true

# Default shell for every `run:` step in this workflow.
defaults:
  run:
    shell: 'bash'

# Build directory referenced by both jobs via `env.BUILD_DIR`.
env:
  BUILD_DIR: 'build'

jobs:
  # ── Job 1: Build xrpld in the same container the main CI uses ──────
  # Reusing that image keeps Conan package IDs aligned with the binary
  # packages on the XRPLF remote and lets ccache hit the remote cache.
  build-xrpld:
    name: "Build xrpld"
    timeout-minutes: 60
    runs-on:
      - self-hosted
      - Linux
      - X64
      - heavy
    container: "ghcr.io/xrplf/ci/debian-bookworm:gcc-13-sha-ab4d1f0"
    env:
      # ccache configuration: dedicated namespace, remote storage only.
      CCACHE_NAMESPACE: "telemetry-validation"
      CCACHE_REMOTE_ONLY: "true"
      CCACHE_REMOTE_STORAGE: "http://cache.dev.ripplex.io:8080|layout=bazel"
      CCACHE_SLOPPINESS: "include_file_ctime,include_file_mtime"
    steps:
      - name: Checkout repository
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

      # ccache is enabled only when the repository owner is XRPLF.
      - name: Prepare runner
        uses: XRPLF/actions/prepare-runner@2cbf481018d930656e9276fcc20dc0e3a0be5b6d
        with:
          enable_ccache: ${{ github.repository_owner == 'XRPLF' }}

      - name: Print build environment
        uses: ./.github/actions/print-env

      # The result is read later as steps.nproc.outputs.nproc.
      # NOTE(review): `subtract: 2` presumably leaves two cores free — confirm
      # against the get-nproc action.
      - name: Get number of processors
        id: nproc
        uses: XRPLF/actions/get-nproc@cf0433aa74563aead044a1e395610c96d65a37cf
        with:
          subtract: 2

      - name: Setup Conan
        uses: ./.github/actions/setup-conan

      - name: Build dependencies
        uses: ./.github/actions/build-deps
        with:
          build_type: Release
          build_nproc: ${{ steps.nproc.outputs.nproc }}
          log_verbosity: verbose

      # Runs from BUILD_DIR, so `..` is the repository root.
      - name: Configure CMake
        working-directory: ${{ env.BUILD_DIR }}
        run: |
          configure_args=(
            -G Ninja
            -DCMAKE_TOOLCHAIN_FILE:FILEPATH=build/generators/conan_toolchain.cmake
            -DCMAKE_BUILD_TYPE=Release
          )
          cmake "${configure_args[@]}" ..

      # Only the xrpld target is built; parallelism comes from the nproc step.
      - name: Build xrpld
        working-directory: ${{ env.BUILD_DIR }}
        env:
          BUILD_NPROC: ${{ steps.nproc.outputs.nproc }}
        run: |
          cmake --build . --config Release \
            --parallel "${BUILD_NPROC}" --target xrpld

      - name: Show ccache statistics
        if: github.repository_owner == 'XRPLF'
        run: ccache --show-stats -vv

      # Hand the binary to the validation job; a missing file is an error.
      - name: Upload xrpld binary
        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
        with:
          name: xrpld-telemetry
          path: ${{ env.BUILD_DIR }}/xrpld
          if-no-files-found: error
          retention-days: 1

  # ── Job 2: Run telemetry validation on ubuntu-latest (has Docker) ──
  # Starts only after build-xrpld has finished and uploaded the binary.
  validate-telemetry:
    name: "Telemetry Stack Validation"
    runs-on: ubuntu-latest
    needs: build-xrpld
    timeout-minutes: 30
    steps:
      - name: Checkout repository
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

      # Python packages required by the workload harness.
      - name: Install Python dependencies
        run: pip3 install -r docker/telemetry/workload/requirements.txt

      # Places the artifact from build-xrpld into BUILD_DIR.
      - name: Download xrpld binary
        uses: actions/download-artifact@95815c38cf2ff2164869cbab79da8d1f422bc89e # v4.2.1
        with:
          path: ${{ env.BUILD_DIR }}
          name: xrpld-telemetry

      # NOTE(review): presumably needed because the executable bit does not
      # survive the artifact round-trip — confirm.
      - name: Make binaries and scripts executable
        run: |
          chmod +x ${{ env.BUILD_DIR }}/xrpld
          chmod +x docker/telemetry/workload/*.sh

      # Runs the end-to-end validation harness. Load parameters come from the
      # workflow_dispatch inputs; when an input is empty (e.g. on push events)
      # the `||` fallbacks below supply the defaults.
      - name: Run full telemetry validation
        id: validation
        env:
          RPC_RATE: ${{ github.event.inputs.rpc_rate || '50' }}
          RPC_DURATION: ${{ github.event.inputs.rpc_duration || '120' }}
          TX_TPS: ${{ github.event.inputs.tx_tps || '5' }}
          TX_DURATION: ${{ github.event.inputs.tx_duration || '120' }}
          RUN_BENCHMARK: ${{ github.event.inputs.run_benchmark }}
        run: |
          # Build the argument list as an array so every input value is
          # passed as exactly one argument. A flat string expanded unquoted
          # would be word-split and glob-expanded, letting a free-form
          # dispatch input inject extra flags.
          args=(
            --xrpld "${{ env.BUILD_DIR }}/xrpld"
            --skip-loki
            --rpc-rate "$RPC_RATE"
            --rpc-duration "$RPC_DURATION"
            --tx-tps "$TX_TPS"
            --tx-duration "$TX_DURATION"
          )
          # The benchmark pass is opt-in.
          if [ "$RUN_BENCHMARK" = "true" ]; then
            args+=(--with-benchmark)
          fi
          docker/telemetry/workload/run-full-validation.sh "${args[@]}"
        # continue-on-error allows subsequent steps (artifact upload,
        # summary printing) to run even if validation fails. The final
        # "Check validation result" step re-checks steps.validation.outcome
        # (the pre-continue-on-error result) and fails the job properly.
        continue-on-error: true

      # Publish whatever reports exist, regardless of earlier step results.
      - name: Upload validation reports
        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
        if: ${{ always() }}
        with:
          path: /tmp/xrpld-validation/reports/
          name: telemetry-validation-reports
          retention-days: 30

      # Upload per-node debug logs when something went wrong. The validation
      # step sets continue-on-error, so a validation failure alone does not
      # make failure() true for later steps; its `outcome` is checked
      # explicitly so the logs are uploaded in that case too.
      - name: Upload node logs
        if: ${{ failure() || steps.validation.outcome == 'failure' }}
        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
        with:
          name: xrpld-node-logs
          path: /tmp/xrpld-validation/node*/debug.log
          retention-days: 7

      # Renders validation-report.json as a Markdown table in the Step
      # Summary, followed by a bullet list of failed checks. Does nothing
      # when the report file was not produced.
      - name: Print validation summary
        if: always()
        run: |
          REPORT="/tmp/xrpld-validation/reports/validation-report.json"
          if [ -f "$REPORT" ]; then
            # `// 0` substitutes 0 for a missing or null counter, so the
            # table never shows "null" and the numeric test below always
            # receives an integer.
            TOTAL=$(jq '.summary.total // 0' "$REPORT")
            PASSED=$(jq '.summary.passed // 0' "$REPORT")
            FAILED=$(jq '.summary.failed // 0' "$REPORT")
            {
              echo "## Telemetry Validation Results"
              echo ""
              echo "| Metric | Value |"
              echo "|--------|-------|"
              echo "| Total Checks | $TOTAL |"
              echo "| Passed | $PASSED |"
              echo "| Failed | $FAILED |"
              echo ""
              if [ "$FAILED" -gt 0 ]; then
                echo "### Failed Checks"
                echo ""
                jq -r '.checks[] | select(.passed == false) | "- **\(.name)**: \(.message)"' "$REPORT"
              fi
            } >> "$GITHUB_STEP_SUMMARY"
          fi

      # Publishes captured OTel timings + regression report to the Step Summary.
      # When the committed baseline is a placeholder, emits a fenced JSON block
      # that can be copy-pasted directly into baselines/baseline-timings.json.
      # When the baseline is populated, summarises the top regressions so the
      # PR author sees the failure reason without downloading artifacts.
      #
      # Exit behaviour: missing timings is a warning (exit 0); a missing or
      # unparseable baseline file is an error (exit 1).
      - name: Print regression summary
        if: always()
        run: |
          set -euo pipefail
          TIMINGS="/tmp/xrpld-validation/reports/timings.json"
          REGRESSION="/tmp/xrpld-validation/reports/regression-report.json"
          BASELINE="docker/telemetry/workload/baselines/baseline-timings.json"

          if [ ! -f "$TIMINGS" ]; then
            echo "## Regression Gate: no timings captured" >> "$GITHUB_STEP_SUMMARY"
            echo "::warning::capture_timings.py did not produce timings.json — regression gate was not evaluated."
            exit 0
          fi

          if [ ! -f "$BASELINE" ]; then
            echo "## Regression Gate: baseline file missing" >> "$GITHUB_STEP_SUMMARY"
            echo "::error::baselines/baseline-timings.json not found in checkout"
            exit 1
          fi

          # Deliberately no `-e` here: with --exit-status jq exits 1 whenever
          # the result is `false`, which would treat every populated
          # (non-placeholder) baseline as a parse failure. Without it, jq
          # exits non-zero only on a real parse or evaluation error.
          IS_PLACEHOLDER=$(jq -r '.placeholder == true or (.metrics | length == 0)' "$BASELINE") || {
            echo "::error::Failed to parse baseline JSON"
            exit 1
          }

          echo "## OTel Timings Regression Gate" >> "$GITHUB_STEP_SUMMARY"
          echo "" >> "$GITHUB_STEP_SUMMARY"

          if [ "$IS_PLACEHOLDER" = "true" ]; then
            echo "### Paste into \`baselines/baseline-timings.json\`" >> "$GITHUB_STEP_SUMMARY"
            echo "" >> "$GITHUB_STEP_SUMMARY"
            echo "The committed baseline is a placeholder. Open a PR replacing" \
                 "its contents with the JSON block below to activate the" \
                 "regression gate." >> "$GITHUB_STEP_SUMMARY"
            echo "" >> "$GITHUB_STEP_SUMMARY"
            echo '```json' >> "$GITHUB_STEP_SUMMARY"
            cat "$TIMINGS" >> "$GITHUB_STEP_SUMMARY"
            echo '```' >> "$GITHUB_STEP_SUMMARY"
          elif [ -f "$REGRESSION" ]; then
            # Here `-e` is intentional: a null/missing counter makes jq exit
            # non-zero and the `||` fallback substitutes 0. A numeric 0 is
            # neither false nor null, so it passes through unchanged.
            REGR_COUNT=$(jq -e '.summary.regressions' "$REGRESSION") || REGR_COUNT=0
            IMPR_COUNT=$(jq -e '.summary.improvements' "$REGRESSION") || IMPR_COUNT=0
            TOTAL=$(jq -e '.summary.total' "$REGRESSION") || TOTAL=0
            echo "| Stat | Count |" >> "$GITHUB_STEP_SUMMARY"
            echo "|------|-------|" >> "$GITHUB_STEP_SUMMARY"
            echo "| Metrics compared | $TOTAL |" >> "$GITHUB_STEP_SUMMARY"
            echo "| Regressions | $REGR_COUNT |" >> "$GITHUB_STEP_SUMMARY"
            echo "| Improvements | $IMPR_COUNT |" >> "$GITHUB_STEP_SUMMARY"
            echo "" >> "$GITHUB_STEP_SUMMARY"
            if [ "$REGR_COUNT" -gt 0 ]; then
              echo "### Regressions" >> "$GITHUB_STEP_SUMMARY"
              echo "" >> "$GITHUB_STEP_SUMMARY"
              echo "| Metric | Baseline | Current | Δ | % | Unit |" >> "$GITHUB_STEP_SUMMARY"
              echo "|--------|---------:|--------:|--:|--:|------|" >> "$GITHUB_STEP_SUMMARY"
              jq -r '.metrics[] | select(.regressed) | "| \(.key) | \(.baseline) | \(.current) | \(.delta) | \(.pct_change)% | \(.unit) |"' \
                "$REGRESSION" >> "$GITHUB_STEP_SUMMARY"
            fi
          fi

      # Best-effort teardown: stderr is discarded and a non-zero exit is
      # ignored, so this step cannot fail the job.
      - name: Cleanup
        if: ${{ always() }}
        run: |
          docker/telemetry/workload/run-full-validation.sh --cleanup 2>/dev/null || true

      # Final gate: fail the job if the validation step itself failed, since
      # its continue-on-error setting kept the job green until now.
      - name: Check validation result
        if: ${{ steps.validation.outcome == 'failure' }}
        run: |
          echo "Telemetry validation failed. Check the uploaded reports for details."
          exit 1