diff --git a/.github/workflows/telemetry-validation.yml b/.github/workflows/telemetry-validation.yml
index 2e64261d5f..834da6fc4e 100644
--- a/.github/workflows/telemetry-validation.yml
+++ b/.github/workflows/telemetry-validation.yml
@@ -230,6 +230,66 @@ jobs:
             fi
           fi
 
+      # Publishes captured OTel timings + regression report to the Step Summary.
+      # When the committed baseline is a placeholder, emits a fenced JSON block
+      # that can be copy-pasted directly into baselines/baseline-timings.json.
+      # When the baseline is populated, summarises the regressions so the
+      # PR author sees the failure reason without downloading artifacts.
+      - name: Print regression summary
+        if: always()
+        run: |
+          TIMINGS="/tmp/xrpld-validation/reports/timings.json"
+          REGRESSION="/tmp/xrpld-validation/reports/regression-report.json"
+          BASELINE="docker/telemetry/workload/baselines/baseline-timings.json"
+
+          if [ ! -f "$TIMINGS" ]; then
+            echo "## Regression Gate: no timings captured" >> "$GITHUB_STEP_SUMMARY"
+            exit 0
+          fi
+
+          # This step only reports and runs even after failures, so a missing or
+          # malformed baseline must not abort it (the default Actions shell is
+          # `bash -e`): fall back to "unknown" and summarise what exists.
+          IS_PLACEHOLDER=$(jq -r '.placeholder == true or (.metrics | length == 0)' "$BASELINE" 2>/dev/null) \
+            || IS_PLACEHOLDER="unknown"
+
+          echo "## OTel Timings Regression Gate" >> "$GITHUB_STEP_SUMMARY"
+          echo "" >> "$GITHUB_STEP_SUMMARY"
+
+          if [ "$IS_PLACEHOLDER" = "true" ]; then
+            echo "### Paste into \`baselines/baseline-timings.json\`" >> "$GITHUB_STEP_SUMMARY"
+            echo "" >> "$GITHUB_STEP_SUMMARY"
+            echo "The committed baseline is a placeholder. Open a PR replacing" \
+              "its contents with the JSON block below to activate the" \
+              "regression gate." >> "$GITHUB_STEP_SUMMARY"
+            echo "" >> "$GITHUB_STEP_SUMMARY"
+            echo '```json' >> "$GITHUB_STEP_SUMMARY"
+            cat "$TIMINGS" >> "$GITHUB_STEP_SUMMARY"
+            echo '```' >> "$GITHUB_STEP_SUMMARY"
+          elif [ -f "$REGRESSION" ]; then
+            # `// 0` keeps the counts numeric when a summary field is absent; a
+            # literal "null" would fail the integer test and hide the table.
+            REGR_COUNT=$(jq -r '.summary.regressions // 0' "$REGRESSION")
+            IMPR_COUNT=$(jq -r '.summary.improvements // 0' "$REGRESSION")
+            TOTAL=$(jq -r '.summary.total // 0' "$REGRESSION")
+            echo "| Stat | Count |" >> "$GITHUB_STEP_SUMMARY"
+            echo "|------|-------|" >> "$GITHUB_STEP_SUMMARY"
+            echo "| Metrics compared | $TOTAL |" >> "$GITHUB_STEP_SUMMARY"
+            echo "| Regressions | $REGR_COUNT |" >> "$GITHUB_STEP_SUMMARY"
+            echo "| Improvements | $IMPR_COUNT |" >> "$GITHUB_STEP_SUMMARY"
+            echo "" >> "$GITHUB_STEP_SUMMARY"
+            if [ "$REGR_COUNT" -gt 0 ]; then
+              echo "### Regressions" >> "$GITHUB_STEP_SUMMARY"
+              echo "" >> "$GITHUB_STEP_SUMMARY"
+              echo "| Metric | Baseline | Current | Δ | % | Unit |" >> "$GITHUB_STEP_SUMMARY"
+              echo "|--------|---------:|--------:|--:|--:|------|" >> "$GITHUB_STEP_SUMMARY"
+              jq -r '.metrics[] | select(.regressed) | "| \(.key) | \(.baseline) | \(.current) | \(.delta) | \(.pct_change)% | \(.unit) |"' \
+                "$REGRESSION" >> "$GITHUB_STEP_SUMMARY"
+            fi
+          else
+            echo "No regression report was produced; see the validation step log." >> "$GITHUB_STEP_SUMMARY"
+          fi
+
       - name: Cleanup
         if: always()
         run: |
diff --git a/OpenTelemetryPlan/06-implementation-phases.md b/OpenTelemetryPlan/06-implementation-phases.md
index 6cc15beb14..5ad344251d 100644
--- a/OpenTelemetryPlan/06-implementation-phases.md
+++ b/OpenTelemetryPlan/06-implementation-phases.md
@@ -869,13 +869,15 @@ All 17 spans, 26 metrics, 10 dashboards, 14 attribute checks, 2 hierarchies, and
 
 **Not implemented or not available in CI**:
 
-1. Performance benchmark suite (Task 10.5) — not started
-2. `rpc.request` -> `rpc.process` parent-child hierarchy — skipped (cross-thread context propagation)
-3. Log-trace correlation validation (Loki) — not included in checks
-4. Full 255+ StatsD metric coverage — only 26 representative metrics validated
-5. Sustained load / backpressure testing — not implemented
-6.
`docs/telemetry-runbook.md` updates — not done -7. `09-data-collection-reference.md` "Validation" section — not done +1. `rpc.request` -> `rpc.process` parent-child hierarchy — skipped (cross-thread context propagation) +2. Log-trace correlation validation (Loki) — not included in checks +3. Full 255+ StatsD metric coverage — only 26 representative metrics validated +4. Sustained load / backpressure testing — not implemented +5. `docs/telemetry-runbook.md` updates — not done +6. `09-data-collection-reference.md` "Validation" section — not done +7. **Automated cross-CI baseline persistence** — the regression gate reads a + committed baseline; baseline updates flow through a manual PR refresh, not + an artifact promoted from `develop` (FU-2). ### Exit Criteria @@ -884,6 +886,8 @@ All 17 spans, 26 metrics, 10 dashboards, 14 attribute checks, 2 hierarchies, and - [x] All 10 Grafana dashboards render data - [ ] Benchmark shows < 3% CPU overhead, < 5MB memory overhead - [x] CI workflow runs validation on telemetry branch changes +- [x] OTel-driven regression gate: captures per-span/per-RPC/per-job timings + from Prometheus and compares against a committed baseline --- @@ -1240,19 +1244,19 @@ Clear, measurable criteria for each phase. 
### 6.12.6 Success Metrics Summary -| Phase | Primary Metric | Secondary Metric | Deadline | Status | -| -------- | -------------------------------- | --------------------------- | -------------- | ------------------ | -| Phase 1 | SDK compiles and runs | Zero overhead when disabled | End of Week 2 | Active | -| Phase 2 | 100% RPC coverage | <1ms latency overhead | End of Week 4 | Active | -| Phase 3 | Cross-node traces work | <5% throughput impact | End of Week 6 | Active | -| Phase 4 | Consensus fully traced | No consensus timing impact | End of Week 8 | Active | -| Phase 5 | Production deployment | Operators trained | End of Week 9 | Active | -| Phase 6 | StatsD metrics in Prometheus | 3 dashboards operational | End of Week 10 | Active | -| Phase 7 | All metrics via OTLP | No StatsD dependency | End of Week 12 | Active | -| Phase 8 | trace_id in logs + Loki | Tempo↔Loki correlation | End of Week 13 | Active | -| Phase 9 | 68+ new internal metrics in Prom | 2 new dashboards | End of Week 15 | Future Enhancement | -| Phase 10 | Full telemetry stack validated | < 3% CPU overhead proven | End of Week 17 | Future Enhancement | -| Phase 11 | Third-party metrics via receiver | 4 new dashboards + alerting | End of Week 20 | Future Enhancement | +| Phase | Primary Metric | Secondary Metric | Deadline | Status | +| -------- | ------------------------------------------------------------------ | --------------------------- | -------------- | ------------------ | +| Phase 1 | SDK compiles and runs | Zero overhead when disabled | End of Week 2 | Active | +| Phase 2 | 100% RPC coverage | <1ms latency overhead | End of Week 4 | Active | +| Phase 3 | Cross-node traces work | <5% throughput impact | End of Week 6 | Active | +| Phase 4 | Consensus fully traced | No consensus timing impact | End of Week 8 | Active | +| Phase 5 | Production deployment | Operators trained | End of Week 9 | Active | +| Phase 6 | StatsD metrics in Prometheus | 3 dashboards operational | End of Week 10 
| Active | +| Phase 7 | All metrics via OTLP | No StatsD dependency | End of Week 12 | Active | +| Phase 8 | trace_id in logs + Loki | Tempo↔Loki correlation | End of Week 13 | Active | +| Phase 9 | 68+ new internal metrics in Prom | 2 new dashboards | End of Week 15 | Future Enhancement | +| Phase 10 | Full telemetry stack validated; OTel-sourced regression gate in CI | < 3% CPU overhead proven | End of Week 17 | Future Enhancement | +| Phase 11 | Third-party metrics via receiver | 4 new dashboards + alerting | End of Week 20 | Future Enhancement | --- diff --git a/OpenTelemetryPlan/Phase10_taskList.md b/OpenTelemetryPlan/Phase10_taskList.md index f93985964a..d9b0779505 100644 --- a/OpenTelemetryPlan/Phase10_taskList.md +++ b/OpenTelemetryPlan/Phase10_taskList.md @@ -229,14 +229,27 @@ Before Phases 1-9 can be considered production-ready, we need proof that: --- -## Exit Criteria +## Exit Criteria — Delivered in PR #6519 -- [ ] 5-node validator cluster starts and reaches consensus in docker-compose -- [ ] RPC load generator fires all traced RPC commands at configurable rates -- [ ] Transaction submitter generates 6+ transaction types at configurable TPS -- [ ] Validation suite confirms all 16 spans, 22 attributes, 300+ metrics are present -- [ ] Log-trace correlation validated end-to-end (Loki ↔ Tempo) -- [ ] All 10 Grafana dashboards render data (no empty panels) -- [ ] Benchmark shows < 3% CPU overhead, < 5MB memory overhead -- [ ] CI workflow runs validation on telemetry branch changes -- [ ] Validation report output is CI-parseable (JSON with exit codes) +- [x] Multi-node validator cluster starts and reaches consensus +- [x] RPC load generator fires all traced RPC commands at configurable rates +- [x] Transaction submitter generates 6+ transaction types at configurable TPS +- [x] Validation suite confirms all required spans, attributes, and metrics +- [x] Log-trace correlation validated end-to-end (Loki ↔ Tempo) +- [x] Grafana dashboards render data (no empty 
panels) +- [x] Overhead benchmark (`benchmark.sh`) measures telemetry-off vs telemetry-on deltas +- [x] CI workflow runs validation on telemetry branch changes +- [x] Validation report output is CI-parseable (JSON with exit codes) +- [x] OTel-driven regression gate captures per-span/per-RPC/per-job timings from + Prometheus and compares against a committed baseline + +## Follow-up Work (tracked in separate PRs) + +- [ ] FU-2: Automate baseline persistence across CI runs (artifact uploaded + on merge to `develop`, downloaded on PR runs). Current mechanism + requires a manual baseline-refresh PR. +- [ ] FU-4: Replace the proxy measurements in `benchmark.sh` (wall-clock curl + p99, ledger-cadence-as-TPS, ledger-cadence-as-consensus-p95) with + PromQL quantile queries from the same pipeline the regression gate uses. +- [ ] FU-6: Grafana dashboard plotting historical baseline values keyed by + commit SHA, for triaging noisy regressions. diff --git a/docker/telemetry/workload/README.md b/docker/telemetry/workload/README.md index a827834310..f1aa1d2720 100644 --- a/docker/telemetry/workload/README.md +++ b/docker/telemetry/workload/README.md @@ -213,6 +213,49 @@ python3 validate_telemetry.py --report /tmp/report.json python3 validate_telemetry.py --skip-loki --report /tmp/report.json ``` +### OTel Timings Regression Gate + +`capture_timings.py` + `compare_to_baseline.py` implement a regression gate +that compares OTel-derived per-span/per-RPC/per-job timings against a +committed baseline. Unlike `benchmark.sh` (which measures the overhead of +enabling telemetry on the current binary), this gate catches **xrpld +performance regressions over time** by diffing against a stored baseline +from a prior run. + +How it runs inside the validation pipeline: + +1. `run-full-validation.sh` executes the normal workload and validation suite. +2. After validation, `capture_timings.py` queries Prometheus for every + metric in `regression-metrics.json` and writes `reports/timings.json`. 
+3. `compare_to_baseline.py` reads `timings.json`, + `baselines/baseline-timings.json`, and `regression-thresholds.json`, + then either: + - Prints the paste-me JSON block (when the baseline is a placeholder + or empty) and exits 0. + - Prints a delta table, writes `reports/regression-report.json`, and + exits non-zero if any metric breached both the percentage AND + absolute bound. + +Bootstrapping a baseline: + +1. Push the branch. The `Telemetry Validation` CI run prints the full + timings JSON under "Paste into `baselines/baseline-timings.json`" in + the workflow Step Summary. +2. Open a PR copying that JSON block verbatim into + `baselines/baseline-timings.json`. Reviewer approval is the audit gate. +3. Subsequent runs compare against it; the gate fails on regression. + +Per-run tuning: + +- `--skip-regression` disables the gate (local exploration only). +- `REGRESSION_WINDOW` env var overrides the default Prometheus `rate()` + window (`3m`). Keep close to the workload duration. +- Metric surface lives in `regression-metrics.json`; thresholds in + `regression-thresholds.json`; both are reviewed changes. + +See [`baselines/README.md`](./baselines/README.md) for the baseline +lifecycle and refresh process. 
+ ### benchmark.sh Compares baseline (no telemetry) vs telemetry-enabled performance: @@ -273,13 +316,16 @@ The validation runs as a GitHub Actions workflow (`.github/workflows/telemetry-v ## Configuration Files -| File | Purpose | -| ------------------------ | ------------------------------------------------------------- | -| `workload-profiles.json` | Named load profiles with phase definitions | -| `expected_spans.json` | Span inventory (names, attributes, hierarchies, config flags) | -| `expected_metrics.json` | Metric inventory — every listed metric must be present | -| `test_accounts.json` | Test account roles (keys generated at runtime) | -| `requirements.txt` | Python dependencies | +| File | Purpose | +| --------------------------------- | ------------------------------------------------------------- | +| `workload-profiles.json` | Named load profiles with phase definitions | +| `expected_spans.json` | Span inventory (names, attributes, hierarchies, config flags) | +| `expected_metrics.json` | Metric inventory — every listed metric must be present | +| `test_accounts.json` | Test account roles (keys generated at runtime) | +| `regression-metrics.json` | Metric surface for the OTel regression gate | +| `regression-thresholds.json` | Per-metric regression bounds (pct AND abs) | +| `baselines/baseline-timings.json` | Committed baseline — populated from first CI run | +| `requirements.txt` | Python dependencies | ### expected_metrics.json Format diff --git a/docker/telemetry/workload/baselines/README.md b/docker/telemetry/workload/baselines/README.md new file mode 100644 index 0000000000..4f12646449 --- /dev/null +++ b/docker/telemetry/workload/baselines/README.md @@ -0,0 +1,67 @@ +# Performance Baselines + +This directory holds the committed baseline file used by the OTel-driven regression gate. 
+
+## How the gate works
+
+After the validation suite runs, `capture_timings.py` queries Prometheus for the timings
+declared in [`../regression-metrics.json`](../regression-metrics.json) and writes a
+`timings.json`. Then `compare_to_baseline.py` reads [`baseline-timings.json`](./baseline-timings.json),
+[`../regression-thresholds.json`](../regression-thresholds.json), and the captured
+`timings.json`. The comparator picks one of two modes automatically:
+
+- **Placeholder baseline** (`"placeholder": true` or empty `metrics`): the comparator
+  prints the captured timings JSON in exactly the format expected for this file, then
+  exits 0 without gating. This is how we bootstrap the baseline.
+- **Populated baseline**: the comparator diffs per-metric, enforces the thresholds
+  (regression = current exceeds baseline on BOTH the percentage AND absolute bound),
+  and exits non-zero on any regression.
+
+The regression gate runs against whatever workload profile `run-full-validation.sh`
+was invoked with. Capture and comparison are profile-agnostic — they only read
+Prometheus — so all existing profiles (`full-validation`, `quick-smoke`, `stress`)
+continue to work unchanged.
+
+## Bootstrapping the baseline
+
+1. Run CI on a branch whose committed baseline is still the `"placeholder": true`
+   file. The telemetry-validation workflow skips the gate and prints the captured
+   timings to the Step Summary under the heading "Paste into `baselines/baseline-timings.json`".
+2. Open a new PR. Copy the full JSON block from the Step Summary (or download the
+   `timings.json` artifact) into this file, replacing the placeholder contents. The
+   JSON is emitted in the exact byte-for-byte format this file expects — sorted keys,
+   2-space indent, trailing newline.
+3. The committed baseline PR needs reviewer approval just like any other code change.
+   This is the primary audit point for "who moved the performance bar."
+
+## Refreshing the baseline
+
+Refresh when a legitimate performance change lands on `develop` (for example, a
+deliberate rewrite that changes a span's structure). The process mirrors
+bootstrapping: run CI with the current baseline, inspect the delta, and if the
+new numbers should become the norm, open a PR replacing `baseline-timings.json`
+with that run's `timings.json` artifact. The reviewer decides whether it is acceptable.
+
+Do **not** edit `baseline-timings.json` by hand outside of this process — every entry
+should trace back to a real CI run so variance characteristics are preserved.
+
+## Schema
+
+```json
+{
+  "schema_version": 1,
+  "captured_at": "2026-04-24T17:30:00Z",
+  "window": "3m",
+  "git_sha": "",
+  "profile": "",
+  "metrics": {
+    "span.tx.process.p99": { "value": 12.4, "unit": "ms" },
+    "rpc.server_info.p95": { "value": 850.0, "unit": "us" },
+    "job.transaction.queued.p95": { "value": 1500.0, "unit": "us" }
+  }
+}
+```
+
+Metrics missing from a captured run (value `null`) do not count as regressions — they
+are counted in `regression-report.json` under `summary.missing_in_current`.
+This keeps the gate robust when a profile doesn't exercise every span on every run.
diff --git a/docker/telemetry/workload/baselines/baseline-timings.json b/docker/telemetry/workload/baselines/baseline-timings.json
new file mode 100644
index 0000000000..9fe3c1f6ad
--- /dev/null
+++ b/docker/telemetry/workload/baselines/baseline-timings.json
@@ -0,0 +1,10 @@
+{
+  "_comment": "Empty baseline placeholder. The first CI run of the regression gate will emit the captured timings JSON to the workflow step summary; copy that JSON over this file (in a PR) to activate the regression gate.
See baselines/README.md.",
+  "placeholder": true,
+  "schema_version": 1,
+  "captured_at": null,
+  "window": null,
+  "git_sha": null,
+  "profile": null,
+  "metrics": {}
+}
diff --git a/docker/telemetry/workload/capture_timings.py b/docker/telemetry/workload/capture_timings.py
new file mode 100644
index 0000000000..9720fe92fd
--- /dev/null
+++ b/docker/telemetry/workload/capture_timings.py
@@ -0,0 +1,188 @@
+#!/usr/bin/env python3
+"""Capture OTel-derived timings from Prometheus for the regression gate.
+
+Queries Prometheus for every metric declared in ``regression-metrics.json``
+and writes the results to a JSON file in the exact schema
+``baseline-timings.json`` expects. When a user wants to refresh the
+baseline, they copy a CI run's ``timings.json`` artifact (or the block
+printed to the workflow step summary) into
+``baselines/baseline-timings.json`` in a reviewable PR.
+
+Output schema (stable — ``compare_to_baseline.py`` reads it verbatim)::
+
+    {
+      "schema_version": 1,
+      "captured_at": "2026-04-24T17:30:00Z",
+      "window": "3m",
+      "git_sha": "",
+      "profile": "regression",
+      "metrics": {
+        "span.tx.process.p99": {"value": 12.4, "unit": "ms"},
+        "rpc.server_info.p95": {"value": 850.0, "unit": "us"},
+        ...
+      }
+    }
+
+Usage::
+
+    python3 capture_timings.py \\
+        --prometheus http://localhost:9090 \\
+        --metrics regression-metrics.json \\
+        --output /tmp/timings.json \\
+        --window 3m \\
+        --profile regression
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import logging
+import os
+import subprocess
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+
+import aiohttp
+
+from prom_queries import build_query_plan, run_query_plan
+
+logger = logging.getLogger("capture_timings")
+
+SCHEMA_VERSION = 1
+
+
+async def capture(
+    prom_url: str,
+    metrics_path: Path,
+    window: str,
+    profile: str,
+) -> dict:
+    """Build and execute the query plan, return the full report dict.
+
+    Args:
+        prom_url: Prometheus base URL passed to ``run_query_plan``.
+        metrics_path: Metric catalogue passed to ``build_query_plan``.
+        window: Rate window passed to ``build_query_plan`` and echoed back.
+        profile: Workload profile name; only echoed in the report.
+
+    Returns:
+        The report dict: ``schema_version``, ``captured_at`` (UTC, to the
+        second), ``window``, ``git_sha``, ``profile`` and ``metrics`` (the
+        query results, ordered by key).
+    """
+    plan = build_query_plan(metrics_path, window=window)
+    logger.info("Capturing %d metrics from %s (window=%s)", len(plan), prom_url, window)
+
+    async with aiohttp.ClientSession() as session:
+        metrics = await run_query_plan(session, prom_url, plan)
+
+    return {
+        "schema_version": SCHEMA_VERSION,
+        "captured_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
+        "window": window,
+        "git_sha": _detect_git_sha(),
+        "profile": profile,
+        "metrics": dict(sorted(metrics.items())),
+    }
+
+
+def _detect_git_sha() -> str:
+    """Return the current commit SHA from env or git, else ``"unknown"``.
+
+    Prefers ``GITHUB_SHA`` (set in Actions), falls back to ``git rev-parse``.
+    Silent fallback is fine here — a missing SHA only affects the captured
+    metadata, not the comparison logic.
+    """
+    env_sha = os.environ.get("GITHUB_SHA")
+    if env_sha:
+        return env_sha
+    try:
+        result = subprocess.run(
+            ["git", "rev-parse", "HEAD"],
+            capture_output=True,
+            text=True,
+            timeout=5,
+            check=False,
+        )
+        if result.returncode == 0:
+            return result.stdout.strip()
+    except (OSError, subprocess.SubprocessError):
+        pass
+    return "unknown"
+
+
+def main() -> int:
+    """Parse CLI arguments, capture timings and write them to ``--output``.
+
+    The output's parent directory is created if needed; the JSON is written
+    with sorted keys, a 2-space indent and a trailing newline.
+
+    Returns:
+        0 on completion. No exception is caught here, so a failed capture
+        or write ends the process with a traceback instead.
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--prometheus",
+        default="http://localhost:9090",
+        help="Prometheus base URL (default: http://localhost:9090)",
+    )
+    parser.add_argument(
+        "--metrics",
+        type=Path,
+        default=Path(__file__).parent / "regression-metrics.json",
+        help="Path to regression-metrics.json",
+    )
+    parser.add_argument(
+        "--output",
+        type=Path,
+        required=True,
+        help="Where to write the captured timings JSON",
+    )
+    parser.add_argument(
+        "--window",
+        default="3m",
+        help="Prometheus rate() window (default: 3m)",
+    )
+    parser.add_argument(
+        "--profile",
+        default="regression",
+        help="Workload profile used during capture (metadata only)",
+    )
+    parser.add_argument(
+        "--verbose",
+        action="store_true",
+        help="Enable debug logging",
+    )
+    args = parser.parse_args()
+
+    logging.basicConfig(
+        level=logging.DEBUG if args.verbose else logging.INFO,
+        format="%(levelname)s %(name)s: %(message)s",
+    )
+
+    report = asyncio.run(
+        capture(
+            prom_url=args.prometheus,
+            metrics_path=args.metrics,
+            window=args.window,
+            profile=args.profile,
+        )
+    )
+
+    args.output.parent.mkdir(parents=True, exist_ok=True)
+    with open(args.output, "w") as f:
+        json.dump(report, f, indent=2, sort_keys=True)
+        f.write("\n")
+
+    captured = sum(1 for v in report["metrics"].values() if v["value"] is not None)
+    total = len(report["metrics"])
+    logger.info("Wrote %s (%d/%d metrics captured)", args.output, captured, total)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/docker/telemetry/workload/compare_to_baseline.py
b/docker/telemetry/workload/compare_to_baseline.py
new file mode 100644
index 0000000000..820c93e5af
--- /dev/null
+++ b/docker/telemetry/workload/compare_to_baseline.py
@@ -0,0 +1,404 @@
+#!/usr/bin/env python3
+"""Compare captured OTel timings against a committed baseline.
+
+Operating modes (chosen automatically based on the baseline file contents):
+
+1. **No baseline** — if ``baseline-timings.json`` has an empty
+   ``metrics`` object (or is marked with ``"placeholder": true``), this
+   script is in "populate" mode. It prints the captured timings JSON in
+   the exact format expected for pasting into
+   ``baselines/baseline-timings.json``, then exits 0. No regression check.
+
+2. **Populated baseline** — per-metric percentage AND absolute deltas are
+   computed against thresholds from ``regression-thresholds.json``. A
+   regression occurs when BOTH bounds are breached for the same quantile.
+   Prints a human-readable summary; writes a JSON report if ``--report`` is set.
+   Exits 1 if any regression was detected, else 0.
+
+Inputs:
+  --timings     Captured timings JSON (from capture_timings.py)
+  --baseline    Committed baseline JSON
+  --thresholds  Threshold policy JSON
+  --report      Where to write regression-report.json (optional)
+
+Exit codes:
+  0 — No baseline (paste-me emitted), OR baseline populated and no regression
+  1 — Regression detected (at least one metric breached both bounds)
+  2 — Input error (an input file is unreadable or is not valid JSON)
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+import sys
+from dataclasses import dataclass, asdict
+from pathlib import Path
+from typing import Any
+
+logger = logging.getLogger("compare_to_baseline")
+
+
+@dataclass
+class MetricDelta:
+    """Single metric's baseline-vs-current comparison outcome.
+
+    Attributes:
+        key: Flat metric key (e.g. span.tx.process.p99).
+        baseline: Baseline value (may be None if unpopulated).
+        current: Current run value (may be None if not captured).
+        delta: current - baseline (None if either side None).
+        pct_change: 100 * delta / baseline (None if baseline ≤ 0).
+        unit: Unit from baseline (preserved as-is).
+        threshold_pct: Resolved per-metric pct threshold.
+        threshold_abs: Resolved per-metric absolute threshold.
+        regressed: True iff both bounds breached (see compute_delta).
+        note: Human-readable classification ("REGRESSION" when regressed).
+    """
+
+    key: str
+    baseline: float | None
+    current: float | None
+    delta: float | None
+    pct_change: float | None
+    unit: str
+    threshold_pct: float | None
+    threshold_abs: float | None
+    regressed: bool
+    note: str
+
+
+def load_json(path: Path) -> dict:
+    with open(path) as f:
+        return json.load(f)
+
+
+def is_placeholder(baseline: dict) -> bool:
+    """A baseline is a placeholder if explicitly marked OR metrics are empty."""
+    if baseline.get("placeholder") is True:
+        return True
+    return not baseline.get("metrics")
+
+
+def print_paste_me(timings: dict) -> None:
+    """Print captured timings in the exact baseline-timings.json format.
+
+    The JSON goes to stdout and the banner lines to stderr, so stdout alone
+    is the file contents — sorted keys, 2-space indent, trailing newline.
+    """
+    banner = "=" * 72
+    print(banner, file=sys.stderr)
+    print(
+        " NO BASELINE FOUND — paste the JSON below into",
+        file=sys.stderr,
+    )
+    print(
+        " docker/telemetry/workload/baselines/baseline-timings.json",
+        file=sys.stderr,
+    )
+    print(banner, file=sys.stderr)
+
+    print(json.dumps(timings, indent=2, sort_keys=True))
+
+    print(banner, file=sys.stderr)
+    print(
+        " (End of paste-me JSON. Gate did NOT run — baseline is empty.)",
+        file=sys.stderr,
+    )
+    print(banner, file=sys.stderr)
+
+
+def resolve_thresholds(
+    key: str,
+    thresholds: dict,
+) -> tuple[float | None, float | None]:
+    """Return ``(pct_threshold, abs_threshold)`` for a metric key.
+
+    Per-metric overrides (keyed by the metric key minus its quantile) win
+    over category defaults. A bound of ``0`` is a real bound, not "unset".
+    A ``None`` component is an unconfigured bound; such metrics never gate.
+    """
+    parts = key.split(".")
+    if len(parts) < 3:
+        return (None, None)
+    category_key = parts[0]
+    quantile_key = parts[-1]
+
+    category_map = {
+        "span": "span",
+        "rpc": "rpc_method",
+        "job": "job_queue",
+    }
+    cat = category_map.get(category_key)
+    if cat is None:
+        return (None, None)
+
+    override_key = ".".join(parts[:-1])
+    overrides = thresholds.get("overrides", {})
+    defaults = thresholds.get("defaults", {}).get(cat, {})
+
+    rule = overrides.get(override_key, {}).get(quantile_key)
+    if rule is None:
+        rule = defaults.get(quantile_key)
+    if rule is None:
+        return (None, None)
+
+    pct = rule.get("max_pct_increase")
+    abs_bound = rule.get("max_abs_increase_ms")
+    abs_bound = rule.get("max_abs_increase_us") if abs_bound is None else abs_bound
+    return (pct, abs_bound)
+
+
+def compute_delta(
+    key: str,
+    baseline_entry: dict | None,
+    current_entry: dict | None,
+    thresholds: dict,
+) -> MetricDelta:
+    """Compute a MetricDelta for one metric key.
+
+    A regression requires BOTH bounds to be breached, which tolerates
+    small-value noise: 0.5 ms -> 1.0 ms is +100% yet under a 5 ms bound.
+    With a baseline <= 0 the pct bound is undefined, so abs alone decides.
+    """
+    baseline = baseline_entry.get("value") if baseline_entry else None
+    current = current_entry.get("value") if current_entry else None
+    unit = (baseline_entry or current_entry or {}).get("unit", "")
+
+    pct_threshold, abs_threshold = resolve_thresholds(key, thresholds)
+
+    if baseline is None and current is None:
+        return MetricDelta(
+            key=key,
+            baseline=None,
+            current=None,
+            delta=None,
+            pct_change=None,
+            unit=unit,
+            threshold_pct=pct_threshold,
+            threshold_abs=abs_threshold,
+            regressed=False,
+            note="no data (neither baseline nor current)",
+        )
+
+    if baseline is None:
+        return MetricDelta(
+            key=key,
+            baseline=None,
+            current=current,
+            delta=None,
+            pct_change=None,
+            unit=unit,
+            threshold_pct=pct_threshold,
+            threshold_abs=abs_threshold,
+            regressed=False,
+            note="new metric (not in baseline)",
+        )
+
+    if current is None:
+        return MetricDelta(
+            key=key,
+            baseline=baseline,
+            current=None,
+            delta=None,
+            pct_change=None,
+            unit=unit,
+            threshold_pct=pct_threshold,
+            threshold_abs=abs_threshold,
+            regressed=False,
+            note="not captured in current run",
+        )
+
+    delta = current - baseline
+    pct_change = (delta / baseline * 100.0) if baseline > 0 else None
+
+    if pct_threshold is None or abs_threshold is None:
+        return MetricDelta(
+            key=key,
+            baseline=baseline,
+            current=current,
+            delta=delta,
+            pct_change=pct_change,
+            unit=unit,
+            threshold_pct=pct_threshold,
+            threshold_abs=abs_threshold,
+            regressed=False,
+            note="no threshold configured",
+        )
+
+    pct_breach = pct_change is None or pct_change > pct_threshold
+    abs_breach = delta > abs_threshold
+    regressed = pct_breach and abs_breach
+
+    if regressed:
+        note = "REGRESSION"
+    elif delta < 0:
+        note = "improved"
+    else:
+        note = "within bounds"
+
+    return MetricDelta(
+        key=key,
+        baseline=baseline,
+        current=current,
+        delta=delta,
+        pct_change=pct_change,
+        unit=unit,
+        threshold_pct=pct_threshold,
+        threshold_abs=abs_threshold,
+        regressed=regressed,
+        note=note,
+    )
+
+
+def
print_summary(deltas: list[MetricDelta]) -> None: + """Print a sorted, human-readable table of per-metric results.""" + regressions = [d for d in deltas if d.regressed] + improvements = [ + d + for d in deltas + if d.delta is not None and d.delta < 0 and d.baseline not in (None, 0) + ] + improvements.sort(key=lambda d: d.pct_change or 0) + regressions.sort(key=lambda d: -(d.pct_change or 0)) + + print("=" * 72) + print(f" Regression check: {len(regressions)} regression(s) detected") + print("=" * 72) + + if regressions: + print("\nRegressions (breached BOTH pct AND absolute bounds):") + _print_table(regressions) + + if improvements: + top = improvements[:5] + print("\nTop improvements:") + _print_table(top) + + missing = [d for d in deltas if d.note == "not captured in current run"] + if missing: + print(f"\n{len(missing)} baseline metric(s) not captured in current run:") + for d in missing: + print(f" {d.key}") + + +def _print_table(rows: list[MetricDelta]) -> None: + """Print a fixed-width table for a list of deltas.""" + header = f" {'METRIC':<45} {'BASE':>10} {'CUR':>10} {'Δ':>10} {'%':>8} UNIT" + print(header) + print(" " + "-" * (len(header) - 2)) + for d in rows: + base = f"{d.baseline:.2f}" if d.baseline is not None else "-" + cur = f"{d.current:.2f}" if d.current is not None else "-" + delta = f"{d.delta:+.2f}" if d.delta is not None else "-" + pct = f"{d.pct_change:+.1f}%" if d.pct_change is not None else "-" + print(f" {d.key:<45} {base:>10} {cur:>10} {delta:>10} {pct:>8} {d.unit}") + + +def write_report( + deltas: list[MetricDelta], + report_path: Path, + baseline: dict, + timings: dict, +) -> None: + """Write regression-report.json — machine-readable artifact for CI.""" + regressions = [d for d in deltas if d.regressed] + payload = { + "schema_version": 1, + "baseline_captured_at": baseline.get("captured_at"), + "baseline_git_sha": baseline.get("git_sha"), + "current_captured_at": timings.get("captured_at"), + "current_git_sha": timings.get("git_sha"), 
+ "window": timings.get("window"), + "profile": timings.get("profile"), + "summary": { + "total": len(deltas), + "regressions": len(regressions), + "improvements": sum( + 1 + for d in deltas + if d.delta is not None and d.delta < 0 and d.baseline not in (None, 0) + ), + "missing_in_current": sum( + 1 for d in deltas if d.note == "not captured in current run" + ), + }, + "metrics": [asdict(d) for d in deltas], + } + report_path.parent.mkdir(parents=True, exist_ok=True) + with open(report_path, "w") as f: + json.dump(payload, f, indent=2, sort_keys=True) + f.write("\n") + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--timings", + type=Path, + required=True, + help="Captured timings JSON (from capture_timings.py)", + ) + parser.add_argument( + "--baseline", + type=Path, + required=True, + help="Committed baseline-timings.json", + ) + parser.add_argument( + "--thresholds", + type=Path, + default=Path(__file__).parent / "regression-thresholds.json", + help="Threshold policy JSON", + ) + parser.add_argument( + "--report", + type=Path, + default=None, + help="Where to write regression-report.json (optional)", + ) + args = parser.parse_args() + + logging.basicConfig( + level=logging.INFO, + format="%(levelname)s %(name)s: %(message)s", + ) + + try: + timings = load_json(args.timings) + baseline = load_json(args.baseline) + thresholds = load_json(args.thresholds) + except (OSError, json.JSONDecodeError) as exc: + logger.error("failed to load inputs: %s", exc) + return 2 + + if is_placeholder(baseline): + print_paste_me(timings) + return 0 + + baseline_metrics = baseline.get("metrics", {}) + current_metrics = timings.get("metrics", {}) + + all_keys = sorted(set(baseline_metrics) | set(current_metrics)) + deltas = [ + compute_delta( + key, + baseline_metrics.get(key), + current_metrics.get(key), + thresholds, + ) + for key in all_keys + ] + + print_summary(deltas) + + if args.report: + write_report(deltas, 
@dataclass(frozen=True)
class QueryEntry:
    """A single Prometheus capture request.

    Instances are immutable (frozen dataclass). Fields, in positional
    order, are ``key``, ``promql`` and ``unit``.
    """

    # Flat output key under which the value is stored,
    # e.g. ``span.tx.process.p99``.
    key: str
    # PromQL expression sent to /api/v1/query.
    promql: str
    # Unit of the returned value, e.g. ``ms`` or ``us``; kept alongside the
    # value so a consumer can sanity-check unit drift.
    unit: str
def _expand_template(
    stem: str,
    name: str,
    quantiles: list[float],
    template: str,
    unit: str,
    window: str,
) -> list[QueryEntry]:
    """Expand one PromQL template into one ``QueryEntry`` per quantile.

    Returns an empty list when ``template`` is empty, so callers never emit
    an entry with an empty PromQL expression.
    """
    if not template:
        return []
    entries: list[QueryEntry] = []
    for q in quantiles:
        # Plain str.replace rather than str.format: the templates contain
        # literal PromQL braces (label matchers) that format() would
        # misread as replacement fields.
        promql = (
            template.replace("{quantile}", _format_quantile(q))
            .replace("{name}", name)
            .replace("{window}", window)
        )
        entries.append(
            QueryEntry(
                key=f"{stem}.p{_quantile_label(q)}",
                promql=promql,
                unit=unit,
            )
        )
    return entries


def build_query_plan(metrics_path: str | Path, window: str = "3m") -> list[QueryEntry]:
    """Translate regression-metrics.json into a list of PromQL queries.

    Three sections of the config are read: ``spans`` (keys ``span.*``,
    default unit ``ms``), ``rpc_methods`` (keys ``rpc.*``, default unit
    ``us``) and ``job_queue`` (keys ``job.{name}.{phase}.*``, default unit
    ``us``). A section or phase whose query template is missing or empty
    contributes no entries.

    Args:
        metrics_path: Path to ``regression-metrics.json``.
        window: Rate window substituted for ``{window}`` in each template.
            For short CI runs keep this close to the test duration so the
            bucket counts are meaningful.

    Returns:
        A list of ``QueryEntry`` values, one per (metric × quantile), in
        config order: spans, then RPC methods, then job-queue phases.
    """
    # JSON is UTF-8; do not depend on the locale's default encoding.
    with open(metrics_path, encoding="utf-8") as f:
        cfg = json.load(f)

    plan: list[QueryEntry] = []

    # Spans and RPC methods share the same shape: one template per section.
    for section, prefix, default_unit in (
        ("spans", "span", "ms"),
        ("rpc_methods", "rpc", "us"),
    ):
        section_cfg = cfg.get(section, {})
        template = section_cfg.get("_query_template", "")
        names = section_cfg.get("names", [])
        if names and not template:
            logger.warning(
                "section %r has names but no _query_template; skipping", section
            )
            continue
        unit = section_cfg.get("_unit", default_unit)
        quantiles = section_cfg.get("_quantiles", [])
        for name in names:
            plan.extend(
                _expand_template(
                    f"{prefix}.{name}", name, quantiles, template, unit, window
                )
            )

    # The job queue has one template per phase (queued / running).
    job_cfg = cfg.get("job_queue", {})
    unit = job_cfg.get("_unit", "us")
    quantiles = job_cfg.get("_quantiles", [])
    phases = job_cfg.get("_phases", ["queued", "running"])
    phase_templates = {
        "queued": job_cfg.get("_queued_template", ""),
        "running": job_cfg.get("_running_template", ""),
    }
    for name in job_cfg.get("names", []):
        for phase in phases:
            plan.extend(
                _expand_template(
                    f"job.{name}.{phase}",
                    name,
                    quantiles,
                    phase_templates.get(phase, ""),
                    unit,
                    window,
                )
            )

    return plan
async def run_query_plan(
    session: aiohttp.ClientSession,
    prom_url: str,
    plan: list[QueryEntry],
) -> dict[str, dict[str, Any]]:
    """Execute a query plan and return a flat ``key → {value, unit}`` map.

    Queries are issued one at a time, in plan order. A query that yields no
    usable value is still present in the output with ``value`` set to
    ``None``, so the set of keys depends only on the plan.

    Args:
        session: Shared aiohttp session.
        prom_url: Base URL of Prometheus (e.g. ``http://localhost:9090``).
        plan: Output of :func:`build_query_plan`.

    Returns:
        Mapping from metric key to ``{"value": float|None, "unit": str}``.
    """
    results: dict[str, dict[str, Any]] = {}
    for entry in plan:
        value = await _instant_query(session, prom_url, entry.promql)
        results[entry.key] = {"value": value, "unit": entry.unit}
    return results


async def _instant_query(
    session: aiohttp.ClientSession,
    prom_url: str,
    promql: str,
) -> float | None:
    """POST an instant query to Prometheus; return the scalar value or None.

    None is returned for non-finite values, empty or unexpected result
    shapes, non-200 responses, transport errors, timeouts and undecodable
    response bodies.
    """
    # Function-scope import: this block does not rely on the module header.
    import asyncio

    url = f"{prom_url.rstrip('/')}/api/v1/query"
    try:
        async with session.post(
            url,
            data={"query": promql},
            # Explicit ClientTimeout instead of a bare number.
            timeout=aiohttp.ClientTimeout(total=30),
        ) as resp:
            if resp.status != 200:
                logger.warning("query HTTP %d: %s", resp.status, promql)
                return None
            body = await resp.json()
    except (
        aiohttp.ClientError,
        # Distinct from the builtin TimeoutError before Python 3.11.
        asyncio.TimeoutError,
        TimeoutError,
        # json.JSONDecodeError from a body that is not valid JSON.
        ValueError,
    ) as exc:
        logger.warning("query failed: %s — %s", promql, exc)
        return None

    return _scalar_from_response(body, promql)


def _scalar_from_response(body: Any, promql: str) -> float | None:
    """Extract the first sample's value from a decoded query response.

    Expects ``body["data"]["result"]`` to be a list whose first element is
    a mapping with a ``"value"`` pair ``[timestamp, "<number>"]``. Anything
    else, and any NaN/infinite value, yields None.
    """
    import math

    if not isinstance(body, dict):
        logger.warning("query returned non-object body: %s", promql)
        return None
    if body.get("status") != "success":
        logger.warning("query status=%s: %s", body.get("status"), promql)
        return None

    data = body.get("data") or {}
    result = data.get("result") if isinstance(data, dict) else None
    if not result:
        return None

    first = result[0]
    sample = first.get("value") if isinstance(first, dict) else None
    if not isinstance(sample, (list, tuple)) or len(sample) < 2:
        return None

    try:
        value = float(sample[1])
    except (TypeError, ValueError):
        return None
    # float() parses "NaN", "+Inf" and "-Inf"; none of them is a usable timing.
    return value if math.isfinite(value) else None
def _format_quantile(q: float) -> str:
    """Format a quantile for PromQL (``0.99`` → ``"0.99"``).

    Uses the ``g`` presentation type, so trailing zeros are dropped
    (``0.50`` → ``"0.5"``).
    """
    return f"{q:g}"


def _quantile_label(q: float) -> str:
    """Format a quantile for the output key (``0.95`` → ``"95"``).

    Whole-percent quantiles give the plain percentile number. Fractional
    percentiles keep their digits with the decimal point removed
    (``0.999`` → ``"999"``), so they are not rounded onto a neighbouring
    whole percentile and the label contains no extra ``.`` separator.
    """
    # round(..., 6) removes binary float noise such as
    # 0.95 * 100 == 94.99999999999999 before formatting.
    percent = f"{round(q * 100, 6):g}"
    return percent.replace(".", "")
span.tx.process.p99, rpc.server_info.p95, job.transaction.queued.p95)", + "spans": { + "_query_template": "histogram_quantile({quantile}, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"{name}\"}[{window}])))", + "_unit": "ms", + "_quantiles": [0.5, 0.95, 0.99], + "names": [ + "rpc.request", + "rpc.process", + "tx.process", + "tx.apply", + "ledger.build", + "ledger.validate", + "ledger.store", + "consensus.ledger_close", + "consensus.accept" + ] + }, + "rpc_methods": { + "_query_template": "histogram_quantile({quantile}, sum by (le) (rate(rippled_rpc_method_duration_us_bucket{method=\"{name}\"}[{window}])))", + "_unit": "us", + "_quantiles": [0.95, 0.99], + "names": ["server_info", "account_info", "ledger", "fee", "tx"] + }, + "job_queue": { + "_queued_template": "histogram_quantile({quantile}, sum by (le) (rate(rippled_job_queued_duration_us_bucket{job_type=\"{name}\"}[{window}])))", + "_running_template": "histogram_quantile({quantile}, sum by (le) (rate(rippled_job_running_duration_us_bucket{job_type=\"{name}\"}[{window}])))", + "_unit": "us", + "_quantiles": [0.95], + "_phases": ["queued", "running"], + "names": ["transaction", "acceptLedger"] + } +} diff --git a/docker/telemetry/workload/regression-thresholds.json b/docker/telemetry/workload/regression-thresholds.json new file mode 100644 index 0000000000..176fd87669 --- /dev/null +++ b/docker/telemetry/workload/regression-thresholds.json @@ -0,0 +1,29 @@ +{ + "_description": "Per-metric regression thresholds. A metric regresses when current - baseline exceeds BOTH the percentage and absolute bounds (AND, not OR — this tolerates small-value noise). 
Defaults apply unless a per-metric override exists.", + "defaults": { + "span": { + "p50": { "max_pct_increase": 15.0, "max_abs_increase_ms": 2.0 }, + "p95": { "max_pct_increase": 10.0, "max_abs_increase_ms": 3.0 }, + "p99": { "max_pct_increase": 10.0, "max_abs_increase_ms": 5.0 } + }, + "rpc_method": { + "p95": { "max_pct_increase": 10.0, "max_abs_increase_us": 3000.0 }, + "p99": { "max_pct_increase": 10.0, "max_abs_increase_us": 5000.0 } + }, + "job_queue": { + "p95": { "max_pct_increase": 15.0, "max_abs_increase_us": 5000.0 } + } + }, + "overrides": { + "span.consensus.ledger_close": { + "p50": { "max_pct_increase": 5.0, "max_abs_increase_ms": 200.0 }, + "p95": { "max_pct_increase": 5.0, "max_abs_increase_ms": 500.0 }, + "p99": { "max_pct_increase": 5.0, "max_abs_increase_ms": 1000.0 } + }, + "span.consensus.accept": { + "p50": { "max_pct_increase": 5.0, "max_abs_increase_ms": 200.0 }, + "p95": { "max_pct_increase": 5.0, "max_abs_increase_ms": 500.0 }, + "p99": { "max_pct_increase": 5.0, "max_abs_increase_ms": 1000.0 } + } + } +} diff --git a/docker/telemetry/workload/run-full-validation.sh b/docker/telemetry/workload/run-full-validation.sh index dcb24064df..72a8fe8850 100755 --- a/docker/telemetry/workload/run-full-validation.sh +++ b/docker/telemetry/workload/run-full-validation.sh @@ -7,11 +7,13 @@ # 3. Wait for consensus # 4. Run workload orchestrator (RPC load, TX submission, propagation wait) # 5. Run the telemetry validation suite -# 6. (Optional) Run the performance benchmark +# 6. Capture OTel timings and compare against committed baseline +# 7. 
(Optional) Run the performance overhead benchmark # # Usage: # ./run-full-validation.sh --xrpld /path/to/xrpld # ./run-full-validation.sh --xrpld /path/to/xrpld --with-benchmark +# ./run-full-validation.sh --xrpld /path/to/xrpld --skip-regression # ./run-full-validation.sh --cleanup # # Exit codes: @@ -50,8 +52,16 @@ TX_TPS=5 TX_DURATION=120 WITH_BENCHMARK=false SKIP_LOKI=false +SKIP_REGRESSION=false WORKLOAD_PROFILE="full-validation" REPORT_DIR="$WORKDIR/reports" +# Rate window handed to Prometheus `rate()` when capturing timings. Keep +# this close to the active workload duration so histogram buckets cover +# the measurement window; longer windows dilute short-lived regressions. +REGRESSION_WINDOW="${REGRESSION_WINDOW:-3m}" +BASELINE_FILE="${BASELINE_FILE:-$SCRIPT_DIR/baselines/baseline-timings.json}" +THRESHOLDS_FILE="${THRESHOLDS_FILE:-$SCRIPT_DIR/regression-thresholds.json}" +METRICS_FILE="${METRICS_FILE:-$SCRIPT_DIR/regression-metrics.json}" GENESIS_ACCOUNT="rHb9CJAWyB4rj91VRWn96DkukG4bwdtyTh" GENESIS_SEED="snoPBrXtMeMyMHUVTgbuqAfg1SUTb" @@ -70,8 +80,9 @@ usage() { echo " --tx-tps TPS Transaction submit rate (default: 5)" echo " --tx-duration SECS Transaction submit duration (default: 120)" echo " --profile NAME Workload profile (default: full-validation)" - echo " --with-benchmark Also run performance benchmarks" + echo " --with-benchmark Also run performance overhead benchmark (telemetry off vs on)" echo " --skip-loki Skip Loki log-trace correlation checks" + echo " --skip-regression Skip the OTel-baseline regression gate" echo " --cleanup Tear down everything and exit" echo " -h, --help Show this help" exit 0 @@ -88,6 +99,7 @@ while [ $# -gt 0 ]; do --profile) WORKLOAD_PROFILE="$2"; shift 2 ;; --with-benchmark) WITH_BENCHMARK=true; shift ;; --skip-loki) SKIP_LOKI=true; shift ;; + --skip-regression) SKIP_REGRESSION=true; shift ;; --cleanup) # Cleanup mode log "Cleaning up..." 
pkill -f "$WORKDIR" 2>/dev/null || true @@ -350,10 +362,56 @@ else fi # --------------------------------------------------------------------------- -# Step 6: (Optional) Run benchmark +# Step 6: Capture OTel timings and run the regression comparison +# --------------------------------------------------------------------------- +# This step ALWAYS captures timings (so CI always has an artifact from which +# to bootstrap/refresh the committed baseline). The comparator then either: +# - prints the paste-me JSON when the baseline is a placeholder, or +# - enforces thresholds and fails the run on regression. +# Use --skip-regression to opt out (e.g. for ad-hoc local exploration). +TIMINGS_FILE="$REPORT_DIR/timings.json" +REGRESSION_REPORT="$REPORT_DIR/regression-report.json" +REGRESSION_EXIT=0 + +if [ "$SKIP_REGRESSION" != true ]; then + log "Step 6: Capturing OTel timings from Prometheus..." + if python3 "$SCRIPT_DIR/capture_timings.py" \ + --prometheus "http://localhost:9090" \ + --metrics "$METRICS_FILE" \ + --output "$TIMINGS_FILE" \ + --window "$REGRESSION_WINDOW" \ + --profile "$WORKLOAD_PROFILE" + then + ok "Timings captured: $TIMINGS_FILE" + else + fail "Failed to capture timings — skipping regression comparison." + SKIP_REGRESSION=true + fi +fi + +if [ "$SKIP_REGRESSION" != true ]; then + log "Comparing against baseline $BASELINE_FILE..." + python3 "$SCRIPT_DIR/compare_to_baseline.py" \ + --timings "$TIMINGS_FILE" \ + --baseline "$BASELINE_FILE" \ + --thresholds "$THRESHOLDS_FILE" \ + --report "$REGRESSION_REPORT" || REGRESSION_EXIT=$? + if [ "$REGRESSION_EXIT" -eq 0 ]; then + ok "Regression gate passed (or baseline placeholder — paste JSON printed above)." + elif [ "$REGRESSION_EXIT" -eq 1 ]; then + fail "Regression detected — see $REGRESSION_REPORT" + else + fail "Regression comparator internal error (exit $REGRESSION_EXIT)" + fi +else + warn "Regression gate skipped." 
# Fail the run if EITHER validation or the regression gate failed. When both
# failed, the validation exit code takes precedence; the regression gate's
# code is used only when validation succeeded.
# The numeric `[ ... -ne 0 ]` / `[ ... -eq 0 ]` tests require integer
# operands: REGRESSION_EXIT is always 0 or a captured `$?`; VALIDATION_EXIT
# is assigned earlier in the script — presumably also an exit code (verify).
FINAL_EXIT=0
if [ "$VALIDATION_EXIT" -ne 0 ]; then
    FINAL_EXIT="$VALIDATION_EXIT"
fi
if [ "$REGRESSION_EXIT" -ne 0 ] && [ "$FINAL_EXIT" -eq 0 ]; then
    FINAL_EXIT="$REGRESSION_EXIT"
fi
exit "$FINAL_EXIT"