diff --git a/.github/workflows/telemetry-validation.yml b/.github/workflows/telemetry-validation.yml
index 2e64261d5f..834da6fc4e 100644
--- a/.github/workflows/telemetry-validation.yml
+++ b/.github/workflows/telemetry-validation.yml
@@ -230,6 +230,58 @@ jobs:
             fi
           fi
 
+      # Publishes captured OTel timings + regression report to the Step Summary.
+      # Placeholder (or missing/unreadable) baseline: emits a fenced JSON block
+      # that can be copy-pasted directly into baselines/baseline-timings.json.
+      # Populated baseline: summarises the regressions so the PR author sees
+      # the failure reason without downloading artifacts.
+      - name: Print regression summary
+        if: always()
+        run: |
+          TIMINGS="/tmp/xrpld-validation/reports/timings.json"
+          REGRESSION="/tmp/xrpld-validation/reports/regression-report.json"
+          BASELINE="docker/telemetry/workload/baselines/baseline-timings.json"
+
+          if [ ! -f "$TIMINGS" ]; then
+            echo "## Regression Gate: no timings captured" >> "$GITHUB_STEP_SUMMARY"
+            exit 0
+          fi
+
+          # A missing/unparsable baseline makes jq fail; report as placeholder instead of aborting.
+          IS_PLACEHOLDER=$(jq -r '.placeholder == true or (.metrics | length == 0)' "$BASELINE" 2>/dev/null || echo true)
+
+          printf '## OTel Timings Regression Gate\n\n' >> "$GITHUB_STEP_SUMMARY"
+
+          if [ "$IS_PLACEHOLDER" = "true" ]; then
+            echo "### Paste into \`baselines/baseline-timings.json\`" >> "$GITHUB_STEP_SUMMARY"
+            echo "" >> "$GITHUB_STEP_SUMMARY"
+            echo "The committed baseline is a placeholder. Open a PR replacing" \
+                 "its contents with the JSON block below to activate the" \
+                 "regression gate." >> "$GITHUB_STEP_SUMMARY"
+            echo "" >> "$GITHUB_STEP_SUMMARY"
+            echo '```json' >> "$GITHUB_STEP_SUMMARY"
+            cat "$TIMINGS" >> "$GITHUB_STEP_SUMMARY"
+            echo '```' >> "$GITHUB_STEP_SUMMARY"
+          elif [ -f "$REGRESSION" ]; then
+            REGR_COUNT=$(jq -r '.summary.regressions // 0' "$REGRESSION")
+            IMPR_COUNT=$(jq -r '.summary.improvements // 0' "$REGRESSION")
+            TOTAL=$(jq -r '.summary.total // 0' "$REGRESSION")
+            echo "| Stat | Count |" >> "$GITHUB_STEP_SUMMARY"
+            echo "|------|-------|" >> "$GITHUB_STEP_SUMMARY"
+            echo "| Metrics compared | $TOTAL |" >> "$GITHUB_STEP_SUMMARY"
+            echo "| Regressions | $REGR_COUNT |" >> "$GITHUB_STEP_SUMMARY"
+            echo "| Improvements | $IMPR_COUNT |" >> "$GITHUB_STEP_SUMMARY"
+            echo "" >> "$GITHUB_STEP_SUMMARY"
+            if [ "$REGR_COUNT" -gt 0 ]; then
+              echo "### Regressions" >> "$GITHUB_STEP_SUMMARY"
+              echo "" >> "$GITHUB_STEP_SUMMARY"
+              echo "| Metric | Baseline | Current | Δ | % | Unit |" >> "$GITHUB_STEP_SUMMARY"
+              echo "|--------|---------:|--------:|--:|--:|------|" >> "$GITHUB_STEP_SUMMARY"
+              jq -r '.metrics[] | select(.regressed) | "| \(.key) | \(.baseline) | \(.current) | \(.delta) | \(.pct_change)% | \(.unit) |"' \
+                "$REGRESSION" >> "$GITHUB_STEP_SUMMARY"
+            fi
+          fi
+
       - name: Cleanup
         if: always()
         run: |
diff --git a/OpenTelemetryPlan/06-implementation-phases.md b/OpenTelemetryPlan/06-implementation-phases.md
index 6cc15beb14..5ad344251d 100644
--- a/OpenTelemetryPlan/06-implementation-phases.md
+++ b/OpenTelemetryPlan/06-implementation-phases.md
@@ -869,13 +869,15 @@ All 17 spans, 26 metrics, 10 dashboards, 14 attribute checks, 2 hierarchies, and
 
 **Not implemented or not available in CI**:
 
-1. Performance benchmark suite (Task 10.5) — not started
-2. `rpc.request` -> `rpc.process` parent-child hierarchy — skipped (cross-thread context propagation)
-3. Log-trace correlation validation (Loki) — not included in checks
-4. Full 255+ StatsD metric coverage — only 26 representative metrics validated
-5. Sustained load / backpressure testing — not implemented
-6. `docs/telemetry-runbook.md` updates — not done
-7. `09-data-collection-reference.md` "Validation" section — not done
+1. `rpc.request` -> `rpc.process` parent-child hierarchy — skipped (cross-thread context propagation)
+2. Log-trace correlation validation (Loki) — not included in checks
+3. Full 255+ StatsD metric coverage — only 26 representative metrics validated
+4. Sustained load / backpressure testing — not implemented
+5. `docs/telemetry-runbook.md` updates — not done
+6. `09-data-collection-reference.md` "Validation" section — not done
+7. **Automated cross-CI baseline persistence** — the regression gate reads a
+   committed baseline; baseline updates flow through a manual PR refresh, not
+   an artifact promoted from `develop` (FU-2).
 
 ### Exit Criteria
 
@@ -884,6 +886,8 @@ All 17 spans, 26 metrics, 10 dashboards, 14 attribute checks, 2 hierarchies, and
 - [x] All 10 Grafana dashboards render data
 - [ ] Benchmark shows < 3% CPU overhead, < 5MB memory overhead
 - [x] CI workflow runs validation on telemetry branch changes
+- [x] OTel-driven regression gate: captures per-span/per-RPC/per-job timings
+      from Prometheus and compares against a committed baseline
 
 ---
 
@@ -1240,19 +1244,19 @@ Clear, measurable criteria for each phase.
 
 ### 6.12.6 Success Metrics Summary
 
-| Phase    | Primary Metric                   | Secondary Metric            | Deadline       | Status             |
-| -------- | -------------------------------- | --------------------------- | -------------- | ------------------ |
-| Phase 1  | SDK compiles and runs            | Zero overhead when disabled | End of Week 2  | Active             |
-| Phase 2  | 100% RPC coverage                | <1ms latency overhead       | End of Week 4  | Active             |
-| Phase 3  | Cross-node traces work           | <5% throughput impact       | End of Week 6  | Active             |
-| Phase 4  | Consensus fully traced           | No consensus timing impact  | End of Week 8  | Active             |
-| Phase 5  | Production deployment            | Operators trained           | End of Week 9  | Active             |
-| Phase 6  | StatsD metrics in Prometheus     | 3 dashboards operational    | End of Week 10 | Active             |
-| Phase 7  | All metrics via OTLP             | No StatsD dependency        | End of Week 12 | Active             |
-| Phase 8  | trace_id in logs + Loki          | Tempo↔Loki correlation      | End of Week 13 | Active             |
-| Phase 9  | 68+ new internal metrics in Prom | 2 new dashboards            | End of Week 15 | Future Enhancement |
-| Phase 10 | Full telemetry stack validated   | < 3% CPU overhead proven    | End of Week 17 | Future Enhancement |
-| Phase 11 | Third-party metrics via receiver | 4 new dashboards + alerting | End of Week 20 | Future Enhancement |
+| Phase    | Primary Metric                                                     | Secondary Metric            | Deadline       | Status             |
+| -------- | ------------------------------------------------------------------ | --------------------------- | -------------- | ------------------ |
+| Phase 1  | SDK compiles and runs                                              | Zero overhead when disabled | End of Week 2  | Active             |
+| Phase 2  | 100% RPC coverage                                                  | <1ms latency overhead       | End of Week 4  | Active             |
+| Phase 3  | Cross-node traces work                                             | <5% throughput impact       | End of Week 6  | Active             |
+| Phase 4  | Consensus fully traced                                             | No consensus timing impact  | End of Week 8  | Active             |
+| Phase 5  | Production deployment                                              | Operators trained           | End of Week 9  | Active             |
+| Phase 6  | StatsD metrics in Prometheus                                       | 3 dashboards operational    | End of Week 10 | Active             |
+| Phase 7  | All metrics via OTLP                                               | No StatsD dependency        | End of Week 12 | Active             |
+| Phase 8  | trace_id in logs + Loki                                            | Tempo↔Loki correlation      | End of Week 13 | Active             |
+| Phase 9  | 68+ new internal metrics in Prom                                   | 2 new dashboards            | End of Week 15 | Future Enhancement |
+| Phase 10 | Full telemetry stack validated; OTel-sourced regression gate in CI | < 3% CPU overhead proven    | End of Week 17 | Future Enhancement |
+| Phase 11 | Third-party metrics via receiver                                   | 4 new dashboards + alerting | End of Week 20 | Future Enhancement |
 
 ---
 
diff --git a/OpenTelemetryPlan/Phase10_taskList.md b/OpenTelemetryPlan/Phase10_taskList.md
index f93985964a..d9b0779505 100644
--- a/OpenTelemetryPlan/Phase10_taskList.md
+++ b/OpenTelemetryPlan/Phase10_taskList.md
@@ -229,14 +229,27 @@ Before Phases 1-9 can be considered production-ready, we need proof that:
 
 ---
 
-## Exit Criteria
+## Exit Criteria — Delivered in PR #6519
 
-- [ ] 5-node validator cluster starts and reaches consensus in docker-compose
-- [ ] RPC load generator fires all traced RPC commands at configurable rates
-- [ ] Transaction submitter generates 6+ transaction types at configurable TPS
-- [ ] Validation suite confirms all 16 spans, 22 attributes, 300+ metrics are present
-- [ ] Log-trace correlation validated end-to-end (Loki ↔ Tempo)
-- [ ] All 10 Grafana dashboards render data (no empty panels)
-- [ ] Benchmark shows < 3% CPU overhead, < 5MB memory overhead
-- [ ] CI workflow runs validation on telemetry branch changes
-- [ ] Validation report output is CI-parseable (JSON with exit codes)
+- [x] Multi-node validator cluster starts and reaches consensus
+- [x] RPC load generator fires all traced RPC commands at configurable rates
+- [x] Transaction submitter generates 6+ transaction types at configurable TPS
+- [x] Validation suite confirms all required spans, attributes, and metrics
+- [x] Log-trace correlation validated end-to-end (Loki ↔ Tempo)
+- [x] Grafana dashboards render data (no empty panels)
+- [x] Overhead benchmark (`benchmark.sh`) measures telemetry-off vs telemetry-on deltas
+- [x] CI workflow runs validation on telemetry branch changes
+- [x] Validation report output is CI-parseable (JSON with exit codes)
+- [x] OTel-driven regression gate captures per-span/per-RPC/per-job timings from
+      Prometheus and compares against a committed baseline
+
+## Follow-up Work (tracked in separate PRs)
+
+- [ ] FU-2: Automate baseline persistence across CI runs (artifact uploaded
+      on merge to `develop`, downloaded on PR runs). Current mechanism
+      requires a manual baseline-refresh PR.
+- [ ] FU-4: Replace the proxy measurements in `benchmark.sh` (wall-clock curl
+      p99, ledger-cadence-as-TPS, ledger-cadence-as-consensus-p95) with
+      PromQL quantile queries from the same pipeline the regression gate uses.
+- [ ] FU-6: Grafana dashboard plotting historical baseline values keyed by
+      commit SHA, for triaging noisy regressions.
diff --git a/docker/telemetry/workload/README.md b/docker/telemetry/workload/README.md
index a827834310..f1aa1d2720 100644
--- a/docker/telemetry/workload/README.md
+++ b/docker/telemetry/workload/README.md
@@ -213,6 +213,49 @@ python3 validate_telemetry.py --report /tmp/report.json
 python3 validate_telemetry.py --skip-loki --report /tmp/report.json
 ```
 
+### OTel Timings Regression Gate
+
+`capture_timings.py` + `compare_to_baseline.py` implement a regression gate
+that compares OTel-derived per-span/per-RPC/per-job timings against a
+committed baseline. Unlike `benchmark.sh` (which measures the overhead of
+enabling telemetry on the current binary), this gate catches **xrpld
+performance regressions over time** by diffing against a stored baseline
+from a prior run.
+
+How it runs inside the validation pipeline:
+
+1. `run-full-validation.sh` executes the normal workload and validation suite.
+2. After validation, `capture_timings.py` queries Prometheus for every
+   metric in `regression-metrics.json` and writes `reports/timings.json`.
+3. `compare_to_baseline.py` reads `timings.json`,
+   `baselines/baseline-timings.json`, and `regression-thresholds.json`,
+   then either:
+   - Prints the paste-me JSON block (when the baseline is a placeholder
+     or empty) and exits 0.
+   - Prints a delta table, writes `reports/regression-report.json`, and
+     exits non-zero if any metric breached both the percentage AND
+     absolute bound.
+
+Bootstrapping a baseline:
+
+1. Push the branch. The `Telemetry Validation` CI run prints the full
+   timings JSON under "Paste into `baselines/baseline-timings.json`" in
+   the workflow Step Summary.
+2. Open a PR copying that JSON block verbatim into
+   `baselines/baseline-timings.json`. Reviewer approval is the audit gate.
+3. Subsequent runs compare against it; the gate fails on regression.
+
+Per-run tuning:
+
+- `--skip-regression` disables the gate (local exploration only).
+- `REGRESSION_WINDOW` env var overrides the default Prometheus `rate()`
+  window (`3m`). Keep close to the workload duration.
+- Metric surface lives in `regression-metrics.json`; thresholds in
+  `regression-thresholds.json`; both are reviewed changes.
+
+See [`baselines/README.md`](./baselines/README.md) for the baseline
+lifecycle and refresh process.
+
 ### benchmark.sh
 
 Compares baseline (no telemetry) vs telemetry-enabled performance:
@@ -273,13 +316,16 @@ The validation runs as a GitHub Actions workflow (`.github/workflows/telemetry-v
 
 ## Configuration Files
 
-| File                     | Purpose                                                       |
-| ------------------------ | ------------------------------------------------------------- |
-| `workload-profiles.json` | Named load profiles with phase definitions                    |
-| `expected_spans.json`    | Span inventory (names, attributes, hierarchies, config flags) |
-| `expected_metrics.json`  | Metric inventory — every listed metric must be present        |
-| `test_accounts.json`     | Test account roles (keys generated at runtime)                |
-| `requirements.txt`       | Python dependencies                                           |
+| File                              | Purpose                                                       |
+| --------------------------------- | ------------------------------------------------------------- |
+| `workload-profiles.json`          | Named load profiles with phase definitions                    |
+| `expected_spans.json`             | Span inventory (names, attributes, hierarchies, config flags) |
+| `expected_metrics.json`           | Metric inventory — every listed metric must be present        |
+| `test_accounts.json`              | Test account roles (keys generated at runtime)                |
+| `regression-metrics.json`         | Metric surface for the OTel regression gate                   |
+| `regression-thresholds.json`      | Per-metric regression bounds (pct AND abs)                    |
+| `baselines/baseline-timings.json` | Committed baseline — populated from first CI run              |
+| `requirements.txt`                | Python dependencies                                           |
 
 ### expected_metrics.json Format
 
diff --git a/docker/telemetry/workload/baselines/README.md b/docker/telemetry/workload/baselines/README.md
new file mode 100644
index 0000000000..4f12646449
--- /dev/null
+++ b/docker/telemetry/workload/baselines/README.md
@@ -0,0 +1,67 @@
+# Performance Baselines
+
+This directory holds the committed baseline file used by the OTel-driven regression gate.
+
+## How the gate works
+
+After the validation suite runs, `capture_timings.py` queries Prometheus for the timings
+declared in [`../regression-metrics.json`](../regression-metrics.json) and writes a
+`timings.json`. Then `compare_to_baseline.py` reads [`baseline-timings.json`](./baseline-timings.json),
+[`../regression-thresholds.json`](../regression-thresholds.json), and the captured
+`timings.json`. The comparator picks one of two modes automatically:
+
+- **Placeholder baseline** (`"placeholder": true` or empty `metrics`): the comparator
+  prints the captured timings JSON in exactly the format expected for this file, then
+  exits 0 without gating. This is how we bootstrap the baseline.
+- **Populated baseline**: the comparator diffs per-metric, enforces the thresholds
+  (regression = current exceeds baseline on BOTH the percentage AND absolute bound),
+  and exits non-zero on any regression.
+
+The regression gate runs against whatever workload profile `run-full-validation.sh`
+was invoked with. Capture and comparison are profile-agnostic — they only read
+Prometheus — so all existing profiles (`full-validation`, `quick-smoke`, `stress`)
+continue to work unchanged.
+
+## Bootstrapping the baseline
+
+1. Trigger a CI run while the committed baseline is still `"placeholder": true`. The
+   telemetry-validation workflow runs without gating and prints the captured timings
+   block to the Step Summary under the heading `Paste into baselines/baseline-timings.json`.
+2. Open a new PR. Copy the full JSON block from the Step Summary (or download the
+   `timings.json` artifact) into this file, replacing the placeholder contents. The
+   JSON is emitted in the exact byte-for-byte format this file expects — sorted keys,
+   2-space indent, trailing newline.
+3. The committed baseline PR needs reviewer approval just like any other code change.
+   This is the primary audit point for "who moved the performance bar."
+
+## Refreshing the baseline
+
+Refresh when a legitimate performance change lands on `develop` (for example, a deliberate
+rewrite that changes a span's structure). Run CI against the current baseline and inspect the
+delta; if the new numbers should become the norm, open a PR replacing `baseline-timings.json`
+with that run's `timings.json` artifact (the Step Summary prints the paste-ready JSON only for
+a placeholder baseline). The reviewer decides whether the new baseline is acceptable.
+
+Do **not** edit `baseline-timings.json` by hand outside of this process — every entry
+should trace back to a real CI run so variance characteristics are preserved.
+
+## Schema
+
+```json
+{
+  "schema_version": 1,
+  "captured_at": "2026-04-24T17:30:00Z",
+  "window": "3m",
+  "git_sha": "<SHA of the commit that produced these numbers>",
+  "profile": "<workload profile used>",
+  "metrics": {
+    "span.tx.process.p99": { "value": 12.4, "unit": "ms" },
+    "rpc.server_info.p95": { "value": 850.0, "unit": "us" },
+    "job.transaction.queued.p95": { "value": 1500.0, "unit": "us" }
+  }
+}
+```
+
+Missing metrics (value `null`) in a captured run do not count as regressions — they
+are reported separately in `regression-report.json` under `missing_in_current`.
+This keeps the gate robust when a profile doesn't exercise every span on every run.
diff --git a/docker/telemetry/workload/baselines/baseline-timings.json b/docker/telemetry/workload/baselines/baseline-timings.json
new file mode 100644
index 0000000000..9fe3c1f6ad
--- /dev/null
+++ b/docker/telemetry/workload/baselines/baseline-timings.json
@@ -0,0 +1,10 @@
+{
+  "_comment": "Empty baseline placeholder. The first CI run of the regression gate will emit the captured timings JSON to the workflow step summary; copy that JSON over this file (in a PR) to activate the regression gate. See baselines/README.md.",
+  "placeholder": true,
+  "schema_version": 1,
+  "captured_at": null,
+  "window": null,
+  "git_sha": null,
+  "profile": null,
+  "metrics": {}
+}
diff --git a/docker/telemetry/workload/capture_timings.py b/docker/telemetry/workload/capture_timings.py
new file mode 100644
index 0000000000..9720fe92fd
--- /dev/null
+++ b/docker/telemetry/workload/capture_timings.py
@@ -0,0 +1,167 @@
+#!/usr/bin/env python3
+"""Capture OTel-derived timings from Prometheus for the regression gate.
+
+Queries Prometheus for every metric declared in ``regression-metrics.json``
+and writes the results to a JSON file in the exact schema
+``baseline-timings.json`` expects. When a user wants to refresh the
+baseline, they copy a CI run's ``timings.json`` artifact (or the block
+printed to the workflow step summary) into
+``baselines/baseline-timings.json`` in a reviewable PR.
+
+Output schema (stable — ``compare_to_baseline.py`` reads it verbatim)::
+
+    {
+        "schema_version": 1,
+        "captured_at": "2026-04-24T17:30:00Z",
+        "window": "3m",
+        "git_sha": "<from $GITHUB_SHA or `git rev-parse HEAD`>",
+        "profile": "regression",
+        "metrics": {
+            "span.tx.process.p99": {"value": 12.4, "unit": "ms"},
+            "rpc.server_info.p95": {"value": 850.0, "unit": "us"},
+            ...
+        }
+    }
+
+Usage::
+
+    python3 capture_timings.py \\
+        --prometheus http://localhost:9090 \\
+        --metrics regression-metrics.json \\
+        --output /tmp/timings.json \\
+        --window 3m \\
+        --profile regression
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import logging
+import os
+import subprocess
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+
+import aiohttp
+
+from prom_queries import build_query_plan, run_query_plan
+
+logger = logging.getLogger("capture_timings")
+
+SCHEMA_VERSION = 1
+
+
+async def capture(
+    prom_url: str,
+    metrics_path: Path,
+    window: str,
+    profile: str,
+) -> dict:
+    """Run the Prometheus query plan and assemble the timings report.
+
+    The report carries the schema version, a UTC capture timestamp, the query
+    window, the detected git SHA, the profile label, and the key-sorted metrics.
+    """
+    query_plan = build_query_plan(metrics_path, window=window)
+    logger.info("Capturing %d metrics from %s (window=%s)", len(query_plan), prom_url, window)
+    async with aiohttp.ClientSession() as http:
+        results = await run_query_plan(http, prom_url, query_plan)
+
+    report: dict = {"schema_version": SCHEMA_VERSION}
+    report["captured_at"] = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+    report.update(window=window, git_sha=_detect_git_sha(), profile=profile)
+    report["metrics"] = {name: results[name] for name in sorted(results)}
+    return report
+
+
+def _detect_git_sha() -> str:
+    """Best-effort lookup of the commit SHA being measured.
+
+    A non-empty ``GITHUB_SHA`` (set in Actions) wins; otherwise
+    ``git rev-parse HEAD`` is tried with a 5 s timeout. A non-zero exit or a
+    failure to run git yields ``"unknown"`` — acceptable because the SHA is
+    captured metadata only, not an input to the comparison logic.
+    """
+    sha_from_env = os.environ.get("GITHUB_SHA")
+    if sha_from_env:
+        return sha_from_env
+    # Outside Actions: ask git directly.
+    try:
+        proc = subprocess.run(
+            ["git", "rev-parse", "HEAD"],
+            capture_output=True,
+            text=True,
+            timeout=5,
+            check=False,
+        )
+    except (OSError, subprocess.SubprocessError):
+        return "unknown"
+    return proc.stdout.strip() if proc.returncode == 0 else "unknown"
+
+
+def main() -> int:
+    """CLI entry point: capture timings and write them to ``--output`` as JSON.
+
+    Parses the command line, configures logging (DEBUG with ``--verbose``,
+    INFO otherwise), runs ``capture`` to completion, and writes the report
+    with sorted keys, 2-space indent and a trailing newline, creating the
+    output directory if needed.
+
+    Returns:
+        0 once the report has been written.
+    """
+    cli = argparse.ArgumentParser(description=__doc__)
+    cli.add_argument(
+        "--prometheus",
+        default="http://localhost:9090",
+        help="Prometheus base URL (default: http://localhost:9090)",
+    )
+    cli.add_argument(
+        "--metrics",
+        type=Path,
+        default=Path(__file__).parent / "regression-metrics.json",
+        help="Path to regression-metrics.json",
+    )
+    cli.add_argument(
+        "--output",
+        type=Path,
+        required=True,
+        help="Where to write the captured timings JSON",
+    )
+    cli.add_argument("--window", default="3m", help="Prometheus rate() window (default: 3m)")
+    cli.add_argument(
+        "--profile",
+        default="regression",
+        help="Workload profile used during capture (metadata only)",
+    )
+    cli.add_argument("--verbose", action="store_true", help="Enable debug logging")
+    opts = cli.parse_args()
+
+    log_level = logging.DEBUG if opts.verbose else logging.INFO
+    logging.basicConfig(level=log_level, format="%(levelname)s %(name)s: %(message)s")
+
+    report = asyncio.run(
+        capture(
+            prom_url=opts.prometheus,
+            metrics_path=opts.metrics,
+            window=opts.window,
+            profile=opts.profile,
+        )
+    )
+
+    opts.output.parent.mkdir(parents=True, exist_ok=True)
+    with open(opts.output, "w") as out:
+        json.dump(report, out, indent=2, sort_keys=True)
+        out.write("\n")
+
+    values = [entry["value"] for entry in report["metrics"].values()]
+    captured = sum(value is not None for value in values)
+    logger.info("Wrote %s (%d/%d metrics captured)", opts.output, captured, len(values))
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/docker/telemetry/workload/compare_to_baseline.py b/docker/telemetry/workload/compare_to_baseline.py
new file mode 100644
index 0000000000..820c93e5af
--- /dev/null
+++ b/docker/telemetry/workload/compare_to_baseline.py
@@ -0,0 +1,404 @@
+#!/usr/bin/env python3
+"""Compare captured OTel timings against a committed baseline.
+
+Operating modes (chosen automatically based on the baseline file contents):
+
+1. **No baseline** — if ``baseline-timings.json`` has an empty
+   ``metrics`` object (or is marked with ``"placeholder": true``), this
+   script is in "populate" mode. It prints the captured timings JSON in
+   the exact format expected for pasting into
+   ``baselines/baseline-timings.json``, then exits 0. No regression check.
+
+2. **Populated baseline** — per-metric percentage AND absolute deltas are
+   computed against thresholds from ``regression-thresholds.json``. A
+   regression occurs when BOTH bounds are breached for the same quantile.
+   Prints a human-readable table and writes a full JSON report.
+   Exits 1 if any regression was detected, else 0.
+
+Inputs:
+    --timings     Captured timings JSON (from capture_timings.py)
+    --baseline    Committed baseline JSON
+    --thresholds  Threshold policy JSON
+    --report      Where to write regression-report.json (optional)
+
+Exit codes:
+    0 — No baseline (paste-me emitted), OR baseline populated and no regression
+    1 — Regression detected (at least one metric breached both bounds)
+    2 — Internal error (e.g. bad JSON, baseline/current key mismatch)
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+import sys
+from dataclasses import dataclass, asdict
+from pathlib import Path
+from typing import Any
+
+logger = logging.getLogger("compare_to_baseline")
+
+
+@dataclass
+class MetricDelta:
+    """Single metric's baseline-vs-current comparison outcome.
+
+    Attributes:
+        key:                Flat metric key (e.g. span.tx.process.p99).
+        baseline:           Baseline value (None if absent from the baseline).
+        current:            Current run value (may be None if not captured).
+        delta:              current - baseline (None if either side None).
+        pct_change:         100 * delta / baseline (None if delta is None or baseline ≤ 0).
+        unit:               Unit from the baseline entry, else the current entry, else "".
+        threshold_pct:      Resolved per-metric pct threshold (None if unconfigured).
+        threshold_abs:      Resolved per-metric absolute threshold (None if unconfigured).
+        regressed:          True iff both bounds breached.
+        note:               Human-readable classification ("REGRESSION" when regressed).
+    """
+
+    key: str
+    baseline: float | None
+    current: float | None
+    delta: float | None
+    pct_change: float | None
+    unit: str
+    threshold_pct: float | None
+    threshold_abs: float | None
+    regressed: bool
+    note: str
+
+
+def load_json(path: Path) -> dict:
+    """Parse the JSON document at ``path``, decoding it as UTF-8 regardless of locale."""
+    return json.loads(Path(path).read_text(encoding="utf-8"))
+
+
+def is_placeholder(baseline: dict) -> bool:
+    """Return True when the baseline is flagged as a placeholder or has no metrics."""
+    explicitly_marked = baseline.get("placeholder") is True
+    has_metrics = bool(baseline.get("metrics"))
+    return explicitly_marked or not has_metrics
+
+
+def print_paste_me(timings: dict) -> None:
+    """Emit the captured timings as ready-to-commit baseline JSON.
+
+    Only the JSON document goes to stdout (sorted keys, 2-space indent,
+    newline-terminated by ``print``), so redirecting stdout yields the exact
+    file contents. The framing banners and instructions go to stderr.
+    """
+    rule = "=" * 72
+    header = (
+        rule,
+        "  NO BASELINE FOUND — paste the JSON below into",
+        "  docker/telemetry/workload/baselines/baseline-timings.json",
+        rule,
+    )
+    footer = (
+        rule,
+        "  (End of paste-me JSON. Gate did NOT run — baseline is empty.)",
+        rule,
+    )
+
+    for line in header:
+        print(line, file=sys.stderr)
+    # stdout carries nothing but the JSON payload.
+    print(json.dumps(timings, indent=2, sort_keys=True))
+    for line in footer:
+        print(line, file=sys.stderr)
+
+
+def resolve_thresholds(
+    key: str,
+    thresholds: dict,
+) -> tuple[float | None, float | None]:
+    """Return ``(pct_threshold, abs_threshold)`` for a metric key.
+
+    Keys look like ``<category>.<name...>.<quantile>``. A per-metric override
+    (``overrides["<category>.<name...>"][quantile]``) wins over the category
+    default. Returns ``(None, None)`` for malformed keys, unknown categories,
+    or when no rule exists — such metrics are captured but never gate the
+    build.
+    """
+    parts = key.split(".")
+    if len(parts) < 3:
+        return (None, None)
+    quantile_key = parts[-1]
+
+    category_map = {
+        "span": "span",
+        "rpc": "rpc_method",
+        "job": "job_queue",
+    }
+    cat = category_map.get(parts[0])
+    if cat is None:
+        return (None, None)
+
+    # Overrides are keyed by the metric key minus its quantile suffix.
+    override_key = ".".join(parts[:-1])
+    rule = thresholds.get("overrides", {}).get(override_key, {}).get(quantile_key)
+    if rule is None:
+        rule = thresholds.get("defaults", {}).get(cat, {}).get(quantile_key)
+    if rule is None:
+        return (None, None)
+
+    # Explicit None check: ``or`` would discard a legitimate 0 bound.
+    abs_bound = rule.get("max_abs_increase_ms")
+    if abs_bound is None:
+        abs_bound = rule.get("max_abs_increase_us")
+    return (rule.get("max_pct_increase"), abs_bound)
+
+
+def compute_delta(
+    key: str,
+    baseline_entry: dict | None,
+    current_entry: dict | None,
+    thresholds: dict,
+) -> MetricDelta:
+    """Classify one metric by diffing its current value against the baseline.
+
+    Only a breach of BOTH the percentage and the absolute bound counts as a
+    regression, which filters small-value noise: 0.5 ms -> 1.0 ms is +100%
+    but stays well inside a 5 ms absolute bound, so it is not flagged.
+
+    NOTE(review): pct_change is None for a baseline <= 0, so such a metric
+    can never be flagged however large the absolute increase; confirm that
+    this is intended.
+
+    Returns:
+        The comparison outcome; ``regressed`` is False whenever a value or a
+        threshold is missing.
+    """
+    baseline = baseline_entry.get("value") if baseline_entry else None
+    current = current_entry.get("value") if current_entry else None
+    unit = (baseline_entry or current_entry or {}).get("unit", "")
+
+    pct_limit, abs_limit = resolve_thresholds(key, thresholds)
+
+    if baseline is None:
+        # Nothing to diff against: a brand-new metric, or no data on either side.
+        if current is None:
+            note = "no data (neither baseline nor current)"
+        else:
+            note = "new metric (not in baseline)"
+        return MetricDelta(
+            key=key,
+            baseline=None,
+            current=current,
+            delta=None,
+            pct_change=None,
+            unit=unit,
+            threshold_pct=pct_limit,
+            threshold_abs=abs_limit,
+            regressed=False,
+            note=note,
+        )
+
+    if current is None:
+        return MetricDelta(
+            key=key,
+            baseline=baseline,
+            current=None,
+            delta=None,
+            pct_change=None,
+            unit=unit,
+            threshold_pct=pct_limit,
+            threshold_abs=abs_limit,
+            regressed=False,
+            note="not captured in current run",
+        )
+
+    delta = current - baseline
+    pct_change = (delta / baseline * 100.0) if baseline > 0 else None
+
+    if pct_limit is None or abs_limit is None:
+        return MetricDelta(
+            key=key,
+            baseline=baseline,
+            current=current,
+            delta=delta,
+            pct_change=pct_change,
+            unit=unit,
+            threshold_pct=pct_limit,
+            threshold_abs=abs_limit,
+            regressed=False,
+            note="no threshold configured",
+        )
+
+    # An undefined percentage (baseline <= 0) never counts as a breach.
+    pct_breach = pct_change is not None and pct_change > pct_limit
+    abs_breach = delta > abs_limit
+    regressed = pct_breach and abs_breach
+
+    if regressed:
+        note = "REGRESSION"
+    elif delta < 0:
+        note = "improved"
+    else:
+        note = "within bounds"
+
+    return MetricDelta(
+        key=key,
+        baseline=baseline,
+        current=current,
+        delta=delta,
+        pct_change=pct_change,
+        unit=unit,
+        threshold_pct=pct_limit,
+        threshold_abs=abs_limit,
+        regressed=regressed,
+        note=note,
+    )
+
+
+def print_summary(deltas: list[MetricDelta]) -> None:
+    """Print a sorted, human-readable table of per-metric results."""
+    regressions = [d for d in deltas if d.regressed]
+    improvements = [
+        d
+        for d in deltas
+        if d.delta is not None and d.delta < 0 and d.baseline not in (None, 0)
+    ]
+    improvements.sort(key=lambda d: d.pct_change or 0)
+    regressions.sort(key=lambda d: -(d.pct_change or 0))
+
+    print("=" * 72)
+    print(f"  Regression check: {len(regressions)} regression(s) detected")
+    print("=" * 72)
+
+    if regressions:
+        print("\nRegressions (breached BOTH pct AND absolute bounds):")
+        _print_table(regressions)
+
+    if improvements:
+        top = improvements[:5]
+        print("\nTop improvements:")
+        _print_table(top)
+
+    missing = [d for d in deltas if d.note == "not captured in current run"]
+    if missing:
+        print(f"\n{len(missing)} baseline metric(s) not captured in current run:")
+        for d in missing:
+            print(f"  {d.key}")
+
+
+def _print_table(rows: list[MetricDelta]) -> None:
+    """Print a fixed-width table for a list of deltas."""
+    header = f"  {'METRIC':<45} {'BASE':>10} {'CUR':>10} {'Δ':>10} {'%':>8} UNIT"
+    print(header)
+    print("  " + "-" * (len(header) - 2))
+    for d in rows:
+        base = f"{d.baseline:.2f}" if d.baseline is not None else "-"
+        cur = f"{d.current:.2f}" if d.current is not None else "-"
+        delta = f"{d.delta:+.2f}" if d.delta is not None else "-"
+        pct = f"{d.pct_change:+.1f}%" if d.pct_change is not None else "-"
+        print(f"  {d.key:<45} {base:>10} {cur:>10} {delta:>10} {pct:>8} {d.unit}")
+
+
+def write_report(
+    deltas: list[MetricDelta],
+    report_path: Path,
+    baseline: dict,
+    timings: dict,
+) -> None:
+    """Write regression-report.json — machine-readable artifact for CI."""
+    regressions = [d for d in deltas if d.regressed]
+    payload = {
+        "schema_version": 1,
+        "baseline_captured_at": baseline.get("captured_at"),
+        "baseline_git_sha": baseline.get("git_sha"),
+        "current_captured_at": timings.get("captured_at"),
+        "current_git_sha": timings.get("git_sha"),
+        "window": timings.get("window"),
+        "profile": timings.get("profile"),
+        "summary": {
+            "total": len(deltas),
+            "regressions": len(regressions),
+            "improvements": sum(
+                1
+                for d in deltas
+                if d.delta is not None and d.delta < 0 and d.baseline not in (None, 0)
+            ),
+            "missing_in_current": sum(
+                1 for d in deltas if d.note == "not captured in current run"
+            ),
+        },
+        "metrics": [asdict(d) for d in deltas],
+    }
+    report_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(report_path, "w") as f:
+        json.dump(payload, f, indent=2, sort_keys=True)
+        f.write("\n")
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--timings",
+        type=Path,
+        required=True,
+        help="Captured timings JSON (from capture_timings.py)",
+    )
+    parser.add_argument(
+        "--baseline",
+        type=Path,
+        required=True,
+        help="Committed baseline-timings.json",
+    )
+    parser.add_argument(
+        "--thresholds",
+        type=Path,
+        default=Path(__file__).parent / "regression-thresholds.json",
+        help="Threshold policy JSON",
+    )
+    parser.add_argument(
+        "--report",
+        type=Path,
+        default=None,
+        help="Where to write regression-report.json (optional)",
+    )
+    args = parser.parse_args()
+
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(levelname)s %(name)s: %(message)s",
+    )
+
+    try:
+        timings = load_json(args.timings)
+        baseline = load_json(args.baseline)
+        thresholds = load_json(args.thresholds)
+    except (OSError, json.JSONDecodeError) as exc:
+        logger.error("failed to load inputs: %s", exc)
+        return 2
+
+    if is_placeholder(baseline):
+        print_paste_me(timings)
+        return 0
+
+    baseline_metrics = baseline.get("metrics", {})
+    current_metrics = timings.get("metrics", {})
+
+    all_keys = sorted(set(baseline_metrics) | set(current_metrics))
+    deltas = [
+        compute_delta(
+            key,
+            baseline_metrics.get(key),
+            current_metrics.get(key),
+            thresholds,
+        )
+        for key in all_keys
+    ]
+
+    print_summary(deltas)
+
+    if args.report:
+        write_report(deltas, args.report, baseline, timings)
+        logger.info("wrote %s", args.report)
+
+    return 1 if any(d.regressed for d in deltas) else 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/docker/telemetry/workload/prom_queries.py b/docker/telemetry/workload/prom_queries.py
new file mode 100644
index 0000000000..359ebd956b
--- /dev/null
+++ b/docker/telemetry/workload/prom_queries.py
@@ -0,0 +1,214 @@
+#!/usr/bin/env python3
+"""Shared Prometheus query helpers for the regression gate.
+
+Single source of truth for how regression metrics are computed. Both
+``capture_timings.py`` and any future tooling consume this module so metric
+name → PromQL expression stays consistent.
+
+Design:
+- Every captured metric has a key in the form ``{category}.{name}.p{quantile}``
+  (e.g. ``span.tx.process.p99``). Keys are flat strings so JSON diffing is
+  trivial.
+- Quantile queries go through ``histogram_quantile`` over the standard
+  ``_bucket`` series. The rate window is a parameter (defaults to the
+  capture window, not Prometheus's default 5m) so short CI runs are usable.
+- The catalogue of what to capture lives in ``regression-metrics.json`` —
+  this module only knows how to translate that JSON into HTTP queries.
+
+Usage::
+
+    import asyncio, aiohttp
+    from prom_queries import build_query_plan, run_query_plan
+
+    plan = build_query_plan("regression-metrics.json", window="3m")
+    async with aiohttp.ClientSession() as s:
+        timings = await run_query_plan(s, "http://localhost:9090", plan)
+    # timings = {"span.tx.process.p99": {"value": 12.4, "unit": "ms"}, ...}
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+import aiohttp
+
+logger = logging.getLogger("prom_queries")
+
+
+@dataclass(frozen=True)
+class QueryEntry:
+    """One metric to capture from Prometheus (immutable once created).
+
+    Attributes:
+        key:      Flat output key, e.g. ``span.tx.process.p99``.
+        promql:   The PromQL expression to send to /api/v1/query.
+        unit:     Unit of the returned value, e.g. ``ms`` or ``us``.
+                  ``run_query_plan`` stores it next to each captured
+                  value so consumers of the output can see the unit.
+    """
+
+    key: str
+    promql: str
+    unit: str
+
+
+def build_query_plan(metrics_path: str | Path, window: str = "3m") -> list[QueryEntry]:
+    """Translate regression-metrics.json into a list of PromQL queries.
+
+    Args:
+        metrics_path: Path to ``regression-metrics.json``.
+        window:       Rate window passed to ``rate()``. For short CI runs
+                      keep this close to the test duration so the bucket
+                      counts are meaningful. Default 3m matches the
+                      ``regression`` workload profile.
+
+    Returns:
+        A list of ``QueryEntry`` values, one per (metric × quantile).
+    """
+    with open(metrics_path) as f:
+        cfg = json.load(f)
+
+    plan: list[QueryEntry] = []
+
+    span_cfg = cfg.get("spans", {})
+    tmpl = span_cfg.get("_query_template", "")
+    unit = span_cfg.get("_unit", "ms")
+    for name in span_cfg.get("names", []):
+        for q in span_cfg.get("_quantiles", []):
+            expr = (
+                tmpl.replace("{quantile}", _format_quantile(q))
+                .replace("{name}", name)
+                .replace("{window}", window)
+            )
+            plan.append(
+                QueryEntry(
+                    key=f"span.{name}.p{_quantile_label(q)}",
+                    promql=expr,
+                    unit=unit,
+                )
+            )
+
+    rpc_cfg = cfg.get("rpc_methods", {})
+    tmpl = rpc_cfg.get("_query_template", "")
+    unit = rpc_cfg.get("_unit", "us")
+    for name in rpc_cfg.get("names", []):
+        for q in rpc_cfg.get("_quantiles", []):
+            expr = (
+                tmpl.replace("{quantile}", _format_quantile(q))
+                .replace("{name}", name)
+                .replace("{window}", window)
+            )
+            plan.append(
+                QueryEntry(
+                    key=f"rpc.{name}.p{_quantile_label(q)}",
+                    promql=expr,
+                    unit=unit,
+                )
+            )
+
+    job_cfg = cfg.get("job_queue", {})
+    unit = job_cfg.get("_unit", "us")
+    phases = job_cfg.get("_phases", ["queued", "running"])
+    tmpl_map = {
+        "queued": job_cfg.get("_queued_template", ""),
+        "running": job_cfg.get("_running_template", ""),
+    }
+    for name in job_cfg.get("names", []):
+        for phase in phases:
+            tmpl = tmpl_map.get(phase, "")
+            if not tmpl:
+                continue
+            for q in job_cfg.get("_quantiles", []):
+                expr = (
+                    tmpl.replace("{quantile}", _format_quantile(q))
+                    .replace("{name}", name)
+                    .replace("{window}", window)
+                )
+                plan.append(
+                    QueryEntry(
+                        key=f"job.{name}.{phase}.p{_quantile_label(q)}",
+                        promql=expr,
+                        unit=unit,
+                    )
+                )
+
+    return plan
+
+
+async def run_query_plan(
+    session: aiohttp.ClientSession,
+    prom_url: str,
+    plan: list[QueryEntry],
+) -> dict[str, dict[str, Any]]:
+    """Execute a query plan and return a flat ``key → {value, unit}`` map.
+
+    Queries that return no data (NaN, empty result) are still included in
+    the output with ``value: null`` — the comparator treats missing values
+    as "not yet observed" rather than as a regression. This keeps the
+    baseline schema stable across runs with different load levels.
+
+    Args:
+        session:  Shared aiohttp session.
+        prom_url: Base URL of Prometheus (e.g. ``http://localhost:9090``).
+        plan:     Output of :func:`build_query_plan`.
+
+    Returns:
+        Mapping from metric key to ``{"value": float|None, "unit": str}``.
+    """
+    results: dict[str, dict[str, Any]] = {}
+    for entry in plan:
+        value = await _instant_query(session, prom_url, entry.promql)
+        results[entry.key] = {"value": value, "unit": entry.unit}
+    return results
+
+
+async def _instant_query(
+    session: aiohttp.ClientSession,
+    prom_url: str,
+    promql: str,
+) -> float | None:
+    """POST an instant query to Prometheus; return the scalar value or None.
+
+    None is returned for NaN, empty results, or HTTP errors — every call
+    site treats None identically ("no data captured").
+    """
+    url = f"{prom_url.rstrip('/')}/api/v1/query"
+    try:
+        async with session.post(url, data={"query": promql}, timeout=30) as resp:
+            if resp.status != 200:
+                logger.warning("query HTTP %d: %s", resp.status, promql)
+                return None
+            body = await resp.json()
+    except (aiohttp.ClientError, TimeoutError) as exc:
+        logger.warning("query failed: %s — %s", promql, exc)
+        return None
+
+    if body.get("status") != "success":
+        logger.warning("query status=%s: %s", body.get("status"), promql)
+        return None
+
+    result = body.get("data", {}).get("result", [])
+    if not result:
+        return None
+
+    raw = result[0].get("value", [None, None])[1]
+    if raw is None or raw in ("NaN", "+Inf", "-Inf"):
+        return None
+    try:
+        return float(raw)
+    except (TypeError, ValueError):
+        return None
+
+
+def _format_quantile(q: float) -> str:
+    """Format a quantile for PromQL (``0.99`` → ``"0.99"``)."""
+    return f"{q:g}"
+
+
+def _quantile_label(q: float) -> str:
+    """Format a quantile for the output key (``0.95`` → ``"95"``)."""
+    return str(int(round(q * 100)))
diff --git a/docker/telemetry/workload/regression-metrics.json b/docker/telemetry/workload/regression-metrics.json
new file mode 100644
index 0000000000..07cbd1ac0a
--- /dev/null
+++ b/docker/telemetry/workload/regression-metrics.json
@@ -0,0 +1,34 @@
+{
+  "_description": "Metric surface for the OTel-driven regression gate. Each entry names a metric, the quantiles to capture, and how to query Prometheus. The comparator compares current run against baseline-timings.json under these exact keys.",
+  "_key_format": "{category}.{name}.p{quantile}  (e.g. span.tx.process.p99, rpc.server_info.p95, job.transaction.queued.p95)",
+  "spans": {
+    "_query_template": "histogram_quantile({quantile}, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_name=\"{name}\"}[{window}])))",
+    "_unit": "ms",
+    "_quantiles": [0.5, 0.95, 0.99],
+    "names": [
+      "rpc.request",
+      "rpc.process",
+      "tx.process",
+      "tx.apply",
+      "ledger.build",
+      "ledger.validate",
+      "ledger.store",
+      "consensus.ledger_close",
+      "consensus.accept"
+    ]
+  },
+  "rpc_methods": {
+    "_query_template": "histogram_quantile({quantile}, sum by (le) (rate(rippled_rpc_method_duration_us_bucket{method=\"{name}\"}[{window}])))",
+    "_unit": "us",
+    "_quantiles": [0.95, 0.99],
+    "names": ["server_info", "account_info", "ledger", "fee", "tx"]
+  },
+  "job_queue": {
+    "_queued_template": "histogram_quantile({quantile}, sum by (le) (rate(rippled_job_queued_duration_us_bucket{job_type=\"{name}\"}[{window}])))",
+    "_running_template": "histogram_quantile({quantile}, sum by (le) (rate(rippled_job_running_duration_us_bucket{job_type=\"{name}\"}[{window}])))",
+    "_unit": "us",
+    "_quantiles": [0.95],
+    "_phases": ["queued", "running"],
+    "names": ["transaction", "acceptLedger"]
+  }
+}
diff --git a/docker/telemetry/workload/regression-thresholds.json b/docker/telemetry/workload/regression-thresholds.json
new file mode 100644
index 0000000000..176fd87669
--- /dev/null
+++ b/docker/telemetry/workload/regression-thresholds.json
@@ -0,0 +1,29 @@
+{
+  "_description": "Per-metric regression thresholds. A metric regresses when current - baseline exceeds BOTH the percentage and absolute bounds (AND, not OR — this tolerates small-value noise). Defaults apply unless a per-metric override exists.",
+  "defaults": {
+    "span": {
+      "p50": { "max_pct_increase": 15.0, "max_abs_increase_ms": 2.0 },
+      "p95": { "max_pct_increase": 10.0, "max_abs_increase_ms": 3.0 },
+      "p99": { "max_pct_increase": 10.0, "max_abs_increase_ms": 5.0 }
+    },
+    "rpc_method": {
+      "p95": { "max_pct_increase": 10.0, "max_abs_increase_us": 3000.0 },
+      "p99": { "max_pct_increase": 10.0, "max_abs_increase_us": 5000.0 }
+    },
+    "job_queue": {
+      "p95": { "max_pct_increase": 15.0, "max_abs_increase_us": 5000.0 }
+    }
+  },
+  "overrides": {
+    "span.consensus.ledger_close": {
+      "p50": { "max_pct_increase": 5.0, "max_abs_increase_ms": 200.0 },
+      "p95": { "max_pct_increase": 5.0, "max_abs_increase_ms": 500.0 },
+      "p99": { "max_pct_increase": 5.0, "max_abs_increase_ms": 1000.0 }
+    },
+    "span.consensus.accept": {
+      "p50": { "max_pct_increase": 5.0, "max_abs_increase_ms": 200.0 },
+      "p95": { "max_pct_increase": 5.0, "max_abs_increase_ms": 500.0 },
+      "p99": { "max_pct_increase": 5.0, "max_abs_increase_ms": 1000.0 }
+    }
+  }
+}
diff --git a/docker/telemetry/workload/run-full-validation.sh b/docker/telemetry/workload/run-full-validation.sh
index dcb24064df..72a8fe8850 100755
--- a/docker/telemetry/workload/run-full-validation.sh
+++ b/docker/telemetry/workload/run-full-validation.sh
@@ -7,11 +7,13 @@
 #   3. Wait for consensus
 #   4. Run workload orchestrator (RPC load, TX submission, propagation wait)
 #   5. Run the telemetry validation suite
-#   6. (Optional) Run the performance benchmark
+#   6. Capture OTel timings and compare against committed baseline
+#   7. (Optional) Run the performance overhead benchmark
 #
 # Usage:
 #   ./run-full-validation.sh --xrpld /path/to/xrpld
 #   ./run-full-validation.sh --xrpld /path/to/xrpld --with-benchmark
+#   ./run-full-validation.sh --xrpld /path/to/xrpld --skip-regression
 #   ./run-full-validation.sh --cleanup
 #
 # Exit codes:
@@ -50,8 +52,16 @@ TX_TPS=5
 TX_DURATION=120
 WITH_BENCHMARK=false
 SKIP_LOKI=false
+SKIP_REGRESSION=false
 WORKLOAD_PROFILE="full-validation"
 REPORT_DIR="$WORKDIR/reports"
+# Rate window handed to Prometheus `rate()` when capturing timings. Keep
+# this close to the active workload duration so histogram buckets cover
+# the measurement window; longer windows dilute short-lived regressions.
+REGRESSION_WINDOW="${REGRESSION_WINDOW:-3m}"
+BASELINE_FILE="${BASELINE_FILE:-$SCRIPT_DIR/baselines/baseline-timings.json}"
+THRESHOLDS_FILE="${THRESHOLDS_FILE:-$SCRIPT_DIR/regression-thresholds.json}"
+METRICS_FILE="${METRICS_FILE:-$SCRIPT_DIR/regression-metrics.json}"
 
 GENESIS_ACCOUNT="rHb9CJAWyB4rj91VRWn96DkukG4bwdtyTh"
 GENESIS_SEED="snoPBrXtMeMyMHUVTgbuqAfg1SUTb"
@@ -70,8 +80,9 @@ usage() {
     echo "  --tx-tps TPS         Transaction submit rate (default: 5)"
     echo "  --tx-duration SECS   Transaction submit duration (default: 120)"
     echo "  --profile NAME       Workload profile (default: full-validation)"
-    echo "  --with-benchmark     Also run performance benchmarks"
+    echo "  --with-benchmark     Also run performance overhead benchmark (telemetry off vs on)"
     echo "  --skip-loki          Skip Loki log-trace correlation checks"
+    echo "  --skip-regression    Skip the OTel-baseline regression gate"
     echo "  --cleanup            Tear down everything and exit"
     echo "  -h, --help           Show this help"
     exit 0
@@ -88,6 +99,7 @@ while [ $# -gt 0 ]; do
         --profile)       WORKLOAD_PROFILE="$2"; shift 2 ;;
         --with-benchmark) WITH_BENCHMARK=true; shift ;;
         --skip-loki)     SKIP_LOKI=true; shift ;;
+        --skip-regression) SKIP_REGRESSION=true; shift ;;
         --cleanup)       # Cleanup mode
             log "Cleaning up..."
             pkill -f "$WORKDIR" 2>/dev/null || true
@@ -350,10 +362,56 @@ else
 fi
 
 # ---------------------------------------------------------------------------
-# Step 6: (Optional) Run benchmark
+# Step 6: Capture OTel timings and run the regression comparison
+# ---------------------------------------------------------------------------
+# Unless skipped, this step captures timings first (so CI has an artifact from
+# which to bootstrap/refresh the committed baseline). The comparator then either:
+#   - prints the paste-me JSON when the baseline is a placeholder, or
+#   - enforces thresholds and fails the run on regression.
+# Use --skip-regression to opt out (e.g. for ad-hoc local exploration).
+TIMINGS_FILE="$REPORT_DIR/timings.json"
+REGRESSION_REPORT="$REPORT_DIR/regression-report.json"
+REGRESSION_EXIT=0
+
+if [ "$SKIP_REGRESSION" != true ]; then
+    log "Step 6: Capturing OTel timings from Prometheus..."
+    if python3 "$SCRIPT_DIR/capture_timings.py" \
+        --prometheus "http://localhost:9090" \
+        --metrics "$METRICS_FILE" \
+        --output "$TIMINGS_FILE" \
+        --window "$REGRESSION_WINDOW" \
+        --profile "$WORKLOAD_PROFILE"
+    then
+        ok "Timings captured: $TIMINGS_FILE"
+    else
+        fail "Failed to capture timings — skipping regression comparison."
+        SKIP_REGRESSION=true
+    fi
+fi
+
+if [ "$SKIP_REGRESSION" != true ]; then
+    log "Comparing against baseline $BASELINE_FILE..."
+    python3 "$SCRIPT_DIR/compare_to_baseline.py" \
+        --timings "$TIMINGS_FILE" \
+        --baseline "$BASELINE_FILE" \
+        --thresholds "$THRESHOLDS_FILE" \
+        --report "$REGRESSION_REPORT" || REGRESSION_EXIT=$?
+    if [ "$REGRESSION_EXIT" -eq 0 ]; then
+        ok "Regression gate passed (or baseline placeholder — paste JSON printed above)."
+    elif [ "$REGRESSION_EXIT" -eq 1 ]; then
+        fail "Regression detected — see $REGRESSION_REPORT"
+    else
+        fail "Regression comparator internal error (exit $REGRESSION_EXIT)"
+    fi
+else
+    warn "Regression gate skipped."
+fi
+
+# ---------------------------------------------------------------------------
+# Step 7: (Optional) Run overhead benchmark
 # ---------------------------------------------------------------------------
 if [ "$WITH_BENCHMARK" = true ]; then
-    log "Step 6: Running performance benchmark..."
+    log "Step 7: Running performance benchmark..."
     bash "$SCRIPT_DIR/benchmark.sh" \
         --xrpld "$XRPLD" \
         --duration 120 \
@@ -392,4 +450,13 @@ echo "    $0 --cleanup"
 echo ""
 echo "==========================================================="
 
-exit "$VALIDATION_EXIT"
+# Fail the run if EITHER validation or the regression gate failed. The
+# numeric `-ne`/`-eq` tests below are safe because exit codes are integers.
+FINAL_EXIT=0
+if [ "$VALIDATION_EXIT" -ne 0 ]; then
+    FINAL_EXIT="$VALIDATION_EXIT"
+fi
+if [ "$REGRESSION_EXIT" -ne 0 ] && [ "$FINAL_EXIT" -eq 0 ]; then
+    FINAL_EXIT="$REGRESSION_EXIT"
+fi
+exit "$FINAL_EXIT"