From 577d1f8a2107be953f008be050f7d18621e524d7 Mon Sep 17 00:00:00 2001
From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com>
Date: Fri, 24 Apr 2026 19:36:15 +0100
Subject: [PATCH] fix: address review findings in regression gate

- capture_timings.py: fail when captured/total ratio < 50%
  (--min-capture-ratio). Prevents silent pass on unreachable Prometheus.
- run-full-validation.sh: set REGRESSION_EXIT=2 on capture failure so
  the final exit code reflects it. Update exit code docs in header.
- compare_to_baseline.py: extract _skip_delta helper to bring
  compute_delta under 80 lines. Fix 0.0-as-falsy bug in abs_bound
  resolution (use explicit None check instead of `or`). Remove dead
  variable override_prefix_key.
- prom_queries.py: extract _build_simple_entries and _build_job_entries
  to bring build_query_plan under 80 lines. Fix module docstring return
  type example. Use aiohttp.ClientTimeout instead of bare int.
- telemetry-validation.yml: add set -euo pipefail to regression summary
  step; guard jq calls with -e flag and fallback; fail on missing
  baseline file; emit ::warning annotation when timings.json missing.
- baselines/README.md: document the placeholder field.
---
 .github/workflows/telemetry-validation.yml    |  19 ++-
 docker/telemetry/workload/baselines/README.md |   5 +
 docker/telemetry/workload/capture_timings.py  |  18 +++
 .../telemetry/workload/compare_to_baseline.py |  71 +++++-----
 docker/telemetry/workload/prom_queries.py     | 128 +++++++++---------
 .../telemetry/workload/run-full-validation.sh |   7 +-
 6 files changed, 139 insertions(+), 109 deletions(-)

diff --git a/.github/workflows/telemetry-validation.yml b/.github/workflows/telemetry-validation.yml
index 834da6fc4e..b44a75bac1 100644
--- a/.github/workflows/telemetry-validation.yml
+++ b/.github/workflows/telemetry-validation.yml
@@ -238,16 +238,27 @@ jobs:
       - name: Print regression summary
         if: always()
         run: |
+          set -euo pipefail
           TIMINGS="/tmp/xrpld-validation/reports/timings.json"
           REGRESSION="/tmp/xrpld-validation/reports/regression-report.json"
           BASELINE="docker/telemetry/workload/baselines/baseline-timings.json"
 
           if [ ! -f "$TIMINGS" ]; then
             echo "## Regression Gate: no timings captured" >> "$GITHUB_STEP_SUMMARY"
+            echo "::warning::capture_timings.py did not produce timings.json — regression gate was not evaluated."
             exit 0
           fi
 
-          IS_PLACEHOLDER=$(jq -r '.placeholder == true or (.metrics | length == 0)' "$BASELINE")
+          if [ ! -f "$BASELINE" ]; then
+            echo "## Regression Gate: baseline file missing" >> "$GITHUB_STEP_SUMMARY"
+            echo "::error::baselines/baseline-timings.json not found in checkout"
+            exit 1
+          fi
+
+          IS_PLACEHOLDER=$(jq -e -r '.placeholder == true or (.metrics | length == 0)' "$BASELINE") || {
+            echo "::error::Failed to parse baseline JSON"
+            exit 1
+          }
 
           echo "## OTel Timings Regression Gate" >> "$GITHUB_STEP_SUMMARY"
           echo "" >> "$GITHUB_STEP_SUMMARY"
@@ -263,9 +274,9 @@ jobs:
             cat "$TIMINGS" >> "$GITHUB_STEP_SUMMARY"
             echo '```' >> "$GITHUB_STEP_SUMMARY"
           elif [ -f "$REGRESSION" ]; then
-            REGR_COUNT=$(jq '.summary.regressions' "$REGRESSION")
-            IMPR_COUNT=$(jq '.summary.improvements' "$REGRESSION")
-            TOTAL=$(jq '.summary.total' "$REGRESSION")
+            REGR_COUNT=$(jq -e '.summary.regressions' "$REGRESSION") || REGR_COUNT=0
+            IMPR_COUNT=$(jq -e '.summary.improvements' "$REGRESSION") || IMPR_COUNT=0
+            TOTAL=$(jq -e '.summary.total' "$REGRESSION") || TOTAL=0
             echo "| Stat | Count |" >> "$GITHUB_STEP_SUMMARY"
             echo "|------|-------|" >> "$GITHUB_STEP_SUMMARY"
             echo "| Metrics compared | $TOTAL |" >> "$GITHUB_STEP_SUMMARY"
diff --git a/docker/telemetry/workload/baselines/README.md b/docker/telemetry/workload/baselines/README.md
index 4f12646449..515a3f561f 100644
--- a/docker/telemetry/workload/baselines/README.md
+++ b/docker/telemetry/workload/baselines/README.md
@@ -62,6 +62,11 @@ should trace back to a real CI run so variance characteristics are preserved.
 }
 ```
 
+Placeholder baselines additionally include `"placeholder": true`. The comparator
+detects this field (or an empty `metrics` object) to switch into "populate" mode
+instead of enforcing thresholds. Remove the `placeholder` key when pasting real
+captured timings.
+
 Missing metrics (value `null`) in a captured run do not count as regressions — they
 are reported separately in `regression-report.json` under `missing_in_current`.
 This keeps the gate robust when a profile doesn't exercise every span on every run.
diff --git a/docker/telemetry/workload/capture_timings.py b/docker/telemetry/workload/capture_timings.py
index 9720fe92fd..6cba372d4b 100644
--- a/docker/telemetry/workload/capture_timings.py
+++ b/docker/telemetry/workload/capture_timings.py
@@ -131,6 +131,12 @@ def main() -> int:
         default="regression",
         help="Workload profile used during capture (metadata only)",
     )
+    parser.add_argument(
+        "--min-capture-ratio",
+        type=float,
+        default=0.5,
+        help="Fail if fewer than this fraction of metrics are captured (default: 0.5)",
+    )
     parser.add_argument(
         "--verbose",
         action="store_true",
@@ -160,6 +166,18 @@ def main() -> int:
     captured = sum(1 for v in report["metrics"].values() if v["value"] is not None)
     total = len(report["metrics"])
     logger.info("Wrote %s (%d/%d metrics captured)", args.output, captured, total)
+
+    if total > 0 and (captured / total) < args.min_capture_ratio:
+        logger.error(
+            "Only %d/%d (%.0f%%) metrics captured — below the %.0f%% minimum. "
+            "Is Prometheus reachable at %s?",
+            captured,
+            total,
+            captured / total * 100,
+            args.min_capture_ratio * 100,
+            args.prometheus,
+        )
+        return 1
     return 0
 
 
diff --git a/docker/telemetry/workload/compare_to_baseline.py b/docker/telemetry/workload/compare_to_baseline.py
index 820c93e5af..ed8c261cca 100644
--- a/docker/telemetry/workload/compare_to_baseline.py
+++ b/docker/telemetry/workload/compare_to_baseline.py
@@ -134,7 +134,6 @@ def resolve_thresholds(
     if cat is None:
         return (None, None)
 
-    override_prefix_key = ".".join(parts[:-1])
     override_key = f"{category_key}.{'.'.join(parts[1:-1])}"
     overrides = thresholds.get("overrides", {})
     defaults = thresholds.get("defaults", {}).get(cat, {})
@@ -146,10 +145,36 @@ def resolve_thresholds(
         return (None, None)
 
     pct = rule.get("max_pct_increase")
-    abs_bound = rule.get("max_abs_increase_ms") or rule.get("max_abs_increase_us")
+    abs_bound = rule.get("max_abs_increase_ms")
+    if abs_bound is None:
+        abs_bound = rule.get("max_abs_increase_us")
     return (pct, abs_bound)
 
 
+def _skip_delta(
+    key: str,
+    baseline: float | None,
+    current: float | None,
+    unit: str,
+    thresholds: dict,
+    note: str,
+) -> MetricDelta:
+    """Build a MetricDelta for cases where comparison is not possible."""
+    pct_threshold, abs_threshold = resolve_thresholds(key, thresholds)
+    return MetricDelta(
+        key=key,
+        baseline=baseline,
+        current=current,
+        delta=None,
+        pct_change=None,
+        unit=unit,
+        threshold_pct=pct_threshold,
+        threshold_abs=abs_threshold,
+        regressed=False,
+        note=note,
+    )
+
+
 def compute_delta(
     key: str,
     baseline_entry: dict | None,
@@ -166,50 +191,22 @@ def compute_delta(
     current = current_entry.get("value") if current_entry else None
     unit = (baseline_entry or current_entry or {}).get("unit", "")
 
-    pct_threshold, abs_threshold = resolve_thresholds(key, thresholds)
-
     if baseline is None and current is None:
-        return MetricDelta(
-            key=key,
-            baseline=None,
-            current=None,
-            delta=None,
-            pct_change=None,
-            unit=unit,
-            threshold_pct=pct_threshold,
-            threshold_abs=abs_threshold,
-            regressed=False,
-            note="no data (neither baseline nor current)",
+        return _skip_delta(
+            key, None, None, unit, thresholds, "no data (neither baseline nor current)"
         )
 
     if baseline is None:
-        return MetricDelta(
-            key=key,
-            baseline=None,
-            current=current,
-            delta=None,
-            pct_change=None,
-            unit=unit,
-            threshold_pct=pct_threshold,
-            threshold_abs=abs_threshold,
-            regressed=False,
-            note="new metric (not in baseline)",
+        return _skip_delta(
+            key, None, current, unit, thresholds, "new metric (not in baseline)"
         )
 
     if current is None:
-        return MetricDelta(
-            key=key,
-            baseline=baseline,
-            current=None,
-            delta=None,
-            pct_change=None,
-            unit=unit,
-            threshold_pct=pct_threshold,
-            threshold_abs=abs_threshold,
-            regressed=False,
-            note="not captured in current run",
+        return _skip_delta(
+            key, baseline, None, unit, thresholds, "not captured in current run"
         )
 
+    pct_threshold, abs_threshold = resolve_thresholds(key, thresholds)
     delta = current - baseline
     pct_change = (delta / baseline * 100.0) if baseline > 0 else None
 
diff --git a/docker/telemetry/workload/prom_queries.py b/docker/telemetry/workload/prom_queries.py
index 359ebd956b..b257c61241 100644
--- a/docker/telemetry/workload/prom_queries.py
+++ b/docker/telemetry/workload/prom_queries.py
@@ -23,7 +23,7 @@ Usage::
     plan = build_query_plan("regression-metrics.json", window="3m")
     async with aiohttp.ClientSession() as s:
         timings = await run_query_plan(s, "http://localhost:9090", plan)
-    # timings = {"span.tx.process.p99": 12.4, ...}
+    # timings = {"span.tx.process.p99": {"value": 12.4, "unit": "ms"}, ...}
 """
 
 from __future__ import annotations
@@ -56,6 +56,62 @@ class QueryEntry:
     unit: str
 
 
+def _build_simple_entries(
+    cfg: dict,
+    prefix: str,
+    window: str,
+) -> list[QueryEntry]:
+    """Build QueryEntry list for a single-template category (spans, rpc)."""
+    tmpl = cfg.get("_query_template", "")
+    unit = cfg.get("_unit", "ms")
+    entries: list[QueryEntry] = []
+    for name in cfg.get("names", []):
+        for q in cfg.get("_quantiles", []):
+            expr = (
+                tmpl.replace("{quantile}", _format_quantile(q))
+                .replace("{name}", name)
+                .replace("{window}", window)
+            )
+            entries.append(
+                QueryEntry(
+                    key=f"{prefix}.{name}.p{_quantile_label(q)}",
+                    promql=expr,
+                    unit=unit,
+                )
+            )
+    return entries
+
+
+def _build_job_entries(cfg: dict, window: str) -> list[QueryEntry]:
+    """Build QueryEntry list for the job_queue category (multi-phase)."""
+    unit = cfg.get("_unit", "us")
+    phases = cfg.get("_phases", ["queued", "running"])
+    tmpl_map = {
+        "queued": cfg.get("_queued_template", ""),
+        "running": cfg.get("_running_template", ""),
+    }
+    entries: list[QueryEntry] = []
+    for name in cfg.get("names", []):
+        for phase in phases:
+            tmpl = tmpl_map.get(phase, "")
+            if not tmpl:
+                continue
+            for q in cfg.get("_quantiles", []):
+                expr = (
+                    tmpl.replace("{quantile}", _format_quantile(q))
+                    .replace("{name}", name)
+                    .replace("{window}", window)
+                )
+                entries.append(
+                    QueryEntry(
+                        key=f"job.{name}.{phase}.p{_quantile_label(q)}",
+                        promql=expr,
+                        unit=unit,
+                    )
+                )
+    return entries
+
+
 def build_query_plan(metrics_path: str | Path, window: str = "3m") -> list[QueryEntry]:
     """Translate regression-metrics.json into a list of PromQL queries.
 
@@ -73,69 +129,9 @@ def build_query_plan(metrics_path: str | Path, window: str = "3m") -> list[Query
         cfg = json.load(f)
 
     plan: list[QueryEntry] = []
-
-    span_cfg = cfg.get("spans", {})
-    tmpl = span_cfg.get("_query_template", "")
-    unit = span_cfg.get("_unit", "ms")
-    for name in span_cfg.get("names", []):
-        for q in span_cfg.get("_quantiles", []):
-            expr = (
-                tmpl.replace("{quantile}", _format_quantile(q))
-                .replace("{name}", name)
-                .replace("{window}", window)
-            )
-            plan.append(
-                QueryEntry(
-                    key=f"span.{name}.p{_quantile_label(q)}",
-                    promql=expr,
-                    unit=unit,
-                )
-            )
-
-    rpc_cfg = cfg.get("rpc_methods", {})
-    tmpl = rpc_cfg.get("_query_template", "")
-    unit = rpc_cfg.get("_unit", "us")
-    for name in rpc_cfg.get("names", []):
-        for q in rpc_cfg.get("_quantiles", []):
-            expr = (
-                tmpl.replace("{quantile}", _format_quantile(q))
-                .replace("{name}", name)
-                .replace("{window}", window)
-            )
-            plan.append(
-                QueryEntry(
-                    key=f"rpc.{name}.p{_quantile_label(q)}",
-                    promql=expr,
-                    unit=unit,
-                )
-            )
-
-    job_cfg = cfg.get("job_queue", {})
-    unit = job_cfg.get("_unit", "us")
-    phases = job_cfg.get("_phases", ["queued", "running"])
-    tmpl_map = {
-        "queued": job_cfg.get("_queued_template", ""),
-        "running": job_cfg.get("_running_template", ""),
-    }
-    for name in job_cfg.get("names", []):
-        for phase in phases:
-            tmpl = tmpl_map.get(phase, "")
-            if not tmpl:
-                continue
-            for q in job_cfg.get("_quantiles", []):
-                expr = (
-                    tmpl.replace("{quantile}", _format_quantile(q))
-                    .replace("{name}", name)
-                    .replace("{window}", window)
-                )
-                plan.append(
-                    QueryEntry(
-                        key=f"job.{name}.{phase}.p{_quantile_label(q)}",
-                        promql=expr,
-                        unit=unit,
-                    )
-                )
-
+    plan.extend(_build_simple_entries(cfg.get("spans", {}), "span", window))
+    plan.extend(_build_simple_entries(cfg.get("rpc_methods", {}), "rpc", window))
+    plan.extend(_build_job_entries(cfg.get("job_queue", {}), window))
     return plan
 
 
@@ -178,7 +174,9 @@ async def _instant_query(
     """
     url = f"{prom_url.rstrip('/')}/api/v1/query"
     try:
-        async with session.post(url, data={"query": promql}, timeout=30) as resp:
+        async with session.post(
+            url, data={"query": promql}, timeout=aiohttp.ClientTimeout(total=30)
+        ) as resp:
             if resp.status != 200:
                 logger.warning("query HTTP %d: %s", resp.status, promql)
                 return None
diff --git a/docker/telemetry/workload/run-full-validation.sh b/docker/telemetry/workload/run-full-validation.sh
index 72a8fe8850..a9a5d43685 100755
--- a/docker/telemetry/workload/run-full-validation.sh
+++ b/docker/telemetry/workload/run-full-validation.sh
@@ -17,9 +17,9 @@
 #   ./run-full-validation.sh --cleanup
 #
 # Exit codes:
-#   0 — All validation checks passed
-#   1 — One or more validation checks failed
-#   2 — Infrastructure error (cluster/stack failed to start)
+#   0 — All validation checks and the regression gate passed
+#   1 — Validation checks failed OR the regression gate detected a regression
+#   2 — Infrastructure error (cluster/stack failed to start, timing capture failed)
 
 set -euo pipefail
 
@@ -385,6 +385,7 @@ if [ "$SKIP_REGRESSION" != true ]; then
         ok "Timings captured: $TIMINGS_FILE"
     else
         fail "Failed to capture timings — skipping regression comparison."
+        REGRESSION_EXIT=2
         SKIP_REGRESSION=true
     fi
 fi