fix: address review findings in regression gate

- capture_timings.py: fail when captured/total ratio < 50% (--min-capture-ratio). Prevents silent pass on unreachable Prometheus. - run-full-validation.sh: set REGRESSION_EXIT=2 on capture failure so the final exit code reflects it. Update exit code docs in header. - compare_to_baseline.py: extract _skip_delta helper to bring compute_delta under 80 lines. Fix 0.0-as-falsy bug in abs_bound resolution (use explicit None check instead of `or`). Remove dead variable override_prefix_key. - prom_queries.py: extract _build_simple_entries and _build_job_entries to bring build_query_plan under 80 lines. Fix module docstring return type example. Use aiohttp.ClientTimeout instead of bare int. - telemetry-validation.yml: add set -euo pipefail to regression summary step; guard jq calls with -e flag and fallback; fail on missing baseline file; emit ::warning annotation when timings.json missing. - baselines/README.md: document the placeholder field.
2026-07-24 15:40:26 +00:00 · 2026-04-24 19:36:15 +01:00
parent df79d5e74b
commit 577d1f8a21
6 changed files with 139 additions and 109 deletions
--- a/.github/workflows/telemetry-validation.yml
+++ b/.github/workflows/telemetry-validation.yml
@@ -238,16 +238,27 @@ jobs:
      - name: Print regression summary
        if: always()
        run: |
+          set -euo pipefail
          TIMINGS="/tmp/xrpld-validation/reports/timings.json"
          REGRESSION="/tmp/xrpld-validation/reports/regression-report.json"
          BASELINE="docker/telemetry/workload/baselines/baseline-timings.json"

          if [ ! -f "$TIMINGS" ]; then
            echo "## Regression Gate: no timings captured" >> "$GITHUB_STEP_SUMMARY"
+            echo "::warning::capture_timings.py did not produce timings.json — regression gate was not evaluated."
            exit 0
          fi

-          IS_PLACEHOLDER=$(jq -r '.placeholder == true or (.metrics | length == 0)' "$BASELINE")
+          if [ ! -f "$BASELINE" ]; then
+            echo "## Regression Gate: baseline file missing" >> "$GITHUB_STEP_SUMMARY"
+            echo "::error::baselines/baseline-timings.json not found in checkout"
+            exit 1
+          fi
+
+          IS_PLACEHOLDER=$(jq -e -r '.placeholder == true or (.metrics | length == 0)' "$BASELINE") || {
+            echo "::error::Failed to parse baseline JSON"
+            exit 1
+          }

          echo "## OTel Timings Regression Gate" >> "$GITHUB_STEP_SUMMARY"
          echo "" >> "$GITHUB_STEP_SUMMARY"
@@ -263,9 +274,9 @@ jobs:
            cat "$TIMINGS" >> "$GITHUB_STEP_SUMMARY"
            echo '```' >> "$GITHUB_STEP_SUMMARY"
          elif [ -f "$REGRESSION" ]; then
-            REGR_COUNT=$(jq '.summary.regressions' "$REGRESSION")
-            IMPR_COUNT=$(jq '.summary.improvements' "$REGRESSION")
-            TOTAL=$(jq '.summary.total' "$REGRESSION")
+            REGR_COUNT=$(jq -e '.summary.regressions' "$REGRESSION") || REGR_COUNT=0
+            IMPR_COUNT=$(jq -e '.summary.improvements' "$REGRESSION") || IMPR_COUNT=0
+            TOTAL=$(jq -e '.summary.total' "$REGRESSION") || TOTAL=0
            echo "| Stat | Count |" >> "$GITHUB_STEP_SUMMARY"
            echo "|------|-------|" >> "$GITHUB_STEP_SUMMARY"
            echo "| Metrics compared | $TOTAL |" >> "$GITHUB_STEP_SUMMARY"
--- a/docker/telemetry/workload/baselines/README.md
+++ b/docker/telemetry/workload/baselines/README.md
@@ -62,6 +62,11 @@ should trace back to a real CI run so variance characteristics are preserved.
 }
 ```

+Placeholder baselines additionally include `"placeholder": true`. The comparator
+detects this field (or an empty `metrics` object) to switch into "populate" mode
+instead of enforcing thresholds. Remove the `placeholder` key when pasting real
+captured timings.
+
 Missing metrics (value `null`) in a captured run do not count as regressions — they
 are reported separately in `regression-report.json` under `missing_in_current`.
 This keeps the gate robust when a profile doesn't exercise every span on every run.
--- a/docker/telemetry/workload/capture_timings.py
+++ b/docker/telemetry/workload/capture_timings.py
@@ -131,6 +131,12 @@ def main() -> int:
        default="regression",
        help="Workload profile used during capture (metadata only)",
    )
+    parser.add_argument(
+        "--min-capture-ratio",
+        type=float,
+        default=0.5,
+        help="Fail if fewer than this fraction of metrics are captured (default: 0.5)",
+    )
    parser.add_argument(
        "--verbose",
        action="store_true",
@@ -160,6 +166,18 @@ def main() -> int:
    captured = sum(1 for v in report["metrics"].values() if v["value"] is not None)
    total = len(report["metrics"])
    logger.info("Wrote %s (%d/%d metrics captured)", args.output, captured, total)
+
+    if total > 0 and (captured / total) < args.min_capture_ratio:
+        logger.error(
+            "Only %d/%d (%.0f%%) metrics captured — below the %.0f%% minimum. "
+            "Is Prometheus reachable at %s?",
+            captured,
+            total,
+            captured / total * 100,
+            args.min_capture_ratio * 100,
+            args.prometheus,
+        )
+        return 1
    return 0


--- a/docker/telemetry/workload/compare_to_baseline.py
+++ b/docker/telemetry/workload/compare_to_baseline.py
@@ -134,7 +134,6 @@ def resolve_thresholds(
    if cat is None:
        return (None, None)

-    override_prefix_key = ".".join(parts[:-1])
    override_key = f"{category_key}.{'.'.join(parts[1:-1])}"
    overrides = thresholds.get("overrides", {})
    defaults = thresholds.get("defaults", {}).get(cat, {})
@@ -146,10 +145,36 @@ def resolve_thresholds(
        return (None, None)

    pct = rule.get("max_pct_increase")
-    abs_bound = rule.get("max_abs_increase_ms") or rule.get("max_abs_increase_us")
+    abs_bound = rule.get("max_abs_increase_ms")
+    if abs_bound is None:
+        abs_bound = rule.get("max_abs_increase_us")
    return (pct, abs_bound)


+def _skip_delta(
+    key: str,
+    baseline: float | None,
+    current: float | None,
+    unit: str,
+    thresholds: dict,
+    note: str,
+) -> MetricDelta:
+    """Build a MetricDelta for cases where comparison is not possible."""
+    pct_threshold, abs_threshold = resolve_thresholds(key, thresholds)
+    return MetricDelta(
+        key=key,
+        baseline=baseline,
+        current=current,
+        delta=None,
+        pct_change=None,
+        unit=unit,
+        threshold_pct=pct_threshold,
+        threshold_abs=abs_threshold,
+        regressed=False,
+        note=note,
+    )
+
+
 def compute_delta(
    key: str,
    baseline_entry: dict | None,
@@ -166,50 +191,22 @@ def compute_delta(
    current = current_entry.get("value") if current_entry else None
    unit = (baseline_entry or current_entry or {}).get("unit", "")

-    pct_threshold, abs_threshold = resolve_thresholds(key, thresholds)
-
    if baseline is None and current is None:
-        return MetricDelta(
-            key=key,
-            baseline=None,
-            current=None,
-            delta=None,
-            pct_change=None,
-            unit=unit,
-            threshold_pct=pct_threshold,
-            threshold_abs=abs_threshold,
-            regressed=False,
-            note="no data (neither baseline nor current)",
+        return _skip_delta(
+            key, None, None, unit, thresholds, "no data (neither baseline nor current)"
        )

    if baseline is None:
-        return MetricDelta(
-            key=key,
-            baseline=None,
-            current=current,
-            delta=None,
-            pct_change=None,
-            unit=unit,
-            threshold_pct=pct_threshold,
-            threshold_abs=abs_threshold,
-            regressed=False,
-            note="new metric (not in baseline)",
+        return _skip_delta(
+            key, None, current, unit, thresholds, "new metric (not in baseline)"
        )

    if current is None:
-        return MetricDelta(
-            key=key,
-            baseline=baseline,
-            current=None,
-            delta=None,
-            pct_change=None,
-            unit=unit,
-            threshold_pct=pct_threshold,
-            threshold_abs=abs_threshold,
-            regressed=False,
-            note="not captured in current run",
+        return _skip_delta(
+            key, baseline, None, unit, thresholds, "not captured in current run"
        )

+    pct_threshold, abs_threshold = resolve_thresholds(key, thresholds)
    delta = current - baseline
    pct_change = (delta / baseline * 100.0) if baseline > 0 else None

--- a/docker/telemetry/workload/prom_queries.py
+++ b/docker/telemetry/workload/prom_queries.py
@@ -23,7 +23,7 @@ Usage::
    plan = build_query_plan("regression-metrics.json", window="3m")
    async with aiohttp.ClientSession() as s:
        timings = await run_query_plan(s, "http://localhost:9090", plan)
-    # timings = {"span.tx.process.p99": 12.4, ...}
+    # timings = {"span.tx.process.p99": {"value": 12.4, "unit": "ms"}, ...}
 """

 from __future__ import annotations
@@ -56,6 +56,62 @@ class QueryEntry:
    unit: str


+def _build_simple_entries(
+    cfg: dict,
+    prefix: str,
+    window: str,
+) -> list[QueryEntry]:
+    """Build QueryEntry list for a single-template category (spans, rpc)."""
+    tmpl = cfg.get("_query_template", "")
+    unit = cfg.get("_unit", "ms")
+    entries: list[QueryEntry] = []
+    for name in cfg.get("names", []):
+        for q in cfg.get("_quantiles", []):
+            expr = (
+                tmpl.replace("{quantile}", _format_quantile(q))
+                .replace("{name}", name)
+                .replace("{window}", window)
+            )
+            entries.append(
+                QueryEntry(
+                    key=f"{prefix}.{name}.p{_quantile_label(q)}",
+                    promql=expr,
+                    unit=unit,
+                )
+            )
+    return entries
+
+
+def _build_job_entries(cfg: dict, window: str) -> list[QueryEntry]:
+    """Build QueryEntry list for the job_queue category (multi-phase)."""
+    unit = cfg.get("_unit", "us")
+    phases = cfg.get("_phases", ["queued", "running"])
+    tmpl_map = {
+        "queued": cfg.get("_queued_template", ""),
+        "running": cfg.get("_running_template", ""),
+    }
+    entries: list[QueryEntry] = []
+    for name in cfg.get("names", []):
+        for phase in phases:
+            tmpl = tmpl_map.get(phase, "")
+            if not tmpl:
+                continue
+            for q in cfg.get("_quantiles", []):
+                expr = (
+                    tmpl.replace("{quantile}", _format_quantile(q))
+                    .replace("{name}", name)
+                    .replace("{window}", window)
+                )
+                entries.append(
+                    QueryEntry(
+                        key=f"job.{name}.{phase}.p{_quantile_label(q)}",
+                        promql=expr,
+                        unit=unit,
+                    )
+                )
+    return entries
+
+
 def build_query_plan(metrics_path: str | Path, window: str = "3m") -> list[QueryEntry]:
    """Translate regression-metrics.json into a list of PromQL queries.

@@ -73,69 +129,9 @@ def build_query_plan(metrics_path: str | Path, window: str = "3m") -> list[Query
        cfg = json.load(f)

    plan: list[QueryEntry] = []
-
-    span_cfg = cfg.get("spans", {})
-    tmpl = span_cfg.get("_query_template", "")
-    unit = span_cfg.get("_unit", "ms")
-    for name in span_cfg.get("names", []):
-        for q in span_cfg.get("_quantiles", []):
-            expr = (
-                tmpl.replace("{quantile}", _format_quantile(q))
-                .replace("{name}", name)
-                .replace("{window}", window)
-            )
-            plan.append(
-                QueryEntry(
-                    key=f"span.{name}.p{_quantile_label(q)}",
-                    promql=expr,
-                    unit=unit,
-                )
-            )
-
-    rpc_cfg = cfg.get("rpc_methods", {})
-    tmpl = rpc_cfg.get("_query_template", "")
-    unit = rpc_cfg.get("_unit", "us")
-    for name in rpc_cfg.get("names", []):
-        for q in rpc_cfg.get("_quantiles", []):
-            expr = (
-                tmpl.replace("{quantile}", _format_quantile(q))
-                .replace("{name}", name)
-                .replace("{window}", window)
-            )
-            plan.append(
-                QueryEntry(
-                    key=f"rpc.{name}.p{_quantile_label(q)}",
-                    promql=expr,
-                    unit=unit,
-                )
-            )
-
-    job_cfg = cfg.get("job_queue", {})
-    unit = job_cfg.get("_unit", "us")
-    phases = job_cfg.get("_phases", ["queued", "running"])
-    tmpl_map = {
-        "queued": job_cfg.get("_queued_template", ""),
-        "running": job_cfg.get("_running_template", ""),
-    }
-    for name in job_cfg.get("names", []):
-        for phase in phases:
-            tmpl = tmpl_map.get(phase, "")
-            if not tmpl:
-                continue
-            for q in job_cfg.get("_quantiles", []):
-                expr = (
-                    tmpl.replace("{quantile}", _format_quantile(q))
-                    .replace("{name}", name)
-                    .replace("{window}", window)
-                )
-                plan.append(
-                    QueryEntry(
-                        key=f"job.{name}.{phase}.p{_quantile_label(q)}",
-                        promql=expr,
-                        unit=unit,
-                    )
-                )
-
+    plan.extend(_build_simple_entries(cfg.get("spans", {}), "span", window))
+    plan.extend(_build_simple_entries(cfg.get("rpc_methods", {}), "rpc", window))
+    plan.extend(_build_job_entries(cfg.get("job_queue", {}), window))
    return plan


@@ -178,7 +174,9 @@ async def _instant_query(
    """
    url = f"{prom_url.rstrip('/')}/api/v1/query"
    try:
-        async with session.post(url, data={"query": promql}, timeout=30) as resp:
+        async with session.post(
+            url, data={"query": promql}, timeout=aiohttp.ClientTimeout(total=30)
+        ) as resp:
            if resp.status != 200:
                logger.warning("query HTTP %d: %s", resp.status, promql)
                return None
--- a/docker/telemetry/workload/run-full-validation.sh
+++ b/docker/telemetry/workload/run-full-validation.sh
@@ -17,9 +17,9 @@
 #   ./run-full-validation.sh --cleanup
 #
 # Exit codes:
-#   0 — All validation checks passed
-#   1 — One or more validation checks failed
-#   2 — Infrastructure error (cluster/stack failed to start)
+#   0 — All validation checks and the regression gate passed
+#   1 — Validation checks failed OR the regression gate detected a regression
+#   2 — Infrastructure error (cluster/stack failed to start, timing capture failed)

 set -euo pipefail

@@ -385,6 +385,7 @@ if [ "$SKIP_REGRESSION" != true ]; then
        ok "Timings captured: $TIMINGS_FILE"
    else
        fail "Failed to capture timings — skipping regression comparison."
+        REGRESSION_EXIT=2
        SKIP_REGRESSION=true
    fi
 fi