From 577d1f8a2107be953f008be050f7d18621e524d7 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Fri, 24 Apr 2026 19:36:15 +0100 Subject: [PATCH] fix: address review findings in regression gate - capture_timings.py: fail when captured/total ratio < 50% (--min-capture-ratio). Prevents silent pass on unreachable Prometheus. - run-full-validation.sh: set REGRESSION_EXIT=2 on capture failure so the final exit code reflects it. Update exit code docs in header. - compare_to_baseline.py: extract _skip_delta helper to bring compute_delta under 80 lines. Fix 0.0-as-falsy bug in abs_bound resolution (use explicit None check instead of `or`). Remove dead variable override_prefix_key. - prom_queries.py: extract _build_simple_entries and _build_job_entries to bring build_query_plan under 80 lines. Fix module docstring return type example. Use aiohttp.ClientTimeout instead of bare int. - telemetry-validation.yml: add set -euo pipefail to regression summary step; guard jq calls with -e flag and fallback; fail on missing baseline file; emit ::warning annotation when timings.json missing. - baselines/README.md: document the placeholder field. --- .github/workflows/telemetry-validation.yml | 19 ++- docker/telemetry/workload/baselines/README.md | 5 + docker/telemetry/workload/capture_timings.py | 18 +++ .../telemetry/workload/compare_to_baseline.py | 71 +++++----- docker/telemetry/workload/prom_queries.py | 128 +++++++++--------- .../telemetry/workload/run-full-validation.sh | 7 +- 6 files changed, 139 insertions(+), 109 deletions(-) diff --git a/.github/workflows/telemetry-validation.yml b/.github/workflows/telemetry-validation.yml index 834da6fc4e..b44a75bac1 100644 --- a/.github/workflows/telemetry-validation.yml +++ b/.github/workflows/telemetry-validation.yml @@ -238,16 +238,27 @@ jobs: - name: Print regression summary if: always() run: | + set -euo pipefail TIMINGS="/tmp/xrpld-validation/reports/timings.json" REGRESSION="/tmp/xrpld-validation/reports/regression-report.json" BASELINE="docker/telemetry/workload/baselines/baseline-timings.json" if [ ! -f "$TIMINGS" ]; then echo "## Regression Gate: no timings captured" >> "$GITHUB_STEP_SUMMARY" + echo "::warning::capture_timings.py did not produce timings.json — regression gate was not evaluated." exit 0 fi - IS_PLACEHOLDER=$(jq -r '.placeholder == true or (.metrics | length == 0)' "$BASELINE") + if [ ! -f "$BASELINE" ]; then + echo "## Regression Gate: baseline file missing" >> "$GITHUB_STEP_SUMMARY" + echo "::error::baselines/baseline-timings.json not found in checkout" + exit 1 + fi + + IS_PLACEHOLDER=$(jq -e -r '.placeholder == true or (.metrics | length == 0)' "$BASELINE") || { + echo "::error::Failed to parse baseline JSON" + exit 1 + } echo "## OTel Timings Regression Gate" >> "$GITHUB_STEP_SUMMARY" echo "" >> "$GITHUB_STEP_SUMMARY" @@ -263,9 +274,9 @@ jobs: cat "$TIMINGS" >> "$GITHUB_STEP_SUMMARY" echo '```' >> "$GITHUB_STEP_SUMMARY" elif [ -f "$REGRESSION" ]; then - REGR_COUNT=$(jq '.summary.regressions' "$REGRESSION") - IMPR_COUNT=$(jq '.summary.improvements' "$REGRESSION") - TOTAL=$(jq '.summary.total' "$REGRESSION") + REGR_COUNT=$(jq -e '.summary.regressions' "$REGRESSION") || REGR_COUNT=0 + IMPR_COUNT=$(jq -e '.summary.improvements' "$REGRESSION") || IMPR_COUNT=0 + TOTAL=$(jq -e '.summary.total' "$REGRESSION") || TOTAL=0 echo "| Stat | Count |" >> "$GITHUB_STEP_SUMMARY" echo "|------|-------|" >> "$GITHUB_STEP_SUMMARY" echo "| Metrics compared | $TOTAL |" >> "$GITHUB_STEP_SUMMARY" diff --git a/docker/telemetry/workload/baselines/README.md b/docker/telemetry/workload/baselines/README.md index 4f12646449..515a3f561f 100644 --- a/docker/telemetry/workload/baselines/README.md +++ b/docker/telemetry/workload/baselines/README.md @@ -62,6 +62,11 @@ should trace back to a real CI run so variance characteristics are preserved. } ``` +Placeholder baselines additionally include `"placeholder": true`. The comparator +detects this field (or an empty `metrics` object) to switch into "populate" mode +instead of enforcing thresholds. Remove the `placeholder` key when pasting real +captured timings. + Missing metrics (value `null`) in a captured run do not count as regressions — they are reported separately in `regression-report.json` under `missing_in_current`. This keeps the gate robust when a profile doesn't exercise every span on every run. diff --git a/docker/telemetry/workload/capture_timings.py b/docker/telemetry/workload/capture_timings.py index 9720fe92fd..6cba372d4b 100644 --- a/docker/telemetry/workload/capture_timings.py +++ b/docker/telemetry/workload/capture_timings.py @@ -131,6 +131,12 @@ def main() -> int: default="regression", help="Workload profile used during capture (metadata only)", ) + parser.add_argument( + "--min-capture-ratio", + type=float, + default=0.5, + help="Fail if fewer than this fraction of metrics are captured (default: 0.5)", + ) parser.add_argument( "--verbose", action="store_true", @@ -160,6 +166,18 @@ def main() -> int: captured = sum(1 for v in report["metrics"].values() if v["value"] is not None) total = len(report["metrics"]) logger.info("Wrote %s (%d/%d metrics captured)", args.output, captured, total) + + if total > 0 and (captured / total) < args.min_capture_ratio: + logger.error( + "Only %d/%d (%.0f%%) metrics captured — below the %.0f%% minimum. " + "Is Prometheus reachable at %s?", + captured, + total, + captured / total * 100, + args.min_capture_ratio * 100, + args.prometheus, + ) + return 1 return 0 diff --git a/docker/telemetry/workload/compare_to_baseline.py b/docker/telemetry/workload/compare_to_baseline.py index 820c93e5af..ed8c261cca 100644 --- a/docker/telemetry/workload/compare_to_baseline.py +++ b/docker/telemetry/workload/compare_to_baseline.py @@ -134,7 +134,6 @@ def resolve_thresholds( if cat is None: return (None, None) - override_prefix_key = ".".join(parts[:-1]) override_key = f"{category_key}.{'.'.join(parts[1:-1])}" overrides = thresholds.get("overrides", {}) defaults = thresholds.get("defaults", {}).get(cat, {}) @@ -146,10 +145,36 @@ def resolve_thresholds( return (None, None) pct = rule.get("max_pct_increase") - abs_bound = rule.get("max_abs_increase_ms") or rule.get("max_abs_increase_us") + abs_bound = rule.get("max_abs_increase_ms") + if abs_bound is None: + abs_bound = rule.get("max_abs_increase_us") return (pct, abs_bound) +def _skip_delta( + key: str, + baseline: float | None, + current: float | None, + unit: str, + thresholds: dict, + note: str, +) -> MetricDelta: + """Build a MetricDelta for cases where comparison is not possible.""" + pct_threshold, abs_threshold = resolve_thresholds(key, thresholds) + return MetricDelta( + key=key, + baseline=baseline, + current=current, + delta=None, + pct_change=None, + unit=unit, + threshold_pct=pct_threshold, + threshold_abs=abs_threshold, + regressed=False, + note=note, + ) + + def compute_delta( key: str, baseline_entry: dict | None, @@ -166,50 +191,22 @@ def compute_delta( current = current_entry.get("value") if current_entry else None unit = (baseline_entry or current_entry or {}).get("unit", "") - pct_threshold, abs_threshold = resolve_thresholds(key, thresholds) - if baseline is None and current is None: - return MetricDelta( - key=key, - baseline=None, - current=None, - delta=None, - pct_change=None, - unit=unit, - threshold_pct=pct_threshold, - threshold_abs=abs_threshold, - regressed=False, - note="no data (neither baseline nor current)", + return _skip_delta( + key, None, None, unit, thresholds, "no data (neither baseline nor current)" ) if baseline is None: - return MetricDelta( - key=key, - baseline=None, - current=current, - delta=None, - pct_change=None, - unit=unit, - threshold_pct=pct_threshold, - threshold_abs=abs_threshold, - regressed=False, - note="new metric (not in baseline)", + return _skip_delta( + key, None, current, unit, thresholds, "new metric (not in baseline)" ) if current is None: - return MetricDelta( - key=key, - baseline=baseline, - current=None, - delta=None, - pct_change=None, - unit=unit, - threshold_pct=pct_threshold, - threshold_abs=abs_threshold, - regressed=False, - note="not captured in current run", + return _skip_delta( + key, baseline, None, unit, thresholds, "not captured in current run" ) + pct_threshold, abs_threshold = resolve_thresholds(key, thresholds) delta = current - baseline pct_change = (delta / baseline * 100.0) if baseline > 0 else None diff --git a/docker/telemetry/workload/prom_queries.py b/docker/telemetry/workload/prom_queries.py index 359ebd956b..b257c61241 100644 --- a/docker/telemetry/workload/prom_queries.py +++ b/docker/telemetry/workload/prom_queries.py @@ -23,7 +23,7 @@ Usage:: plan = build_query_plan("regression-metrics.json", window="3m") async with aiohttp.ClientSession() as s: timings = await run_query_plan(s, "http://localhost:9090", plan) - # timings = {"span.tx.process.p99": 12.4, ...} + # timings = {"span.tx.process.p99": {"value": 12.4, "unit": "ms"}, ...} """ from __future__ import annotations @@ -56,6 +56,62 @@ class QueryEntry: unit: str +def _build_simple_entries( + cfg: dict, + prefix: str, + window: str, +) -> list[QueryEntry]: + """Build QueryEntry list for a single-template category (spans, rpc).""" + tmpl = cfg.get("_query_template", "") + unit = cfg.get("_unit", "ms") + entries: list[QueryEntry] = [] + for name in cfg.get("names", []): + for q in cfg.get("_quantiles", []): + expr = ( + tmpl.replace("{quantile}", _format_quantile(q)) + .replace("{name}", name) + .replace("{window}", window) + ) + entries.append( + QueryEntry( + key=f"{prefix}.{name}.p{_quantile_label(q)}", + promql=expr, + unit=unit, + ) + ) + return entries + + +def _build_job_entries(cfg: dict, window: str) -> list[QueryEntry]: + """Build QueryEntry list for the job_queue category (multi-phase).""" + unit = cfg.get("_unit", "us") + phases = cfg.get("_phases", ["queued", "running"]) + tmpl_map = { + "queued": cfg.get("_queued_template", ""), + "running": cfg.get("_running_template", ""), + } + entries: list[QueryEntry] = [] + for name in cfg.get("names", []): + for phase in phases: + tmpl = tmpl_map.get(phase, "") + if not tmpl: + continue + for q in cfg.get("_quantiles", []): + expr = ( + tmpl.replace("{quantile}", _format_quantile(q)) + .replace("{name}", name) + .replace("{window}", window) + ) + entries.append( + QueryEntry( + key=f"job.{name}.{phase}.p{_quantile_label(q)}", + promql=expr, + unit=unit, + ) + ) + return entries + + def build_query_plan(metrics_path: str | Path, window: str = "3m") -> list[QueryEntry]: """Translate regression-metrics.json into a list of PromQL queries. @@ -73,69 +129,9 @@ def build_query_plan(metrics_path: str | Path, window: str = "3m") -> list[Query cfg = json.load(f) plan: list[QueryEntry] = [] - - span_cfg = cfg.get("spans", {}) - tmpl = span_cfg.get("_query_template", "") - unit = span_cfg.get("_unit", "ms") - for name in span_cfg.get("names", []): - for q in span_cfg.get("_quantiles", []): - expr = ( - tmpl.replace("{quantile}", _format_quantile(q)) - .replace("{name}", name) - .replace("{window}", window) - ) - plan.append( - QueryEntry( - key=f"span.{name}.p{_quantile_label(q)}", - promql=expr, - unit=unit, - ) - ) - - rpc_cfg = cfg.get("rpc_methods", {}) - tmpl = rpc_cfg.get("_query_template", "") - unit = rpc_cfg.get("_unit", "us") - for name in rpc_cfg.get("names", []): - for q in rpc_cfg.get("_quantiles", []): - expr = ( - tmpl.replace("{quantile}", _format_quantile(q)) - .replace("{name}", name) - .replace("{window}", window) - ) - plan.append( - QueryEntry( - key=f"rpc.{name}.p{_quantile_label(q)}", - promql=expr, - unit=unit, - ) - ) - - job_cfg = cfg.get("job_queue", {}) - unit = job_cfg.get("_unit", "us") - phases = job_cfg.get("_phases", ["queued", "running"]) - tmpl_map = { - "queued": job_cfg.get("_queued_template", ""), - "running": job_cfg.get("_running_template", ""), - } - for name in job_cfg.get("names", []): - for phase in phases: - tmpl = tmpl_map.get(phase, "") - if not tmpl: - continue - for q in job_cfg.get("_quantiles", []): - expr = ( - tmpl.replace("{quantile}", _format_quantile(q)) - .replace("{name}", name) - .replace("{window}", window) - ) - plan.append( - QueryEntry( - key=f"job.{name}.{phase}.p{_quantile_label(q)}", - promql=expr, - unit=unit, - ) - ) - + plan.extend(_build_simple_entries(cfg.get("spans", {}), "span", window)) + plan.extend(_build_simple_entries(cfg.get("rpc_methods", {}), "rpc", window)) + plan.extend(_build_job_entries(cfg.get("job_queue", {}), window)) return plan @@ -178,7 +174,9 @@ async def _instant_query( """ url = f"{prom_url.rstrip('/')}/api/v1/query" try: - async with session.post(url, data={"query": promql}, timeout=30) as resp: + async with session.post( + url, data={"query": promql}, timeout=aiohttp.ClientTimeout(total=30) + ) as resp: if resp.status != 200: logger.warning("query HTTP %d: %s", resp.status, promql) return None diff --git a/docker/telemetry/workload/run-full-validation.sh b/docker/telemetry/workload/run-full-validation.sh index 72a8fe8850..a9a5d43685 100755 --- a/docker/telemetry/workload/run-full-validation.sh +++ b/docker/telemetry/workload/run-full-validation.sh @@ -17,9 +17,9 @@ # ./run-full-validation.sh --cleanup # # Exit codes: -# 0 — All validation checks passed -# 1 — One or more validation checks failed -# 2 — Infrastructure error (cluster/stack failed to start) +# 0 — All validation checks and the regression gate passed +# 1 — Validation checks failed OR the regression gate detected a regression +# 2 — Infrastructure error (cluster/stack failed to start, timing capture failed) set -euo pipefail @@ -385,6 +385,7 @@ if [ "$SKIP_REGRESSION" != true ]; then ok "Timings captured: $TIMINGS_FILE" else fail "Failed to capture timings — skipping regression comparison." + REGRESSION_EXIT=2 SKIP_REGRESSION=true fi fi