fix: address review findings in regression gate

- capture_timings.py: fail when captured/total ratio < 50%
  (--min-capture-ratio). Prevents silent pass on unreachable Prometheus.
- run-full-validation.sh: set REGRESSION_EXIT=2 on capture failure so
  the final exit code reflects it. Update exit code docs in header.
- compare_to_baseline.py: extract _skip_delta helper to bring
  compute_delta under 80 lines. Fix 0.0-as-falsy bug in abs_bound
  resolution (use explicit None check instead of `or`). Remove dead
  variable override_prefix_key.
- prom_queries.py: extract _build_simple_entries and _build_job_entries
  to bring build_query_plan under 80 lines. Fix module docstring return
  type example. Use aiohttp.ClientTimeout instead of bare int.
- telemetry-validation.yml: add set -euo pipefail to regression summary
  step; guard jq calls with -e flag and fallback; fail on missing
  baseline file; emit ::warning annotation when timings.json missing.
- baselines/README.md: document the placeholder field.
This commit is contained in:
Pratik Mankawde
2026-04-24 19:36:15 +01:00
parent df79d5e74b
commit 577d1f8a21
6 changed files with 139 additions and 109 deletions

View File

@@ -238,16 +238,27 @@ jobs:
- name: Print regression summary
if: always()
run: |
set -euo pipefail
TIMINGS="/tmp/xrpld-validation/reports/timings.json"
REGRESSION="/tmp/xrpld-validation/reports/regression-report.json"
BASELINE="docker/telemetry/workload/baselines/baseline-timings.json"
if [ ! -f "$TIMINGS" ]; then
echo "## Regression Gate: no timings captured" >> "$GITHUB_STEP_SUMMARY"
echo "::warning::capture_timings.py did not produce timings.json — regression gate was not evaluated."
exit 0
fi
IS_PLACEHOLDER=$(jq -r '.placeholder == true or (.metrics | length == 0)' "$BASELINE")
if [ ! -f "$BASELINE" ]; then
echo "## Regression Gate: baseline file missing" >> "$GITHUB_STEP_SUMMARY"
echo "::error::baselines/baseline-timings.json not found in checkout"
exit 1
fi
IS_PLACEHOLDER=$(jq -e -r '.placeholder == true or (.metrics | length == 0)' "$BASELINE") || {
echo "::error::Failed to parse baseline JSON"
exit 1
}
echo "## OTel Timings Regression Gate" >> "$GITHUB_STEP_SUMMARY"
echo "" >> "$GITHUB_STEP_SUMMARY"
@@ -263,9 +274,9 @@ jobs:
cat "$TIMINGS" >> "$GITHUB_STEP_SUMMARY"
echo '```' >> "$GITHUB_STEP_SUMMARY"
elif [ -f "$REGRESSION" ]; then
REGR_COUNT=$(jq '.summary.regressions' "$REGRESSION")
IMPR_COUNT=$(jq '.summary.improvements' "$REGRESSION")
TOTAL=$(jq '.summary.total' "$REGRESSION")
REGR_COUNT=$(jq -e '.summary.regressions' "$REGRESSION") || REGR_COUNT=0
IMPR_COUNT=$(jq -e '.summary.improvements' "$REGRESSION") || IMPR_COUNT=0
TOTAL=$(jq -e '.summary.total' "$REGRESSION") || TOTAL=0
echo "| Stat | Count |" >> "$GITHUB_STEP_SUMMARY"
echo "|------|-------|" >> "$GITHUB_STEP_SUMMARY"
echo "| Metrics compared | $TOTAL |" >> "$GITHUB_STEP_SUMMARY"

View File

@@ -62,6 +62,11 @@ should trace back to a real CI run so variance characteristics are preserved.
}
```
Placeholder baselines additionally include `"placeholder": true`. The comparator
detects this field (or an empty `metrics` object) to switch into "populate" mode
instead of enforcing thresholds. Remove the `placeholder` key when pasting real
captured timings.
Missing metrics (value `null`) in a captured run do not count as regressions — they
are reported separately in `regression-report.json` under `missing_in_current`.
This keeps the gate robust when a profile doesn't exercise every span on every run.

View File

@@ -131,6 +131,12 @@ def main() -> int:
default="regression",
help="Workload profile used during capture (metadata only)",
)
parser.add_argument(
"--min-capture-ratio",
type=float,
default=0.5,
help="Fail if fewer than this fraction of metrics are captured (default: 0.5)",
)
parser.add_argument(
"--verbose",
action="store_true",
@@ -160,6 +166,18 @@ def main() -> int:
captured = sum(1 for v in report["metrics"].values() if v["value"] is not None)
total = len(report["metrics"])
logger.info("Wrote %s (%d/%d metrics captured)", args.output, captured, total)
if total > 0 and (captured / total) < args.min_capture_ratio:
logger.error(
"Only %d/%d (%.0f%%) metrics captured — below the %.0f%% minimum. "
"Is Prometheus reachable at %s?",
captured,
total,
captured / total * 100,
args.min_capture_ratio * 100,
args.prometheus,
)
return 1
return 0

View File

@@ -134,7 +134,6 @@ def resolve_thresholds(
if cat is None:
return (None, None)
override_prefix_key = ".".join(parts[:-1])
override_key = f"{category_key}.{'.'.join(parts[1:-1])}"
overrides = thresholds.get("overrides", {})
defaults = thresholds.get("defaults", {}).get(cat, {})
@@ -146,10 +145,36 @@ def resolve_thresholds(
return (None, None)
pct = rule.get("max_pct_increase")
abs_bound = rule.get("max_abs_increase_ms") or rule.get("max_abs_increase_us")
abs_bound = rule.get("max_abs_increase_ms")
if abs_bound is None:
abs_bound = rule.get("max_abs_increase_us")
return (pct, abs_bound)
def _skip_delta(
key: str,
baseline: float | None,
current: float | None,
unit: str,
thresholds: dict,
note: str,
) -> MetricDelta:
"""Build a MetricDelta for cases where comparison is not possible."""
pct_threshold, abs_threshold = resolve_thresholds(key, thresholds)
return MetricDelta(
key=key,
baseline=baseline,
current=current,
delta=None,
pct_change=None,
unit=unit,
threshold_pct=pct_threshold,
threshold_abs=abs_threshold,
regressed=False,
note=note,
)
def compute_delta(
key: str,
baseline_entry: dict | None,
@@ -166,50 +191,22 @@ def compute_delta(
current = current_entry.get("value") if current_entry else None
unit = (baseline_entry or current_entry or {}).get("unit", "")
pct_threshold, abs_threshold = resolve_thresholds(key, thresholds)
if baseline is None and current is None:
return MetricDelta(
key=key,
baseline=None,
current=None,
delta=None,
pct_change=None,
unit=unit,
threshold_pct=pct_threshold,
threshold_abs=abs_threshold,
regressed=False,
note="no data (neither baseline nor current)",
return _skip_delta(
key, None, None, unit, thresholds, "no data (neither baseline nor current)"
)
if baseline is None:
return MetricDelta(
key=key,
baseline=None,
current=current,
delta=None,
pct_change=None,
unit=unit,
threshold_pct=pct_threshold,
threshold_abs=abs_threshold,
regressed=False,
note="new metric (not in baseline)",
return _skip_delta(
key, None, current, unit, thresholds, "new metric (not in baseline)"
)
if current is None:
return MetricDelta(
key=key,
baseline=baseline,
current=None,
delta=None,
pct_change=None,
unit=unit,
threshold_pct=pct_threshold,
threshold_abs=abs_threshold,
regressed=False,
note="not captured in current run",
return _skip_delta(
key, baseline, None, unit, thresholds, "not captured in current run"
)
pct_threshold, abs_threshold = resolve_thresholds(key, thresholds)
delta = current - baseline
pct_change = (delta / baseline * 100.0) if baseline > 0 else None

View File

@@ -23,7 +23,7 @@ Usage::
plan = build_query_plan("regression-metrics.json", window="3m")
async with aiohttp.ClientSession() as s:
timings = await run_query_plan(s, "http://localhost:9090", plan)
# timings = {"span.tx.process.p99": 12.4, ...}
# timings = {"span.tx.process.p99": {"value": 12.4, "unit": "ms"}, ...}
"""
from __future__ import annotations
@@ -56,6 +56,62 @@ class QueryEntry:
unit: str
def _build_simple_entries(
cfg: dict,
prefix: str,
window: str,
) -> list[QueryEntry]:
"""Build QueryEntry list for a single-template category (spans, rpc)."""
tmpl = cfg.get("_query_template", "")
unit = cfg.get("_unit", "ms")
entries: list[QueryEntry] = []
for name in cfg.get("names", []):
for q in cfg.get("_quantiles", []):
expr = (
tmpl.replace("{quantile}", _format_quantile(q))
.replace("{name}", name)
.replace("{window}", window)
)
entries.append(
QueryEntry(
key=f"{prefix}.{name}.p{_quantile_label(q)}",
promql=expr,
unit=unit,
)
)
return entries
def _build_job_entries(cfg: dict, window: str) -> list[QueryEntry]:
"""Build QueryEntry list for the job_queue category (multi-phase)."""
unit = cfg.get("_unit", "us")
phases = cfg.get("_phases", ["queued", "running"])
tmpl_map = {
"queued": cfg.get("_queued_template", ""),
"running": cfg.get("_running_template", ""),
}
entries: list[QueryEntry] = []
for name in cfg.get("names", []):
for phase in phases:
tmpl = tmpl_map.get(phase, "")
if not tmpl:
continue
for q in cfg.get("_quantiles", []):
expr = (
tmpl.replace("{quantile}", _format_quantile(q))
.replace("{name}", name)
.replace("{window}", window)
)
entries.append(
QueryEntry(
key=f"job.{name}.{phase}.p{_quantile_label(q)}",
promql=expr,
unit=unit,
)
)
return entries
def build_query_plan(metrics_path: str | Path, window: str = "3m") -> list[QueryEntry]:
"""Translate regression-metrics.json into a list of PromQL queries.
@@ -73,69 +129,9 @@ def build_query_plan(metrics_path: str | Path, window: str = "3m") -> list[Query
cfg = json.load(f)
plan: list[QueryEntry] = []
span_cfg = cfg.get("spans", {})
tmpl = span_cfg.get("_query_template", "")
unit = span_cfg.get("_unit", "ms")
for name in span_cfg.get("names", []):
for q in span_cfg.get("_quantiles", []):
expr = (
tmpl.replace("{quantile}", _format_quantile(q))
.replace("{name}", name)
.replace("{window}", window)
)
plan.append(
QueryEntry(
key=f"span.{name}.p{_quantile_label(q)}",
promql=expr,
unit=unit,
)
)
rpc_cfg = cfg.get("rpc_methods", {})
tmpl = rpc_cfg.get("_query_template", "")
unit = rpc_cfg.get("_unit", "us")
for name in rpc_cfg.get("names", []):
for q in rpc_cfg.get("_quantiles", []):
expr = (
tmpl.replace("{quantile}", _format_quantile(q))
.replace("{name}", name)
.replace("{window}", window)
)
plan.append(
QueryEntry(
key=f"rpc.{name}.p{_quantile_label(q)}",
promql=expr,
unit=unit,
)
)
job_cfg = cfg.get("job_queue", {})
unit = job_cfg.get("_unit", "us")
phases = job_cfg.get("_phases", ["queued", "running"])
tmpl_map = {
"queued": job_cfg.get("_queued_template", ""),
"running": job_cfg.get("_running_template", ""),
}
for name in job_cfg.get("names", []):
for phase in phases:
tmpl = tmpl_map.get(phase, "")
if not tmpl:
continue
for q in job_cfg.get("_quantiles", []):
expr = (
tmpl.replace("{quantile}", _format_quantile(q))
.replace("{name}", name)
.replace("{window}", window)
)
plan.append(
QueryEntry(
key=f"job.{name}.{phase}.p{_quantile_label(q)}",
promql=expr,
unit=unit,
)
)
plan.extend(_build_simple_entries(cfg.get("spans", {}), "span", window))
plan.extend(_build_simple_entries(cfg.get("rpc_methods", {}), "rpc", window))
plan.extend(_build_job_entries(cfg.get("job_queue", {}), window))
return plan
@@ -178,7 +174,9 @@ async def _instant_query(
"""
url = f"{prom_url.rstrip('/')}/api/v1/query"
try:
async with session.post(url, data={"query": promql}, timeout=30) as resp:
async with session.post(
url, data={"query": promql}, timeout=aiohttp.ClientTimeout(total=30)
) as resp:
if resp.status != 200:
logger.warning("query HTTP %d: %s", resp.status, promql)
return None

View File

@@ -17,9 +17,9 @@
# ./run-full-validation.sh --cleanup
#
# Exit codes:
# 0 — All validation checks passed
# 1 — One or more validation checks failed
# 2 — Infrastructure error (cluster/stack failed to start)
# 0 — All validation checks and the regression gate passed
# 1 — Validation checks failed OR the regression gate detected a regression
# 2 — Infrastructure error (cluster/stack failed to start, timing capture failed)
set -euo pipefail
@@ -385,6 +385,7 @@ if [ "$SKIP_REGRESSION" != true ]; then
ok "Timings captured: $TIMINGS_FILE"
else
fail "Failed to capture timings — skipping regression comparison."
REGRESSION_EXIT=2
SKIP_REGRESSION=true
fi
fi