Files
rippled/docker/telemetry/workload/compare_to_baseline.py
Pratik Mankawde 577d1f8a21 fix: address review findings in regression gate
- capture_timings.py: fail when captured/total ratio < 50%
  (--min-capture-ratio). Prevents silent pass on unreachable Prometheus.
- run-full-validation.sh: set REGRESSION_EXIT=2 on capture failure so
  the final exit code reflects it. Update exit code docs in header.
- compare_to_baseline.py: extract _skip_delta helper to bring
  compute_delta under 80 lines. Fix 0.0-as-falsy bug in abs_bound
  resolution (use explicit None check instead of `or`). Remove dead
  variable override_prefix_key.
- prom_queries.py: extract _build_simple_entries and _build_job_entries
  to bring build_query_plan under 80 lines. Fix module docstring return
  type example. Use aiohttp.ClientTimeout instead of bare int.
- telemetry-validation.yml: add set -euo pipefail to regression summary
  step; guard jq calls with -e flag and fallback; fail on missing
  baseline file; emit ::warning annotation when timings.json missing.
- baselines/README.md: document the placeholder field.
2026-04-24 19:36:15 +01:00

402 lines
12 KiB
Python

#!/usr/bin/env python3
"""Compare captured OTel timings against a committed baseline.
Operating modes (chosen automatically based on the baseline file contents):
1. **No baseline** — if ``baseline-timings.json`` has an empty
``metrics`` object (or is marked with ``"placeholder": true``), this
script is in "populate" mode. It prints the captured timings JSON in
the exact format expected for pasting into
``baselines/baseline-timings.json``, then exits 0. No regression check.
2. **Populated baseline** — per-metric percentage AND absolute deltas are
computed against thresholds from ``regression-thresholds.json``. A
regression occurs when BOTH bounds are breached for the same quantile.
Prints a human-readable table and writes a full JSON report.
Exits 1 if any regression was detected, else 0.
Inputs:
--timings Captured timings JSON (from capture_timings.py)
--baseline Committed baseline JSON
--thresholds Threshold policy JSON
--report Where to write regression-report.json (optional)
Exit codes:
0 — No baseline (paste-me emitted), OR baseline populated and no regression
1 — Regression detected (at least one metric breached both bounds)
2 — Internal error (e.g. bad JSON, baseline/current key mismatch)
"""
from __future__ import annotations
import argparse
import json
import logging
import sys
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Any
logger = logging.getLogger("compare_to_baseline")
@dataclass
class MetricDelta:
    """Result of comparing one metric between the baseline and the current run.

    Field order is part of the interface (positional construction and
    ``asdict`` output), so it must not be rearranged.
    """

    # Flat metric key (e.g. span.tx.process.p99).
    key: str
    # Baseline value; None when the baseline has no value for this key.
    baseline: float | None
    # Value from the current run; None when it was not captured.
    current: float | None
    # current - baseline; None if either side is None.
    delta: float | None
    # 100 * delta / baseline; None if baseline <= 0.
    pct_change: float | None
    # Unit string, taken from the baseline entry and preserved as-is.
    unit: str
    # Resolved per-metric percentage threshold (None if not configured).
    threshold_pct: float | None
    # Resolved per-metric absolute threshold (None if not configured).
    threshold_abs: float | None
    # True iff both the percentage and the absolute bound were breached.
    regressed: bool
    # Human-readable classification of the outcome.
    note: str
def load_json(path: Path) -> dict:
    """Parse the JSON document stored at ``path`` and return it.

    The file is decoded explicitly as UTF-8 (the interchange encoding for
    JSON) instead of the platform's locale-dependent default, so the same
    file parses identically on every host.

    Args:
        path: Location of the JSON file to read.

    Returns:
        The decoded JSON document. The top-level type is not validated
        here; it is whatever the file contains.

    Raises:
        OSError: If the file cannot be opened or read.
        json.JSONDecodeError: If the contents are not valid JSON.
        UnicodeDecodeError: If the contents are not valid UTF-8.
    """
    with open(path, encoding="utf-8") as f:
        return json.load(f)
def is_placeholder(baseline: dict) -> bool:
    """Report whether ``baseline`` should be treated as not yet populated.

    That is the case when its ``"placeholder"`` field is exactly ``True``,
    or when its ``"metrics"`` field is missing or empty.
    """
    flagged = baseline.get("placeholder") is True
    has_metrics = bool(baseline.get("metrics"))
    return flagged or not has_metrics
def print_paste_me(timings: dict) -> None:
    """Emit the captured timings as ready-to-paste baseline JSON.

    Only the JSON document goes to stdout (sorted keys, 2-space indent,
    trailing newline); the surrounding banners and instructions go to
    stderr so stdout can be redirected straight into the baseline file.
    """
    rule = "=" * 72
    preamble = (
        rule,
        " NO BASELINE FOUND — paste the JSON below into",
        " docker/telemetry/workload/baselines/baseline-timings.json",
        rule,
    )
    epilogue = (
        rule,
        " (End of paste-me JSON. Gate did NOT run — baseline is empty.)",
        rule,
    )

    for line in preamble:
        print(line, file=sys.stderr)
    print(json.dumps(timings, indent=2, sort_keys=True))
    for line in epilogue:
        print(line, file=sys.stderr)
def resolve_thresholds(
    key: str,
    thresholds: dict,
) -> tuple[float | None, float | None]:
    """Look up the ``(pct_threshold, abs_threshold)`` pair for a metric key.

    The key is split on dots: the first segment selects the category, the
    last segment is the quantile, and everything up to (but excluding) the
    quantile forms the override key. A rule found under ``"overrides"`` for
    that key and quantile is used as-is; otherwise the category's rule under
    ``"defaults"`` applies.

    Returns ``(None, None)`` when the key has fewer than three segments,
    the category is unknown, or no rule exists for the quantile. Such
    metrics are captured but never gate the build.
    """
    segments = key.split(".")
    if len(segments) < 3:
        return (None, None)

    prefix, *middle, quantile = segments
    section = {
        "span": "span",
        "rpc": "rpc_method",
        "job": "job_queue",
    }.get(prefix)
    if section is None:
        return (None, None)

    metric_id = ".".join([prefix, *middle])
    per_metric = thresholds.get("overrides", {})
    per_category = thresholds.get("defaults", {}).get(section, {})

    policy = per_metric.get(metric_id, {}).get(quantile)
    if policy is None:
        policy = per_category.get(quantile)
        if policy is None:
            return (None, None)

    pct_limit = policy.get("max_pct_increase")
    # Explicit None check: a configured bound of 0.0 is valid and must not
    # fall through to the microsecond field.
    abs_limit = policy.get("max_abs_increase_ms")
    if abs_limit is None:
        abs_limit = policy.get("max_abs_increase_us")
    return (pct_limit, abs_limit)
def _skip_delta(
    key: str,
    baseline: float | None,
    current: float | None,
    unit: str,
    thresholds: dict,
    note: str,
) -> MetricDelta:
    """Return a non-regressed MetricDelta for a metric that cannot be compared.

    The resolved thresholds are still recorded, while ``delta`` and
    ``pct_change`` are left as ``None`` and ``note`` carries the reason.
    """
    limits = resolve_thresholds(key, thresholds)
    return MetricDelta(
        key=key,
        baseline=baseline,
        current=current,
        delta=None,
        pct_change=None,
        unit=unit,
        threshold_pct=limits[0],
        threshold_abs=limits[1],
        regressed=False,
        note=note,
    )
def compute_delta(
    key: str,
    baseline_entry: dict | None,
    current_entry: dict | None,
    thresholds: dict,
) -> MetricDelta:
    """Compare one metric's current value with its baseline value.

    A metric counts as regressed only when the percentage bound AND the
    absolute bound are both exceeded. Requiring both tolerates noise on
    small values: a 100% increase on a 0.5 ms metric (to 1.0 ms) is not a
    regression under a 5 ms absolute bound.

    When either side has no value the metric is reported as skipped, and
    when no complete threshold pair is configured it is reported without
    gating.
    """
    base_val = baseline_entry.get("value") if baseline_entry else None
    cur_val = current_entry.get("value") if current_entry else None
    unit = (baseline_entry or current_entry or {}).get("unit", "")

    # Nothing to compare when at least one side is missing.
    if base_val is None or cur_val is None:
        if base_val is None and cur_val is None:
            reason = "no data (neither baseline nor current)"
        elif base_val is None:
            reason = "new metric (not in baseline)"
        else:
            reason = "not captured in current run"
        return _skip_delta(key, base_val, cur_val, unit, thresholds, reason)

    pct_limit, abs_limit = resolve_thresholds(key, thresholds)
    delta = cur_val - base_val
    # A percentage is only meaningful against a positive baseline.
    pct_change = None
    if base_val > 0:
        pct_change = delta / base_val * 100.0

    if pct_limit is None or abs_limit is None:
        regressed = False
        note = "no threshold configured"
    else:
        pct_breach = pct_change is not None and pct_change > pct_limit
        abs_breach = delta > abs_limit
        regressed = pct_breach and abs_breach
        if regressed:
            note = "REGRESSION"
        elif delta < 0:
            note = "improved"
        else:
            note = "within bounds"

    return MetricDelta(
        key=key,
        baseline=base_val,
        current=cur_val,
        delta=delta,
        pct_change=pct_change,
        unit=unit,
        threshold_pct=pct_limit,
        threshold_abs=abs_limit,
        regressed=regressed,
        note=note,
    )
def print_summary(deltas: list[MetricDelta]) -> None:
    """Write a human-readable regression summary to stdout.

    Sections, in order: a banner with the regression count, the regressed
    metrics (largest percentage increase first), up to five improvements
    (largest percentage decrease first), and the keys of baseline metrics
    that were not captured in the current run.
    """

    def pct_or_zero(entry: MetricDelta) -> float:
        return entry.pct_change or 0

    def is_improvement(entry: MetricDelta) -> bool:
        return (
            entry.delta is not None
            and entry.delta < 0
            and entry.baseline not in (None, 0)
        )

    regressions = sorted(
        (entry for entry in deltas if entry.regressed),
        key=lambda entry: -pct_or_zero(entry),
    )
    improvements = sorted(filter(is_improvement, deltas), key=pct_or_zero)

    print("=" * 72)
    print(f" Regression check: {len(regressions)} regression(s) detected")
    print("=" * 72)

    if regressions:
        print("\nRegressions (breached BOTH pct AND absolute bounds):")
        _print_table(regressions)

    if improvements:
        print("\nTop improvements:")
        _print_table(improvements[:5])

    missing = [d for d in deltas if d.note == "not captured in current run"]
    if not missing:
        return
    print(f"\n{len(missing)} baseline metric(s) not captured in current run:")
    for d in missing:
        print(f" {d.key}")
def _print_table(rows: list[MetricDelta]) -> None:
"""Print a fixed-width table for a list of deltas."""
header = f" {'METRIC':<45} {'BASE':>10} {'CUR':>10} {'Δ':>10} {'%':>8} UNIT"
print(header)
print(" " + "-" * (len(header) - 2))
for d in rows:
base = f"{d.baseline:.2f}" if d.baseline is not None else "-"
cur = f"{d.current:.2f}" if d.current is not None else "-"
delta = f"{d.delta:+.2f}" if d.delta is not None else "-"
pct = f"{d.pct_change:+.1f}%" if d.pct_change is not None else "-"
print(f" {d.key:<45} {base:>10} {cur:>10} {delta:>10} {pct:>8} {d.unit}")
def write_report(
    deltas: list[MetricDelta],
    report_path: Path,
    baseline: dict,
    timings: dict,
) -> None:
    """Serialize the comparison to ``report_path`` as a JSON artifact for CI.

    The report holds provenance fields copied from the baseline and the
    current timings, summary counts, and one dict per metric. Missing
    parent directories are created; the file is written with sorted keys,
    2-space indent, and a trailing newline.
    """
    improved_count = len(
        [
            d
            for d in deltas
            if d.delta is not None and d.delta < 0 and d.baseline not in (None, 0)
        ]
    )
    missing_count = len(
        [d for d in deltas if d.note == "not captured in current run"]
    )
    regressed_count = len([d for d in deltas if d.regressed])

    summary = {
        "total": len(deltas),
        "regressions": regressed_count,
        "improvements": improved_count,
        "missing_in_current": missing_count,
    }
    payload = {
        "schema_version": 1,
        "baseline_captured_at": baseline.get("captured_at"),
        "baseline_git_sha": baseline.get("git_sha"),
        "current_captured_at": timings.get("captured_at"),
        "current_git_sha": timings.get("git_sha"),
        "window": timings.get("window"),
        "profile": timings.get("profile"),
        "summary": summary,
        "metrics": [asdict(d) for d in deltas],
    }

    report_path.parent.mkdir(parents=True, exist_ok=True)
    with report_path.open("w") as out:
        json.dump(payload, out, indent=2, sort_keys=True)
        out.write("\n")
def main() -> int:
    """CLI entry point: load the inputs, run the comparison, return a status.

    Returns:
        0 when the baseline is a placeholder (paste-me JSON emitted) or no
        metric regressed; 1 when at least one metric regressed; 2 on any
        internal error (unreadable or invalid input files, malformed metric
        or threshold entries, or failure to write the report).

    Every failure path is mapped to 2 explicitly. An uncaught exception
    would make the interpreter exit with status 1, which callers would
    misread as "regression detected".
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--timings",
        type=Path,
        required=True,
        help="Captured timings JSON (from capture_timings.py)",
    )
    parser.add_argument(
        "--baseline",
        type=Path,
        required=True,
        help="Committed baseline-timings.json",
    )
    parser.add_argument(
        "--thresholds",
        type=Path,
        default=Path(__file__).parent / "regression-thresholds.json",
        help="Threshold policy JSON",
    )
    parser.add_argument(
        "--report",
        type=Path,
        default=None,
        help="Where to write regression-report.json (optional)",
    )
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format="%(levelname)s %(name)s: %(message)s",
    )

    try:
        timings = load_json(args.timings)
        baseline = load_json(args.baseline)
        thresholds = load_json(args.thresholds)
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        logger.error("failed to load inputs: %s", exc)
        return 2

    # Everything below calls .get() on these documents, so reject any
    # top-level JSON value that is not an object.
    documents = {
        "timings": timings,
        "baseline": baseline,
        "thresholds": thresholds,
    }
    for label, document in documents.items():
        if not isinstance(document, dict):
            logger.error(
                "%s JSON must be an object at top level, got %s",
                label,
                type(document).__name__,
            )
            return 2

    if is_placeholder(baseline):
        print_paste_me(timings)
        return 0

    baseline_metrics = baseline.get("metrics", {})
    current_metrics = timings.get("metrics", {})
    if not isinstance(baseline_metrics, dict) or not isinstance(
        current_metrics, dict
    ):
        logger.error('"metrics" must be a JSON object in both baseline and timings')
        return 2

    all_keys = sorted(set(baseline_metrics) | set(current_metrics))
    try:
        deltas = [
            compute_delta(
                key,
                baseline_metrics.get(key),
                current_metrics.get(key),
                thresholds,
            )
            for key in all_keys
        ]
    except (AttributeError, TypeError) as exc:
        # E.g. a metric entry that is not an object, or a non-numeric value.
        logger.error("malformed metric or threshold entry: %s", exc)
        return 2

    print_summary(deltas)

    if args.report:
        try:
            write_report(deltas, args.report, baseline, timings)
        except OSError as exc:
            logger.error("failed to write report %s: %s", args.report, exc)
            return 2
        logger.info("wrote %s", args.report)

    return 1 if any(d.regressed for d in deltas) else 0
# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())