# rippled/docker/telemetry/workload/compare_to_baseline.py

#!/usr/bin/env python3
"""Compare captured OTel timings against a committed baseline.

Operating modes (chosen automatically based on the baseline file contents):

1. **No baseline** — if ``baseline-timings.json`` has an empty
   ``metrics`` object (or is marked with ``"placeholder": true``), this
   script is in "populate" mode. It prints the captured timings JSON in
   the exact format expected for pasting into
   ``baselines/baseline-timings.json``, then exits 0. No regression check.

2. **Populated baseline** — per-metric percentage AND absolute deltas are
   computed against thresholds from ``regression-thresholds.json``. A
   regression occurs when BOTH bounds are breached for the same quantile.
   Prints a human-readable table and, when ``--report`` is given, writes a
   full JSON report.
   Exits 1 if any regression was detected, else 0.

Inputs:
    --timings     Captured timings JSON (from capture_timings.py)
    --baseline    Committed baseline JSON
    --thresholds  Threshold policy JSON
    --report      Where to write regression-report.json (optional)

Exit codes:
    0 — No baseline (paste-me emitted), OR baseline populated and no regression
    1 — Regression detected (at least one metric breached both bounds)
    2 — Internal error (e.g. unreadable input file, bad JSON). Metrics
        present on only one side are reported, not treated as errors.
"""

from __future__ import annotations

import argparse
import json
import logging
import sys
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Any

# Module-wide logger; its level and format are configured by
# logging.basicConfig() inside main().
logger = logging.getLogger("compare_to_baseline")


@dataclass
class MetricDelta:
    """Result of comparing one metric's current value with its baseline.

    One instance is produced per metric key. The field order below is the
    positional constructor order, and ``dataclasses.asdict`` turns each
    instance into one entry of the JSON report's ``metrics`` list.
    """

    # Flat metric key, e.g. ``span.tx.process.p99``.
    key: str
    # Value recorded in the baseline; None when the baseline has no value.
    baseline: float | None
    # Value from the current run; None when it was not captured.
    current: float | None
    # ``current - baseline``; None when either side is missing.
    delta: float | None
    # ``100 * delta / baseline``; None when no delta exists or baseline <= 0.
    pct_change: float | None
    # Unit string copied from the baseline entry, else the current entry.
    unit: str
    # Percentage bound resolved for this key; None when no rule applies.
    threshold_pct: float | None
    # Absolute bound resolved for this key; None when no rule applies.
    threshold_abs: float | None
    # True only when the metric was classified as a regression.
    regressed: bool
    # Human-readable classification, e.g. "REGRESSION", "improved",
    # "within bounds", or the reason the comparison was skipped.
    note: str


def load_json(path: Path) -> dict:
    """Read and parse the JSON document stored at ``path``.

    The file is decoded explicitly as UTF-8 rather than with the platform's
    locale encoding, so non-ASCII content (for example a ``µs`` unit string)
    parses identically on every machine.

    Args:
        path: Location of the JSON file.

    Returns:
        The parsed top-level JSON value. Callers expect an object (dict);
        no shape validation is done here.

    Raises:
        OSError: The file cannot be opened or read.
        json.JSONDecodeError: The content is not valid JSON.
        UnicodeDecodeError: The content is not valid UTF-8.
    """
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def is_placeholder(baseline: dict) -> bool:
    """Tell whether ``baseline`` is an unpopulated placeholder.

    True when the document carries ``"placeholder": true`` (the literal
    boolean, not merely a truthy value) or when its ``metrics`` entry is
    missing or empty.
    """
    explicitly_marked = baseline.get("placeholder") is True
    has_metrics = bool(baseline.get("metrics"))
    return explicitly_marked or not has_metrics


def print_paste_me(timings: dict) -> None:
    """Emit the captured timings as ready-to-commit baseline JSON.

    Only the JSON document goes to stdout (sorted keys, 2-space indent,
    trailing newline from ``print``); the surrounding banners and
    instructions go to stderr so stdout can be redirected straight into
    the baseline file.
    """
    rule = "=" * 72

    def _framed(*lines: str) -> None:
        # Write the given lines to stderr between two banner rules.
        print(rule, file=sys.stderr)
        for text in lines:
            print(text, file=sys.stderr)
        print(rule, file=sys.stderr)

    _framed(
        "  NO BASELINE FOUND — paste the JSON below into",
        "  docker/telemetry/workload/baselines/baseline-timings.json",
    )

    print(json.dumps(timings, indent=2, sort_keys=True))

    _framed("  (End of paste-me JSON. Gate did NOT run — baseline is empty.)")


def resolve_thresholds(
    key: str,
    thresholds: dict,
) -> tuple[float | None, float | None]:
    """Look up the ``(pct_threshold, abs_threshold)`` pair for a metric key.

    The key is split on dots: the first segment selects the category, the
    last segment is the quantile, and everything in between names the
    metric. A matching entry under ``overrides`` takes precedence over the
    category's entry under ``defaults``. ``(None, None)`` is returned when
    the key has fewer than three segments, the category is unknown, or no
    rule exists for the quantile; such metrics never gate the build.
    """
    no_rule = (None, None)

    segments = key.split(".")
    if len(segments) < 3:
        return no_rule
    prefix, *name_parts, quantile = segments

    # Key prefix -> section name used under "defaults" in the policy file.
    section = {
        "span": "span",
        "rpc": "rpc_method",
        "job": "job_queue",
    }.get(prefix)
    if section is None:
        return no_rule

    metric_name = f"{prefix}.{'.'.join(name_parts)}"
    per_metric = thresholds.get("overrides", {})
    per_category = thresholds.get("defaults", {}).get(section, {})

    rule = per_metric.get(metric_name, {}).get(quantile)
    if rule is None:
        rule = per_category.get(quantile)
    if rule is None:
        return no_rule

    # The absolute bound may be spelled in ms or in us; ms wins if both exist.
    abs_limit = rule.get("max_abs_increase_ms")
    if abs_limit is None:
        abs_limit = rule.get("max_abs_increase_us")
    return (rule.get("max_pct_increase"), abs_limit)


def _skip_delta(
    key: str,
    baseline: float | None,
    current: float | None,
    unit: str,
    thresholds: dict,
    note: str,
) -> MetricDelta:
    """Return a non-regressed MetricDelta for a metric that cannot be compared.

    The resolved thresholds are still attached so the report shows which
    bounds would have applied; ``delta`` and ``pct_change`` stay None and
    ``note`` records why the comparison was skipped.
    """
    limit_pct, limit_abs = resolve_thresholds(key, thresholds)
    fields = {
        "key": key,
        "baseline": baseline,
        "current": current,
        "delta": None,
        "pct_change": None,
        "unit": unit,
        "threshold_pct": limit_pct,
        "threshold_abs": limit_abs,
        "regressed": False,
        "note": note,
    }
    return MetricDelta(**fields)


def compute_delta(
    key: str,
    baseline_entry: dict | None,
    current_entry: dict | None,
    thresholds: dict,
) -> MetricDelta:
    """Compute a MetricDelta for one metric key.

    A regression requires BOTH bounds to be breached simultaneously. This
    tolerates small-value noise: a 100% increase on a 0.5 ms metric
    (to 1.0 ms) is not a regression under a 5 ms absolute bound.

    When the baseline value is zero (or negative) the percentage change is
    undefined and ``pct_change`` stays None. Any increase from such a
    baseline is treated as breaching the percentage bound, so the absolute
    bound alone decides; otherwise a metric with a zero baseline could grow
    without limit and never be flagged.

    Args:
        key: Flat metric key, e.g. ``span.tx.process.p99``.
        baseline_entry: Baseline entry (read for ``value`` and ``unit``),
            or None when the key is absent from the baseline.
        current_entry: Current-run entry of the same shape, or None.
        thresholds: Threshold policy passed through to
            ``resolve_thresholds``.

    Returns:
        The comparison outcome. ``regressed`` is False whenever either
        value is missing or no complete threshold pair is configured.
    """
    baseline = baseline_entry.get("value") if baseline_entry else None
    current = current_entry.get("value") if current_entry else None
    # The baseline entry's unit wins; the current entry is only a fallback.
    unit = (baseline_entry or current_entry or {}).get("unit", "")

    if baseline is None or current is None:
        if baseline is None and current is None:
            reason = "no data (neither baseline nor current)"
        elif baseline is None:
            reason = "new metric (not in baseline)"
        else:
            reason = "not captured in current run"
        return _skip_delta(key, baseline, current, unit, thresholds, reason)

    pct_threshold, abs_threshold = resolve_thresholds(key, thresholds)
    delta = current - baseline
    pct_change = (delta / baseline * 100.0) if baseline > 0 else None

    if pct_threshold is None or abs_threshold is None:
        # Without a complete (pct, abs) pair the metric is informational only.
        regressed = False
        note = "no threshold configured"
    else:
        if pct_change is None:
            # Baseline <= 0: the relative increase is unbounded, so any
            # growth counts as a pct breach and the absolute bound decides.
            pct_breach = delta > 0
        else:
            pct_breach = pct_change > pct_threshold
        abs_breach = delta > abs_threshold
        regressed = pct_breach and abs_breach

        if regressed:
            note = "REGRESSION"
        elif delta < 0:
            note = "improved"
        else:
            note = "within bounds"

    return MetricDelta(
        key=key,
        baseline=baseline,
        current=current,
        delta=delta,
        pct_change=pct_change,
        unit=unit,
        threshold_pct=pct_threshold,
        threshold_abs=abs_threshold,
        regressed=regressed,
        note=note,
    )


def print_summary(deltas: list[MetricDelta]) -> None:
    """Write the human-readable regression summary to stdout.

    Output order: a banner with the regression count, the regressions
    table (largest percentage increase first), up to five improvements
    (largest percentage decrease first), and finally the keys of baseline
    metrics that the current run did not capture.
    """
    rule = "=" * 72

    worst_first = sorted(
        (d for d in deltas if d.regressed),
        key=lambda d: -(d.pct_change or 0),
    )
    best_first = sorted(
        (
            d
            for d in deltas
            if d.delta is not None and d.delta < 0 and d.baseline not in (None, 0)
        ),
        key=lambda d: d.pct_change or 0,
    )

    print(rule)
    print(f"  Regression check: {len(worst_first)} regression(s) detected")
    print(rule)

    if worst_first:
        print("\nRegressions (breached BOTH pct AND absolute bounds):")
        _print_table(worst_first)

    if best_first:
        print("\nTop improvements:")
        _print_table(best_first[:5])

    absent_keys = [d.key for d in deltas if d.note == "not captured in current run"]
    if absent_keys:
        print(f"\n{len(absent_keys)} baseline metric(s) not captured in current run:")
        for metric_key in absent_keys:
            print(f"  {metric_key}")


def _print_table(rows: list[MetricDelta]) -> None:
    """Print a fixed-width table for a list of deltas."""
    header = f"  {'METRIC':<45} {'BASE':>10} {'CUR':>10} {'Δ':>10} {'%':>8} UNIT"
    print(header)
    print("  " + "-" * (len(header) - 2))
    for d in rows:
        base = f"{d.baseline:.2f}" if d.baseline is not None else "-"
        cur = f"{d.current:.2f}" if d.current is not None else "-"
        delta = f"{d.delta:+.2f}" if d.delta is not None else "-"
        pct = f"{d.pct_change:+.1f}%" if d.pct_change is not None else "-"
        print(f"  {d.key:<45} {base:>10} {cur:>10} {delta:>10} {pct:>8} {d.unit}")


def write_report(
    deltas: list[MetricDelta],
    report_path: Path,
    baseline: dict,
    timings: dict,
) -> None:
    """Serialise the comparison results to ``report_path`` as JSON.

    The report carries provenance copied from the baseline and timings
    documents, summary counters, and every delta converted with
    ``dataclasses.asdict``. Missing parent directories are created. The
    file is written with sorted keys, 2-space indent and a trailing newline.
    """
    regression_count = 0
    improvement_count = 0
    missing_count = 0
    for d in deltas:
        if d.regressed:
            regression_count += 1
        # Same "improvement" definition as the console summary: a negative
        # delta measured against a non-zero baseline.
        if d.delta is not None and d.delta < 0 and d.baseline not in (None, 0):
            improvement_count += 1
        if d.note == "not captured in current run":
            missing_count += 1

    summary = {
        "total": len(deltas),
        "regressions": regression_count,
        "improvements": improvement_count,
        "missing_in_current": missing_count,
    }
    payload = {
        "schema_version": 1,
        "baseline_captured_at": baseline.get("captured_at"),
        "baseline_git_sha": baseline.get("git_sha"),
        "current_captured_at": timings.get("captured_at"),
        "current_git_sha": timings.get("git_sha"),
        "window": timings.get("window"),
        "profile": timings.get("profile"),
        "summary": summary,
        "metrics": [asdict(d) for d in deltas],
    }

    report_path.parent.mkdir(parents=True, exist_ok=True)
    with open(report_path, "w") as out:
        json.dump(payload, out, indent=2, sort_keys=True)
        out.write("\n")


def main() -> int:
    """Run the baseline comparison from the command line.

    Loads the timings, baseline and threshold documents, then either emits
    the paste-me JSON (placeholder baseline) or compares every metric key
    found on either side, prints the summary and optionally writes the
    JSON report.

    Returns:
        0 when the baseline is a placeholder or no metric regressed,
        1 when at least one metric regressed,
        2 on any internal error: unreadable or malformed input, or a
        failure while comparing or writing the report. Without the explicit
        handling below an uncaught exception would end the process with
        status 1, which CI could not tell apart from a real regression.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--timings",
        type=Path,
        required=True,
        help="Captured timings JSON (from capture_timings.py)",
    )
    parser.add_argument(
        "--baseline",
        type=Path,
        required=True,
        help="Committed baseline-timings.json",
    )
    parser.add_argument(
        "--thresholds",
        type=Path,
        default=Path(__file__).parent / "regression-thresholds.json",
        help="Threshold policy JSON",
    )
    parser.add_argument(
        "--report",
        type=Path,
        default=None,
        help="Where to write regression-report.json (optional)",
    )
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format="%(levelname)s %(name)s: %(message)s",
    )

    try:
        timings = load_json(args.timings)
        baseline = load_json(args.baseline)
        thresholds = load_json(args.thresholds)
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        logger.error("failed to load inputs: %s", exc)
        return 2

    # Everything below calls .get() on these documents, so reject any
    # top-level JSON value that is not an object.
    for label, document in (
        ("timings", timings),
        ("baseline", baseline),
        ("thresholds", thresholds),
    ):
        if not isinstance(document, dict):
            logger.error(
                "%s file must contain a JSON object, got %s",
                label,
                type(document).__name__,
            )
            return 2

    if is_placeholder(baseline):
        print_paste_me(timings)
        return 0

    # ``or {}`` also tolerates an explicit ``"metrics": null``.
    baseline_metrics = baseline.get("metrics") or {}
    current_metrics = timings.get("metrics") or {}

    try:
        all_keys = sorted(set(baseline_metrics) | set(current_metrics))
        deltas = [
            compute_delta(
                key,
                baseline_metrics.get(key),
                current_metrics.get(key),
                thresholds,
            )
            for key in all_keys
        ]

        print_summary(deltas)

        if args.report:
            write_report(deltas, args.report, baseline, timings)
            logger.info("wrote %s", args.report)
    except Exception:
        # Top-level boundary: log the traceback and map the failure to the
        # internal-error exit code instead of the interpreter's default 1.
        logger.exception("comparison failed")
        return 2

    return 1 if any(d.regressed for d in deltas) else 0


if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())