Files
rippled/docker/telemetry/workload/capture_timings.py
Pratik Mankawde 577d1f8a21 fix: address review findings in regression gate
- capture_timings.py: fail when captured/total ratio < 50%
  (--min-capture-ratio). Prevents silent pass on unreachable Prometheus.
- run-full-validation.sh: set REGRESSION_EXIT=2 on capture failure so
  the final exit code reflects it. Update exit code docs in header.
- compare_to_baseline.py: extract _skip_delta helper to bring
  compute_delta under 80 lines. Fix 0.0-as-falsy bug in abs_bound
  resolution (use explicit None check instead of `or`). Remove dead
  variable override_prefix_key.
- prom_queries.py: extract _build_simple_entries and _build_job_entries
  to bring build_query_plan under 80 lines. Fix module docstring return
  type example. Use aiohttp.ClientTimeout instead of bare int.
- telemetry-validation.yml: add set -euo pipefail to regression summary
  step; guard jq calls with -e flag and fallback; fail on missing
  baseline file; emit ::warning annotation when timings.json missing.
- baselines/README.md: document the placeholder field.
2026-04-24 19:36:15 +01:00

186 lines
5.2 KiB
Python

#!/usr/bin/env python3
"""Capture OTel-derived timings from Prometheus for the regression gate.
Queries Prometheus for every metric declared in ``regression-metrics.json``
and writes the results to a JSON file in the exact schema
``baseline-timings.json`` expects. When a user wants to refresh the
baseline, they copy a CI run's ``timings.json`` artifact (or the block
printed to the workflow step summary) into
``baselines/baseline-timings.json`` in a reviewable PR.
Output schema (stable — ``compare_to_baseline.py`` reads it verbatim)::
{
"schema_version": 1,
"captured_at": "2026-04-24T17:30:00Z",
"window": "3m",
"git_sha": "<from $GITHUB_SHA or `git rev-parse HEAD`>",
"profile": "regression",
"metrics": {
"span.tx.process.p99": {"value": 12.4, "unit": "ms"},
"rpc.server_info.p95": {"value": 850.0, "unit": "us"},
...
}
}
Usage::
python3 capture_timings.py \\
--prometheus http://localhost:9090 \\
--metrics regression-metrics.json \\
--output /tmp/timings.json \\
--window 3m \\
--profile regression
"""
from __future__ import annotations
import argparse
import asyncio
import json
import logging
import os
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path
import aiohttp
from prom_queries import build_query_plan, run_query_plan
# Module-level logger; its level and output format are configured in ``main``.
logger = logging.getLogger("capture_timings")
# Written into the ``schema_version`` field of every captured report.
SCHEMA_VERSION = 1
async def capture(
    prom_url: str,
    metrics_path: Path,
    window: str,
    profile: str,
) -> dict:
    """Query Prometheus for the declared metrics and assemble the report.

    Args:
        prom_url: Prometheus base URL handed to ``run_query_plan``.
        metrics_path: Metrics declaration file handed to ``build_query_plan``.
        window: Query window; forwarded to the plan builder and echoed
            into the report.
        profile: Workload profile name; copied into the report unchanged.

    Returns:
        A dict with the keys ``schema_version``, ``captured_at`` (UTC
        timestamp), ``window``, ``git_sha``, ``profile`` and ``metrics``
        (query results ordered by metric name).
    """
    query_plan = build_query_plan(metrics_path, window=window)
    logger.info(
        "Capturing %d metrics from %s (window=%s)",
        len(query_plan),
        prom_url,
        window,
    )

    # One HTTP session is shared by the whole plan and closed before the
    # report is assembled.
    async with aiohttp.ClientSession() as http:
        results = await run_query_plan(http, prom_url, query_plan)

    report = {"schema_version": SCHEMA_VERSION}
    report["captured_at"] = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    report["window"] = window
    report["git_sha"] = _detect_git_sha()
    report["profile"] = profile
    # Sort by metric name so the written file is stable across runs.
    report["metrics"] = {name: entry for name, entry in sorted(results.items())}
    return report
def _detect_git_sha() -> str:
    """Best-effort lookup of the commit SHA being measured.

    The ``GITHUB_SHA`` environment variable wins when it is set to a
    non-empty value; otherwise ``git rev-parse HEAD`` is tried. Any failure
    (git missing, timeout, non-zero exit) yields ``"unknown"`` rather than
    an error, because the SHA is only recorded as metadata in the report.

    Returns:
        The SHA string, or ``"unknown"`` when it cannot be determined.
    """
    sha_from_env = os.environ.get("GITHUB_SHA")
    if sha_from_env:
        return sha_from_env

    git_cmd = ["git", "rev-parse", "HEAD"]
    try:
        proc = subprocess.run(
            git_cmd,
            capture_output=True,
            text=True,
            timeout=5,
            check=False,
        )
    except (OSError, subprocess.SubprocessError):
        # git is not installed, not runnable, or timed out.
        return "unknown"

    if proc.returncode != 0:
        return "unknown"
    return proc.stdout.strip()
def _build_parser() -> argparse.ArgumentParser:
    """Return the command-line parser for the capture script."""
    parser = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring embeds a JSON schema and a shell example; the
        # default formatter would re-wrap them into a single paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--prometheus",
        default="http://localhost:9090",
        help="Prometheus base URL (default: http://localhost:9090)",
    )
    parser.add_argument(
        "--metrics",
        type=Path,
        default=Path(__file__).parent / "regression-metrics.json",
        help="Path to regression-metrics.json",
    )
    parser.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Where to write the captured timings JSON",
    )
    parser.add_argument(
        "--window",
        default="3m",
        help="Prometheus rate() window (default: 3m)",
    )
    parser.add_argument(
        "--profile",
        default="regression",
        help="Workload profile used during capture (metadata only)",
    )
    parser.add_argument(
        "--min-capture-ratio",
        type=float,
        default=0.5,
        help="Fail if fewer than this fraction of metrics are captured (default: 0.5)",
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Enable debug logging",
    )
    return parser


def main(argv: list[str] | None = None) -> int:
    """Capture timings, write them to ``--output`` and gate on the capture ratio.

    Args:
        argv: Command-line arguments without the program name. ``None``
            (the default) makes argparse read ``sys.argv[1:]``.

    Returns:
        ``0`` on success, ``1`` when the fraction of metrics with a
        non-``None`` value is below ``--min-capture-ratio``.

    Raises:
        SystemExit: Raised by argparse for ``--help`` (code 0) and for
            invalid arguments (code 2), including a ``--min-capture-ratio``
            outside the range 0..1.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    # A ratio above 1 could never be satisfied, and a negative or NaN ratio
    # would silently disable the gate, so reject them up front.
    if not 0.0 <= args.min_capture_ratio <= 1.0:
        parser.error(
            f"--min-capture-ratio must be between 0 and 1, got {args.min_capture_ratio}"
        )

    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(levelname)s %(name)s: %(message)s",
    )

    report = asyncio.run(
        capture(
            prom_url=args.prometheus,
            metrics_path=args.metrics,
            window=args.window,
            profile=args.profile,
        )
    )

    # The report is written before the ratio check, so the file exists even
    # when the gate fails.
    args.output.parent.mkdir(parents=True, exist_ok=True)
    with open(args.output, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2, sort_keys=True)
        f.write("\n")

    metrics = report["metrics"]
    total = len(metrics)
    captured = sum(1 for entry in metrics.values() if entry["value"] is not None)
    logger.info("Wrote %s (%d/%d metrics captured)", args.output, captured, total)

    if total == 0:
        # No ratio can be computed; keep the historical exit code but make
        # the vacuous pass visible in the log.
        logger.warning("No metrics were queried; capture-ratio check skipped.")
        return 0

    ratio = captured / total
    if ratio < args.min_capture_ratio:
        logger.error(
            "Only %d/%d (%.0f%%) metrics captured — below the %.0f%% minimum. "
            "Is Prometheus reachable at %s?",
            captured,
            total,
            ratio * 100,
            args.min_capture_ratio * 100,
            args.prometheus,
        )
        return 1
    return 0
# Script entry point: run the CLI and hand its return value to the shell
# as the process exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)