mirror of
https://github.com/XRPLF/rippled.git
synced 2026-04-29 15:37:57 +00:00
- capture_timings.py: fail when the captured/total ratio is below 50% (--min-capture-ratio). Prevents a silent pass when Prometheus is unreachable.
- run-full-validation.sh: set REGRESSION_EXIT=2 on capture failure so the final exit code reflects it. Update the exit code docs in the header.
- compare_to_baseline.py: extract the _skip_delta helper to bring compute_delta under 80 lines. Fix the 0.0-as-falsy bug in abs_bound resolution (use an explicit None check instead of `or`). Remove the dead variable override_prefix_key.
- prom_queries.py: extract _build_simple_entries and _build_job_entries to bring build_query_plan under 80 lines. Fix the return type example in the module docstring. Use aiohttp.ClientTimeout instead of a bare int.
- telemetry-validation.yml: add `set -euo pipefail` to the regression summary step; guard jq calls with the -e flag and a fallback; fail on a missing baseline file; emit a ::warning annotation when timings.json is missing.
- baselines/README.md: document the placeholder field.
186 lines
5.2 KiB
Python
186 lines
5.2 KiB
Python
#!/usr/bin/env python3
|
|
"""Capture OTel-derived timings from Prometheus for the regression gate.
|
|
|
|
Queries Prometheus for every metric declared in ``regression-metrics.json``
|
|
and writes the results to a JSON file in the exact schema
|
|
``baseline-timings.json`` expects. When a user wants to refresh the
|
|
baseline, they copy a CI run's ``timings.json`` artifact (or the block
|
|
printed to the workflow step summary) into
|
|
``baselines/baseline-timings.json`` in a reviewable PR.
|
|
|
|
Output schema (stable — ``compare_to_baseline.py`` reads it verbatim)::
|
|
|
|
{
|
|
"schema_version": 1,
|
|
"captured_at": "2026-04-24T17:30:00Z",
|
|
"window": "3m",
|
|
"git_sha": "<from $GITHUB_SHA or `git rev-parse HEAD`>",
|
|
"profile": "regression",
|
|
"metrics": {
|
|
"span.tx.process.p99": {"value": 12.4, "unit": "ms"},
|
|
"rpc.server_info.p95": {"value": 850.0, "unit": "us"},
|
|
...
|
|
}
|
|
}
|
|
|
|
Usage::
|
|
|
|
python3 capture_timings.py \\
|
|
--prometheus http://localhost:9090 \\
|
|
--metrics regression-metrics.json \\
|
|
--output /tmp/timings.json \\
|
|
--window 3m \\
|
|
--profile regression
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import asyncio
|
|
import json
|
|
import logging
|
|
import os
|
|
import subprocess
|
|
import sys
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
import aiohttp
|
|
|
|
from prom_queries import build_query_plan, run_query_plan
|
|
|
|
# Named logger for this script; its level and format are configured in main().
logger = logging.getLogger("capture_timings")

# Value written to the report's "schema_version" field.
SCHEMA_VERSION = 1
|
|
|
|
|
|
async def capture(
    prom_url: str,
    metrics_path: Path,
    window: str,
    profile: str,
) -> dict:
    """Run the Prometheus query plan and assemble the timings report.

    Args:
        prom_url: Prometheus base URL handed to ``run_query_plan``.
        metrics_path: Path of the metrics declaration file handed to
            ``build_query_plan``.
        window: Window string forwarded to ``build_query_plan`` and echoed
            in the report.
        profile: Workload profile label; recorded in the report only.

    Returns:
        A dict with the keys ``schema_version``, ``captured_at`` (UTC,
        ``YYYY-MM-DDTHH:MM:SSZ``), ``window``, ``git_sha``, ``profile`` and
        ``metrics`` (the query results re-keyed in sorted name order).
    """
    query_plan = build_query_plan(metrics_path, window=window)
    logger.info(
        "Capturing %d metrics from %s (window=%s)",
        len(query_plan),
        prom_url,
        window,
    )

    # One HTTP session is shared by every query in the plan.
    async with aiohttp.ClientSession() as http:
        results = await run_query_plan(http, prom_url, query_plan)

    report = {"schema_version": SCHEMA_VERSION}
    report["captured_at"] = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    report["window"] = window
    report["git_sha"] = _detect_git_sha()
    report["profile"] = profile
    # Emit metrics in name order so successive captures diff cleanly.
    report["metrics"] = {name: results[name] for name in sorted(results)}
    return report
|
|
|
|
|
|
def _detect_git_sha() -> str:
    """Best-effort lookup of the commit SHA for the report metadata.

    Resolution order:

    1. A non-empty ``GITHUB_SHA`` environment variable.
    2. The stripped stdout of ``git rev-parse HEAD`` when it exits with 0.
    3. The literal string ``"unknown"``.

    Failures are swallowed on purpose: the SHA is only recorded in the
    captured report and does not take part in the comparison logic.
    """
    sha_from_env = os.environ.get("GITHUB_SHA")
    if sha_from_env:
        return sha_from_env

    git_command = ["git", "rev-parse", "HEAD"]
    try:
        # check=False: a non-zero exit is handled below instead of raising.
        completed = subprocess.run(
            git_command,
            capture_output=True,
            text=True,
            timeout=5,
            check=False,
        )
    except (OSError, subprocess.SubprocessError):
        # git missing, not executable, or timed out.
        return "unknown"

    return completed.stdout.strip() if completed.returncode == 0 else "unknown"
|
|
|
|
|
|
def main() -> int:
|
|
parser = argparse.ArgumentParser(description=__doc__)
|
|
parser.add_argument(
|
|
"--prometheus",
|
|
default="http://localhost:9090",
|
|
help="Prometheus base URL (default: http://localhost:9090)",
|
|
)
|
|
parser.add_argument(
|
|
"--metrics",
|
|
type=Path,
|
|
default=Path(__file__).parent / "regression-metrics.json",
|
|
help="Path to regression-metrics.json",
|
|
)
|
|
parser.add_argument(
|
|
"--output",
|
|
type=Path,
|
|
required=True,
|
|
help="Where to write the captured timings JSON",
|
|
)
|
|
parser.add_argument(
|
|
"--window",
|
|
default="3m",
|
|
help="Prometheus rate() window (default: 3m)",
|
|
)
|
|
parser.add_argument(
|
|
"--profile",
|
|
default="regression",
|
|
help="Workload profile used during capture (metadata only)",
|
|
)
|
|
parser.add_argument(
|
|
"--min-capture-ratio",
|
|
type=float,
|
|
default=0.5,
|
|
help="Fail if fewer than this fraction of metrics are captured (default: 0.5)",
|
|
)
|
|
parser.add_argument(
|
|
"--verbose",
|
|
action="store_true",
|
|
help="Enable debug logging",
|
|
)
|
|
args = parser.parse_args()
|
|
|
|
logging.basicConfig(
|
|
level=logging.DEBUG if args.verbose else logging.INFO,
|
|
format="%(levelname)s %(name)s: %(message)s",
|
|
)
|
|
|
|
report = asyncio.run(
|
|
capture(
|
|
prom_url=args.prometheus,
|
|
metrics_path=args.metrics,
|
|
window=args.window,
|
|
profile=args.profile,
|
|
)
|
|
)
|
|
|
|
args.output.parent.mkdir(parents=True, exist_ok=True)
|
|
with open(args.output, "w") as f:
|
|
json.dump(report, f, indent=2, sort_keys=True)
|
|
f.write("\n")
|
|
|
|
captured = sum(1 for v in report["metrics"].values() if v["value"] is not None)
|
|
total = len(report["metrics"])
|
|
logger.info("Wrote %s (%d/%d metrics captured)", args.output, captured, total)
|
|
|
|
if total > 0 and (captured / total) < args.min_capture_ratio:
|
|
logger.error(
|
|
"Only %d/%d (%.0f%%) metrics captured — below the %.0f%% minimum. "
|
|
"Is Prometheus reachable at %s?",
|
|
captured,
|
|
total,
|
|
captured / total * 100,
|
|
args.min_capture_ratio * 100,
|
|
args.prometheus,
|
|
)
|
|
return 1
|
|
return 0
|
|
|
|
|
|
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|