mirror of
https://github.com/XRPLF/rippled.git
synced 2026-06-03 00:36:48 +00:00
- capture_timings.py: fail when captured/total ratio < 50% (--min-capture-ratio). Prevents silent pass on unreachable Prometheus. - run-full-validation.sh: set REGRESSION_EXIT=2 on capture failure so the final exit code reflects it. Update exit code docs in header. - compare_to_baseline.py: extract _skip_delta helper to bring compute_delta under 80 lines. Fix 0.0-as-falsy bug in abs_bound resolution (use explicit None check instead of `or`). Remove dead variable override_prefix_key. - prom_queries.py: extract _build_simple_entries and _build_job_entries to bring build_query_plan under 80 lines. Fix module docstring return type example. Use aiohttp.ClientTimeout instead of bare int. - telemetry-validation.yml: add set -euo pipefail to regression summary step; guard jq calls with -e flag and fallback; fail on missing baseline file; emit ::warning annotation when timings.json missing. - baselines/README.md: document the placeholder field.
213 lines
7.1 KiB
Python
213 lines
7.1 KiB
Python
#!/usr/bin/env python3
|
||
"""Shared Prometheus query helpers for the regression gate.
|
||
|
||
Single source of truth for how regression metrics are computed. Both
|
||
``capture_timings.py`` and any future tooling consume this module so metric
|
||
name → PromQL expression stays consistent.
|
||
|
||
Design:
|
||
- Every captured metric has a key in the form ``{category}.{name}.p{quantile}``
|
||
(e.g. ``span.tx.process.p99``). Keys are flat strings so JSON diffing is
|
||
trivial.
|
||
- Quantile queries go through ``histogram_quantile`` over the standard
|
||
``_bucket`` series. The rate window is a parameter (defaults to the
|
||
capture window, not Prometheus's default 5m) so short CI runs are usable.
|
||
- The catalogue of what to capture lives in ``regression-metrics.json`` —
|
||
this module only knows how to translate that JSON into HTTP queries.
|
||
|
||
Usage::

    import asyncio
    import aiohttp
    from prom_queries import build_query_plan, run_query_plan

    async def capture():
        plan = build_query_plan("regression-metrics.json", window="3m")
        async with aiohttp.ClientSession() as s:
            return await run_query_plan(s, "http://localhost:9090", plan)

    timings = asyncio.run(capture())
    # timings = {"span.tx.process.p99": {"value": 12.4, "unit": "ms"}, ...}
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import asyncio
import json
import logging
import math
from dataclasses import dataclass
from pathlib import Path
from typing import Any

import aiohttp
|
||
|
||
logger = logging.getLogger("prom_queries")
|
||
|
||
|
||
@dataclass(frozen=True)
class QueryEntry:
    """A single Prometheus measurement the regression gate should capture.

    Instances are immutable (``frozen=True``), which also makes them
    hashable and comparable by value.
    """

    # Flat output key, e.g. ``span.tx.process.p99``.
    key: str
    # PromQL expression to send to /api/v1/query.
    promql: str
    # Unit of the returned value, e.g. ``ms`` or ``us``. Baseline JSON
    # preserves this so the comparator can sanity-check unit drift.
    unit: str
|
||
|
||
|
||
def _build_simple_entries(
    cfg: dict,
    prefix: str,
    window: str,
) -> list[QueryEntry]:
    """Expand a single-template category (spans, rpc) into query entries.

    Args:
        cfg: One category section of the metrics catalogue. Reads
            ``_query_template`` (default ``""``), ``_unit`` (default
            ``"ms"``), ``names`` and ``_quantiles`` (both default empty).
        prefix: First component of every generated key.
        window: Text substituted for ``{window}`` in the template.

    Returns:
        One ``QueryEntry`` per (name, quantile) pair, names outermost,
        keyed as ``{prefix}.{name}.p{label}``.
    """
    template = cfg.get("_query_template", "")
    unit = cfg.get("_unit", "ms")
    quantiles = cfg.get("_quantiles", [])

    def render(name: str, q: float) -> str:
        # Placeholders are filled in a fixed order: quantile, name, window.
        filled = template.replace("{quantile}", _format_quantile(q))
        filled = filled.replace("{name}", name)
        return filled.replace("{window}", window)

    return [
        QueryEntry(
            # promql is listed first so the expression is rendered before
            # the key label is computed.
            promql=render(name, q),
            key=f"{prefix}.{name}.p{_quantile_label(q)}",
            unit=unit,
        )
        for name in cfg.get("names", [])
        for q in quantiles
    ]
|
||
|
||
|
||
def _build_job_entries(cfg: dict, window: str) -> list[QueryEntry]:
    """Expand the multi-phase job_queue category into query entries.

    Args:
        cfg: The job-queue section of the metrics catalogue. Reads
            ``_unit`` (default ``"us"``), ``_phases`` (default
            ``["queued", "running"]``), ``_queued_template``,
            ``_running_template``, ``names`` and ``_quantiles``.
        window: Text substituted for ``{window}`` in each template.

    Returns:
        One ``QueryEntry`` per (name, phase, quantile) combination, keyed
        as ``job.{name}.{phase}.p{label}``. Phases with no template, or an
        empty one, contribute nothing.
    """
    unit = cfg.get("_unit", "us")
    phases = cfg.get("_phases", ["queued", "running"])
    # Only these two phases have a template slot in the catalogue.
    templates = {
        "queued": cfg.get("_queued_template", ""),
        "running": cfg.get("_running_template", ""),
    }

    collected: list[QueryEntry] = []
    for job in cfg.get("names", []):
        for phase in phases:
            template = templates.get(phase, "")
            if template:
                collected.extend(
                    QueryEntry(
                        # Expression first, then the key label.
                        promql=template.replace("{quantile}", _format_quantile(q))
                        .replace("{name}", job)
                        .replace("{window}", window),
                        key=f"job.{job}.{phase}.p{_quantile_label(q)}",
                        unit=unit,
                    )
                    for q in cfg.get("_quantiles", [])
                )
    return collected
|
||
|
||
|
||
def build_query_plan(metrics_path: str | Path, window: str = "3m") -> list[QueryEntry]:
    """Translate regression-metrics.json into a list of PromQL queries.

    Args:
        metrics_path: Path to ``regression-metrics.json``.
        window:       Rate window passed to ``rate()``. For short CI runs
                      keep this close to the test duration so the bucket
                      counts are meaningful. Default 3m matches the
                      ``regression`` workload profile.

    Returns:
        A list of ``QueryEntry`` values, one per (metric × quantile),
        ordered spans, then RPC methods, then job-queue entries.

    Raises:
        OSError: If the catalogue file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON text is UTF-8; decode it explicitly rather than with whatever
    # the platform's locale encoding happens to be.
    with open(metrics_path, encoding="utf-8") as f:
        cfg = json.load(f)

    # ``or {}`` treats a section that is absent *or* explicitly ``null`` as
    # empty, so the builders always receive a mapping.
    plan: list[QueryEntry] = []
    plan.extend(_build_simple_entries(cfg.get("spans") or {}, "span", window))
    plan.extend(_build_simple_entries(cfg.get("rpc_methods") or {}, "rpc", window))
    plan.extend(_build_job_entries(cfg.get("job_queue") or {}, window))
    return plan
|
||
|
||
|
||
async def run_query_plan(
    session: aiohttp.ClientSession,
    prom_url: str,
    plan: list[QueryEntry],
) -> dict[str, dict[str, Any]]:
    """Run every query in *plan* and collect a flat ``key → {value, unit}`` map.

    Entries are queried one after another, in plan order. An entry whose
    query yields no value is still reported, with ``value`` set to
    ``None``, so the output always has one item per distinct plan key.

    Args:
        session:  Shared aiohttp session.
        prom_url: Base URL of Prometheus (e.g. ``http://localhost:9090``).
        plan:     Output of :func:`build_query_plan`.

    Returns:
        Mapping from metric key to ``{"value": float|None, "unit": str}``.
    """
    return {
        entry.key: {
            "value": await _instant_query(session, prom_url, entry.promql),
            "unit": entry.unit,
        }
        for entry in plan
    }
|
||
|
||
|
||
async def _instant_query(
    session: aiohttp.ClientSession,
    prom_url: str,
    promql: str,
) -> float | None:
    """POST an instant query to Prometheus; return a finite value or None.

    None is returned for every "no data captured" condition so call sites
    can treat them identically: a non-200 response, a transport error or
    timeout, a body that is not valid JSON, a non-``success`` status, an
    empty or unexpectedly shaped result, and any non-finite sample
    (NaN or ±Inf, however it is spelled).

    Args:
        session:  aiohttp session used to issue the request.
        prom_url: Base URL of Prometheus; a trailing ``/`` is tolerated.
        promql:   Expression sent as the ``query`` form field.

    Returns:
        The value of the first sample in the result as a float, or None.
    """
    url = f"{prom_url.rstrip('/')}/api/v1/query"
    try:
        async with session.post(
            url, data={"query": promql}, timeout=aiohttp.ClientTimeout(total=30)
        ) as resp:
            if resp.status != 200:
                logger.warning("query HTTP %d: %s", resp.status, promql)
                return None
            body = await resp.json()
    except (
        aiohttp.ClientError,
        # Before Python 3.11 asyncio.TimeoutError is a separate class from
        # the builtin TimeoutError, so both are listed.
        asyncio.TimeoutError,
        TimeoutError,
        # A body that is not valid JSON surfaces as a ValueError subclass.
        ValueError,
    ) as exc:
        logger.warning("query failed: %s — %s", promql, exc)
        return None

    status = body.get("status") if isinstance(body, dict) else None
    if status != "success":
        logger.warning("query status=%s: %s", status, promql)
        return None

    data = body.get("data")
    result = data.get("result") if isinstance(data, dict) else None
    if not result or not isinstance(result, list):
        return None

    # Each sample is expected to carry ``"value": [timestamp, "<number>"]``;
    # only the first sample is consulted.
    sample = result[0]
    pair = sample.get("value") if isinstance(sample, dict) else None
    if not isinstance(pair, (list, tuple)) or len(pair) < 2:
        return None

    try:
        value = float(pair[1])
    except (TypeError, ValueError):
        return None
    # Check finiteness numerically instead of matching specific spellings,
    # so "Inf", "+Inf", "-Inf" and "NaN" are all rejected.
    return value if math.isfinite(value) else None
|
||
|
||
|
||
def _format_quantile(q: float) -> str:
    """Render a quantile as PromQL text using general format (``0.99`` → ``"0.99"``)."""
    return format(q, "g")
|
||
|
||
|
||
def _quantile_label(q: float) -> str:
    """Format a quantile for the output key (``0.95`` → ``"95"``).

    The quantile is expressed as a percentage. Whole percentages keep
    their plain form (``0.5`` → ``"50"``, ``1.0`` → ``"100"``). Fractional
    percentages keep their fractional digits with ``_`` in place of the
    decimal point (``0.999`` → ``"99_9"``), so they are not rounded onto a
    neighbouring whole percentile and the label contains no ``.``, which
    the keys already use as a separator.

    Args:
        q: Quantile as a fraction, e.g. ``0.99``.

    Returns:
        The percentile label without the leading ``p``.
    """
    # Four decimals of a percentage absorb float noise such as
    # 0.9 * 100 == 90.00000000000001; trailing zeros are then dropped.
    percent = f"{q * 100:.4f}".rstrip("0").rstrip(".")
    return percent.replace(".", "_")
|