mirror of
https://github.com/XRPLF/rippled.git
synced 2026-04-29 15:37:57 +00:00
fix: address review findings in regression gate
- capture_timings.py: fail when captured/total ratio < 50% (--min-capture-ratio). Prevents silent pass on unreachable Prometheus. - run-full-validation.sh: set REGRESSION_EXIT=2 on capture failure so the final exit code reflects it. Update exit code docs in header. - compare_to_baseline.py: extract _skip_delta helper to bring compute_delta under 80 lines. Fix 0.0-as-falsy bug in abs_bound resolution (use explicit None check instead of `or`). Remove dead variable override_prefix_key. - prom_queries.py: extract _build_simple_entries and _build_job_entries to bring build_query_plan under 80 lines. Fix module docstring return type example. Use aiohttp.ClientTimeout instead of bare int. - telemetry-validation.yml: add set -euo pipefail to regression summary step; guard jq calls with -e flag and fallback; fail on missing baseline file; emit ::warning annotation when timings.json missing. - baselines/README.md: document the placeholder field.
This commit is contained in:
19
.github/workflows/telemetry-validation.yml
vendored
19
.github/workflows/telemetry-validation.yml
vendored
@@ -238,16 +238,27 @@ jobs:
|
||||
- name: Print regression summary
|
||||
if: always()
|
||||
run: |
|
||||
set -euo pipefail
|
||||
TIMINGS="/tmp/xrpld-validation/reports/timings.json"
|
||||
REGRESSION="/tmp/xrpld-validation/reports/regression-report.json"
|
||||
BASELINE="docker/telemetry/workload/baselines/baseline-timings.json"
|
||||
|
||||
if [ ! -f "$TIMINGS" ]; then
|
||||
echo "## Regression Gate: no timings captured" >> "$GITHUB_STEP_SUMMARY"
|
||||
echo "::warning::capture_timings.py did not produce timings.json — regression gate was not evaluated."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
IS_PLACEHOLDER=$(jq -r '.placeholder == true or (.metrics | length == 0)' "$BASELINE")
|
||||
if [ ! -f "$BASELINE" ]; then
|
||||
echo "## Regression Gate: baseline file missing" >> "$GITHUB_STEP_SUMMARY"
|
||||
echo "::error::baselines/baseline-timings.json not found in checkout"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
IS_PLACEHOLDER=$(jq -e -r '.placeholder == true or (.metrics | length == 0)' "$BASELINE") || {
|
||||
echo "::error::Failed to parse baseline JSON"
|
||||
exit 1
|
||||
}
|
||||
|
||||
echo "## OTel Timings Regression Gate" >> "$GITHUB_STEP_SUMMARY"
|
||||
echo "" >> "$GITHUB_STEP_SUMMARY"
|
||||
@@ -263,9 +274,9 @@ jobs:
|
||||
cat "$TIMINGS" >> "$GITHUB_STEP_SUMMARY"
|
||||
echo '```' >> "$GITHUB_STEP_SUMMARY"
|
||||
elif [ -f "$REGRESSION" ]; then
|
||||
REGR_COUNT=$(jq '.summary.regressions' "$REGRESSION")
|
||||
IMPR_COUNT=$(jq '.summary.improvements' "$REGRESSION")
|
||||
TOTAL=$(jq '.summary.total' "$REGRESSION")
|
||||
REGR_COUNT=$(jq -e '.summary.regressions' "$REGRESSION") || REGR_COUNT=0
|
||||
IMPR_COUNT=$(jq -e '.summary.improvements' "$REGRESSION") || IMPR_COUNT=0
|
||||
TOTAL=$(jq -e '.summary.total' "$REGRESSION") || TOTAL=0
|
||||
echo "| Stat | Count |" >> "$GITHUB_STEP_SUMMARY"
|
||||
echo "|------|-------|" >> "$GITHUB_STEP_SUMMARY"
|
||||
echo "| Metrics compared | $TOTAL |" >> "$GITHUB_STEP_SUMMARY"
|
||||
|
||||
@@ -62,6 +62,11 @@ should trace back to a real CI run so variance characteristics are preserved.
|
||||
}
|
||||
```
|
||||
|
||||
Placeholder baselines additionally include `"placeholder": true`. The comparator
|
||||
detects this field (or an empty `metrics` object) to switch into "populate" mode
|
||||
instead of enforcing thresholds. Remove the `placeholder` key when pasting real
|
||||
captured timings.
|
||||
|
||||
Missing metrics (value `null`) in a captured run do not count as regressions — they
|
||||
are reported separately in `regression-report.json` under `missing_in_current`.
|
||||
This keeps the gate robust when a profile doesn't exercise every span on every run.
|
||||
|
||||
@@ -131,6 +131,12 @@ def main() -> int:
|
||||
default="regression",
|
||||
help="Workload profile used during capture (metadata only)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--min-capture-ratio",
|
||||
type=float,
|
||||
default=0.5,
|
||||
help="Fail if fewer than this fraction of metrics are captured (default: 0.5)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--verbose",
|
||||
action="store_true",
|
||||
@@ -160,6 +166,18 @@ def main() -> int:
|
||||
captured = sum(1 for v in report["metrics"].values() if v["value"] is not None)
|
||||
total = len(report["metrics"])
|
||||
logger.info("Wrote %s (%d/%d metrics captured)", args.output, captured, total)
|
||||
|
||||
if total > 0 and (captured / total) < args.min_capture_ratio:
|
||||
logger.error(
|
||||
"Only %d/%d (%.0f%%) metrics captured — below the %.0f%% minimum. "
|
||||
"Is Prometheus reachable at %s?",
|
||||
captured,
|
||||
total,
|
||||
captured / total * 100,
|
||||
args.min_capture_ratio * 100,
|
||||
args.prometheus,
|
||||
)
|
||||
return 1
|
||||
return 0
|
||||
|
||||
|
||||
|
||||
@@ -134,7 +134,6 @@ def resolve_thresholds(
|
||||
if cat is None:
|
||||
return (None, None)
|
||||
|
||||
override_prefix_key = ".".join(parts[:-1])
|
||||
override_key = f"{category_key}.{'.'.join(parts[1:-1])}"
|
||||
overrides = thresholds.get("overrides", {})
|
||||
defaults = thresholds.get("defaults", {}).get(cat, {})
|
||||
@@ -146,10 +145,36 @@ def resolve_thresholds(
|
||||
return (None, None)
|
||||
|
||||
pct = rule.get("max_pct_increase")
|
||||
abs_bound = rule.get("max_abs_increase_ms") or rule.get("max_abs_increase_us")
|
||||
abs_bound = rule.get("max_abs_increase_ms")
|
||||
if abs_bound is None:
|
||||
abs_bound = rule.get("max_abs_increase_us")
|
||||
return (pct, abs_bound)
|
||||
|
||||
|
||||
def _skip_delta(
|
||||
key: str,
|
||||
baseline: float | None,
|
||||
current: float | None,
|
||||
unit: str,
|
||||
thresholds: dict,
|
||||
note: str,
|
||||
) -> MetricDelta:
|
||||
"""Build a MetricDelta for cases where comparison is not possible."""
|
||||
pct_threshold, abs_threshold = resolve_thresholds(key, thresholds)
|
||||
return MetricDelta(
|
||||
key=key,
|
||||
baseline=baseline,
|
||||
current=current,
|
||||
delta=None,
|
||||
pct_change=None,
|
||||
unit=unit,
|
||||
threshold_pct=pct_threshold,
|
||||
threshold_abs=abs_threshold,
|
||||
regressed=False,
|
||||
note=note,
|
||||
)
|
||||
|
||||
|
||||
def compute_delta(
|
||||
key: str,
|
||||
baseline_entry: dict | None,
|
||||
@@ -166,50 +191,22 @@ def compute_delta(
|
||||
current = current_entry.get("value") if current_entry else None
|
||||
unit = (baseline_entry or current_entry or {}).get("unit", "")
|
||||
|
||||
pct_threshold, abs_threshold = resolve_thresholds(key, thresholds)
|
||||
|
||||
if baseline is None and current is None:
|
||||
return MetricDelta(
|
||||
key=key,
|
||||
baseline=None,
|
||||
current=None,
|
||||
delta=None,
|
||||
pct_change=None,
|
||||
unit=unit,
|
||||
threshold_pct=pct_threshold,
|
||||
threshold_abs=abs_threshold,
|
||||
regressed=False,
|
||||
note="no data (neither baseline nor current)",
|
||||
return _skip_delta(
|
||||
key, None, None, unit, thresholds, "no data (neither baseline nor current)"
|
||||
)
|
||||
|
||||
if baseline is None:
|
||||
return MetricDelta(
|
||||
key=key,
|
||||
baseline=None,
|
||||
current=current,
|
||||
delta=None,
|
||||
pct_change=None,
|
||||
unit=unit,
|
||||
threshold_pct=pct_threshold,
|
||||
threshold_abs=abs_threshold,
|
||||
regressed=False,
|
||||
note="new metric (not in baseline)",
|
||||
return _skip_delta(
|
||||
key, None, current, unit, thresholds, "new metric (not in baseline)"
|
||||
)
|
||||
|
||||
if current is None:
|
||||
return MetricDelta(
|
||||
key=key,
|
||||
baseline=baseline,
|
||||
current=None,
|
||||
delta=None,
|
||||
pct_change=None,
|
||||
unit=unit,
|
||||
threshold_pct=pct_threshold,
|
||||
threshold_abs=abs_threshold,
|
||||
regressed=False,
|
||||
note="not captured in current run",
|
||||
return _skip_delta(
|
||||
key, baseline, None, unit, thresholds, "not captured in current run"
|
||||
)
|
||||
|
||||
pct_threshold, abs_threshold = resolve_thresholds(key, thresholds)
|
||||
delta = current - baseline
|
||||
pct_change = (delta / baseline * 100.0) if baseline > 0 else None
|
||||
|
||||
|
||||
@@ -23,7 +23,7 @@ Usage::
|
||||
plan = build_query_plan("regression-metrics.json", window="3m")
|
||||
async with aiohttp.ClientSession() as s:
|
||||
timings = await run_query_plan(s, "http://localhost:9090", plan)
|
||||
# timings = {"span.tx.process.p99": 12.4, ...}
|
||||
# timings = {"span.tx.process.p99": {"value": 12.4, "unit": "ms"}, ...}
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
@@ -56,6 +56,62 @@ class QueryEntry:
|
||||
unit: str
|
||||
|
||||
|
||||
def _build_simple_entries(
|
||||
cfg: dict,
|
||||
prefix: str,
|
||||
window: str,
|
||||
) -> list[QueryEntry]:
|
||||
"""Build QueryEntry list for a single-template category (spans, rpc)."""
|
||||
tmpl = cfg.get("_query_template", "")
|
||||
unit = cfg.get("_unit", "ms")
|
||||
entries: list[QueryEntry] = []
|
||||
for name in cfg.get("names", []):
|
||||
for q in cfg.get("_quantiles", []):
|
||||
expr = (
|
||||
tmpl.replace("{quantile}", _format_quantile(q))
|
||||
.replace("{name}", name)
|
||||
.replace("{window}", window)
|
||||
)
|
||||
entries.append(
|
||||
QueryEntry(
|
||||
key=f"{prefix}.{name}.p{_quantile_label(q)}",
|
||||
promql=expr,
|
||||
unit=unit,
|
||||
)
|
||||
)
|
||||
return entries
|
||||
|
||||
|
||||
def _build_job_entries(cfg: dict, window: str) -> list[QueryEntry]:
|
||||
"""Build QueryEntry list for the job_queue category (multi-phase)."""
|
||||
unit = cfg.get("_unit", "us")
|
||||
phases = cfg.get("_phases", ["queued", "running"])
|
||||
tmpl_map = {
|
||||
"queued": cfg.get("_queued_template", ""),
|
||||
"running": cfg.get("_running_template", ""),
|
||||
}
|
||||
entries: list[QueryEntry] = []
|
||||
for name in cfg.get("names", []):
|
||||
for phase in phases:
|
||||
tmpl = tmpl_map.get(phase, "")
|
||||
if not tmpl:
|
||||
continue
|
||||
for q in cfg.get("_quantiles", []):
|
||||
expr = (
|
||||
tmpl.replace("{quantile}", _format_quantile(q))
|
||||
.replace("{name}", name)
|
||||
.replace("{window}", window)
|
||||
)
|
||||
entries.append(
|
||||
QueryEntry(
|
||||
key=f"job.{name}.{phase}.p{_quantile_label(q)}",
|
||||
promql=expr,
|
||||
unit=unit,
|
||||
)
|
||||
)
|
||||
return entries
|
||||
|
||||
|
||||
def build_query_plan(metrics_path: str | Path, window: str = "3m") -> list[QueryEntry]:
|
||||
"""Translate regression-metrics.json into a list of PromQL queries.
|
||||
|
||||
@@ -73,69 +129,9 @@ def build_query_plan(metrics_path: str | Path, window: str = "3m") -> list[Query
|
||||
cfg = json.load(f)
|
||||
|
||||
plan: list[QueryEntry] = []
|
||||
|
||||
span_cfg = cfg.get("spans", {})
|
||||
tmpl = span_cfg.get("_query_template", "")
|
||||
unit = span_cfg.get("_unit", "ms")
|
||||
for name in span_cfg.get("names", []):
|
||||
for q in span_cfg.get("_quantiles", []):
|
||||
expr = (
|
||||
tmpl.replace("{quantile}", _format_quantile(q))
|
||||
.replace("{name}", name)
|
||||
.replace("{window}", window)
|
||||
)
|
||||
plan.append(
|
||||
QueryEntry(
|
||||
key=f"span.{name}.p{_quantile_label(q)}",
|
||||
promql=expr,
|
||||
unit=unit,
|
||||
)
|
||||
)
|
||||
|
||||
rpc_cfg = cfg.get("rpc_methods", {})
|
||||
tmpl = rpc_cfg.get("_query_template", "")
|
||||
unit = rpc_cfg.get("_unit", "us")
|
||||
for name in rpc_cfg.get("names", []):
|
||||
for q in rpc_cfg.get("_quantiles", []):
|
||||
expr = (
|
||||
tmpl.replace("{quantile}", _format_quantile(q))
|
||||
.replace("{name}", name)
|
||||
.replace("{window}", window)
|
||||
)
|
||||
plan.append(
|
||||
QueryEntry(
|
||||
key=f"rpc.{name}.p{_quantile_label(q)}",
|
||||
promql=expr,
|
||||
unit=unit,
|
||||
)
|
||||
)
|
||||
|
||||
job_cfg = cfg.get("job_queue", {})
|
||||
unit = job_cfg.get("_unit", "us")
|
||||
phases = job_cfg.get("_phases", ["queued", "running"])
|
||||
tmpl_map = {
|
||||
"queued": job_cfg.get("_queued_template", ""),
|
||||
"running": job_cfg.get("_running_template", ""),
|
||||
}
|
||||
for name in job_cfg.get("names", []):
|
||||
for phase in phases:
|
||||
tmpl = tmpl_map.get(phase, "")
|
||||
if not tmpl:
|
||||
continue
|
||||
for q in job_cfg.get("_quantiles", []):
|
||||
expr = (
|
||||
tmpl.replace("{quantile}", _format_quantile(q))
|
||||
.replace("{name}", name)
|
||||
.replace("{window}", window)
|
||||
)
|
||||
plan.append(
|
||||
QueryEntry(
|
||||
key=f"job.{name}.{phase}.p{_quantile_label(q)}",
|
||||
promql=expr,
|
||||
unit=unit,
|
||||
)
|
||||
)
|
||||
|
||||
plan.extend(_build_simple_entries(cfg.get("spans", {}), "span", window))
|
||||
plan.extend(_build_simple_entries(cfg.get("rpc_methods", {}), "rpc", window))
|
||||
plan.extend(_build_job_entries(cfg.get("job_queue", {}), window))
|
||||
return plan
|
||||
|
||||
|
||||
@@ -178,7 +174,9 @@ async def _instant_query(
|
||||
"""
|
||||
url = f"{prom_url.rstrip('/')}/api/v1/query"
|
||||
try:
|
||||
async with session.post(url, data={"query": promql}, timeout=30) as resp:
|
||||
async with session.post(
|
||||
url, data={"query": promql}, timeout=aiohttp.ClientTimeout(total=30)
|
||||
) as resp:
|
||||
if resp.status != 200:
|
||||
logger.warning("query HTTP %d: %s", resp.status, promql)
|
||||
return None
|
||||
|
||||
@@ -17,9 +17,9 @@
|
||||
# ./run-full-validation.sh --cleanup
|
||||
#
|
||||
# Exit codes:
|
||||
# 0 — All validation checks passed
|
||||
# 1 — One or more validation checks failed
|
||||
# 2 — Infrastructure error (cluster/stack failed to start)
|
||||
# 0 — All validation checks and the regression gate passed
|
||||
# 1 — Validation checks failed OR the regression gate detected a regression
|
||||
# 2 — Infrastructure error (cluster/stack failed to start, timing capture failed)
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
@@ -385,6 +385,7 @@ if [ "$SKIP_REGRESSION" != true ]; then
|
||||
ok "Timings captured: $TIMINGS_FILE"
|
||||
else
|
||||
fail "Failed to capture timings — skipping regression comparison."
|
||||
REGRESSION_EXIT=2
|
||||
SKIP_REGRESSION=true
|
||||
fi
|
||||
fi
|
||||
|
||||
Reference in New Issue
Block a user