feat: add OTel-driven regression gate for Phase 10 telemetry validation

Captures per-span / per-RPC / per-job timings from Prometheus after the
workload run and diffs them against a committed baseline. A metric counts
as a regression only when it breaches both a percentage bound and an
absolute bound, which tolerates noise on small values. When the baseline
is a placeholder, the comparator emits the captured JSON in the exact
baseline schema so it can be pasted once into
baselines/baseline-timings.json, and the CI Step Summary surfaces that
block for the reviewer.

Scope: gate only — automated baseline persistence, benchmark.sh
PromQL migration, and the historical trend dashboard remain follow-ups.
This commit is contained in:
Pratik Mankawde
2026-04-24 18:53:44 +01:00
parent 8583343fd9
commit df79d5e74b
12 changed files with 1149 additions and 42 deletions

View File

@@ -230,6 +230,58 @@ jobs:
fi
fi
# Publishes captured OTel timings + regression report to the Step Summary.
# When the committed baseline is a placeholder (or cannot be read), emits a
# fenced JSON block that can be copy-pasted directly into
# baselines/baseline-timings.json.
# When the baseline is populated, prints the comparison counts and lists
# every regressed metric so the PR author sees the failure reason without
# downloading artifacts.
- name: Print regression summary
  if: always()
  run: |
    TIMINGS="/tmp/xrpld-validation/reports/timings.json"
    REGRESSION="/tmp/xrpld-validation/reports/regression-report.json"
    BASELINE="docker/telemetry/workload/baselines/baseline-timings.json"

    if [ ! -f "$TIMINGS" ]; then
      echo "## Regression Gate: no timings captured" >> "$GITHUB_STEP_SUMMARY"
      exit 0
    fi

    # A missing or unparsable baseline makes jq exit non-zero; without the
    # fallback the default `bash -e` shell would abort this step before any
    # summary is written. Treat that case like a placeholder: the remedy
    # (commit the captured JSON as the baseline) is the same.
    IS_PLACEHOLDER=$(jq -r '.placeholder == true or (.metrics | length == 0)' "$BASELINE") \
      || IS_PLACEHOLDER="true"

    # Everything below is appended to the Step Summary through one redirect.
    {
      echo "## OTel Timings Regression Gate"
      echo ""
      if [ "$IS_PLACEHOLDER" = "true" ]; then
        echo "### Paste into \`baselines/baseline-timings.json\`"
        echo ""
        echo "The committed baseline is a placeholder. Open a PR replacing" \
          "its contents with the JSON block below to activate the" \
          "regression gate."
        echo ""
        echo '```json'
        cat "$TIMINGS"
        # Guarantee the closing fence starts on its own line even when the
        # timings file has no trailing newline (a blank line inside the
        # fence is harmless for the pasted JSON).
        echo ""
        echo '```'
      elif [ -f "$REGRESSION" ]; then
        # `// 0` keeps the counts numeric when a summary field is absent,
        # so the table never shows "null" and the -gt test stays valid.
        REGR_COUNT=$(jq -r '.summary.regressions // 0' "$REGRESSION")
        IMPR_COUNT=$(jq -r '.summary.improvements // 0' "$REGRESSION")
        TOTAL=$(jq -r '.summary.total // 0' "$REGRESSION")
        echo "| Stat | Count |"
        echo "|------|-------|"
        echo "| Metrics compared | $TOTAL |"
        echo "| Regressions | $REGR_COUNT |"
        echo "| Improvements | $IMPR_COUNT |"
        echo ""
        if [ "$REGR_COUNT" -gt 0 ]; then
          echo "### Regressions"
          echo ""
          echo "| Metric | Baseline | Current | Δ | % | Unit |"
          echo "|--------|---------:|--------:|--:|--:|------|"
          # One table row per metric whose `regressed` flag is set.
          jq -r '.metrics[] | select(.regressed) | "| \(.key) | \(.baseline) | \(.current) | \(.delta) | \(.pct_change)% | \(.unit) |"' \
            "$REGRESSION"
        fi
      else
        # Baseline is populated but the comparator left no report; say so
        # instead of publishing a bare header.
        echo "No regression report was produced at \`$REGRESSION\`."
      fi
    } >> "$GITHUB_STEP_SUMMARY"
- name: Cleanup
if: always()
run: |