feat(telemetry): add external dashboard parity validation checks (Task 10.8)

Add 28 validation checks for external dashboard parity:
- 8 span attribute checks (server_info, tx.receive, consensus, peer spans)
- 13 metric existence checks (validation agreement, validator health,
  peer quality, ledger economy, state tracking, counters, storage)
- 3 dashboard load checks (validator-health, peer-quality, system-node-health)
- 4 value sanity checks (agreement %, UNL expiry, latency, state value)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Pratik Mankawde
2026-03-31 12:07:55 +01:00
parent 898d05de66
commit 711ae43174
2 changed files with 276 additions and 3 deletions

View File

@@ -75,8 +75,52 @@
"description": "Phase 9 fee escalation and load factor observable gauge (MetricsRegistry via OTLP).",
"metrics": ["load_factor_metrics"]
},
"parity_validation_agreement": {
"description": "External dashboard parity: validation agreement percentages (push_metrics.py).",
"metrics": [
"rippled_validation_agreement{metric=\"agreement_pct_1h\"}",
"rippled_validation_agreement{metric=\"agreement_pct_24h\"}"
]
},
"parity_validator_health": {
"description": "External dashboard parity: validator health indicators (push_metrics.py).",
"metrics": [
"rippled_validator_health{metric=\"amendment_blocked\"}",
"rippled_validator_health{metric=\"unl_expiry_days\"}"
]
},
"parity_peer_quality": {
"description": "External dashboard parity: peer quality metrics (push_metrics.py).",
"metrics": [
"rippled_peer_quality{metric=\"peer_latency_p90_ms\"}",
"rippled_peer_quality{metric=\"peers_insane_count\"}"
]
},
"parity_ledger_economy": {
"description": "External dashboard parity: ledger economy metrics (push_metrics.py).",
"metrics": [
"rippled_ledger_economy{metric=\"base_fee_xrp\"}",
"rippled_ledger_economy{metric=\"transaction_rate\"}"
]
},
"parity_state_tracking": {
"description": "External dashboard parity: server state tracking (push_metrics.py).",
"metrics": ["rippled_state_tracking{metric=\"state_value\"}"]
},
"parity_counters": {
"description": "External dashboard parity: monotonic counters (push_metrics.py).",
"metrics": [
"rippled_ledgers_closed_total",
"rippled_validations_sent_total",
"rippled_state_changes_total"
]
},
"parity_storage": {
"description": "External dashboard parity: storage detail metrics (push_metrics.py).",
"metrics": ["rippled_storage_detail{metric=\"nudb_bytes\"}"]
},
"grafana_dashboards": {
"description": "All 10 Grafana dashboards that must render data.",
"description": "All 13 Grafana dashboards that must render data.",
"uids": [
"rippled-rpc-perf",
"rippled-transactions",
@@ -87,7 +131,10 @@
"rippled-system-network",
"rippled-system-rpc",
"rippled-system-overlay-detail",
"rippled-system-ledger-sync"
"rippled-system-ledger-sync",
"rippled-validator-health",
"rippled-peer-quality",
"system-node-health"
]
}
}

View File

@@ -9,7 +9,10 @@ Validation categories:
1. Span validation — All 16+ span types present with required attributes
2. Metric validation — SpanMetrics, StatsD, and Phase 9 metrics are non-zero
3. Log-trace correlation — Loki logs contain trace_id/span_id fields
4. Dashboard validation — All 10 Grafana dashboards render data
4. Dashboard validation — All 13 Grafana dashboards render data
5. External parity — Span attrs, metric existence, and value sanity for
external dashboard parity (validator-health,
peer-quality, system-node-health)
Usage:
python3 validate_telemetry.py --report /tmp/validation-report.json
@@ -791,6 +794,227 @@ async def validate_span_durations(
)
# ---------------------------------------------------------------------------
# External Dashboard Parity Validation
# ---------------------------------------------------------------------------
# Attributes that the external dashboards (validator-health, peer-quality,
# system-node-health) read from spans, grouped by the span that carries
# them. These attributes must be present for the dashboard panels to render.
_PARITY_ATTRS_BY_SPAN: dict[str, tuple[str, ...]] = {
    "rpc.command.server_info": (
        "xrpl.node.amendment_blocked",
        "xrpl.node.server_state",
    ),
    "tx.receive": ("xrpl.peer.version",),
    "consensus.validation.send": (
        "xrpl.validation.ledger_hash",
        "xrpl.validation.full",
    ),
    "peer.validation.receive": ("xrpl.peer.validation.ledger_hash",),
    "consensus.accept": (
        "xrpl.consensus.validation_quorum",
        "xrpl.consensus.proposers_validated",
    ),
}

# Flattened to one {"span": ..., "attr": ...} record per required attribute,
# in the declaration order of the mapping above.
PARITY_SPAN_ATTRS: list[dict[str, str]] = [
    {"span": span_name, "attr": attr_name}
    for span_name, attr_names in _PARITY_ATTRS_BY_SPAN.items()
    for attr_name in attr_names
]
# Sanity bounds for the external-parity metrics. Every record carries the
# Prometheus query to run plus the range the returned value must fall in:
#   lo           -- lower bound
#   hi           -- inclusive upper bound, or None for "no upper bound"
#   exclusive_lo -- optional; when True the value must be strictly above lo
PARITY_VALUE_SANITY: list[dict[str, Any]] = [
    dict(
        name="validation_agreement_pct_1h",
        query='rippled_validation_agreement{metric="agreement_pct_1h"}',
        lo=0,
        hi=100,
    ),
    dict(
        name="unl_expiry_days",
        query='rippled_validator_health{metric="unl_expiry_days"}',
        lo=0,
        hi=None,
        exclusive_lo=True,
    ),
    dict(
        name="peer_latency_p90_ms",
        query='rippled_peer_quality{metric="peer_latency_p90_ms"}',
        lo=0,
        hi=None,
        exclusive_lo=True,
    ),
    dict(
        name="state_value",
        query='rippled_state_tracking{metric="state_value"}',
        lo=0,
        hi=7,
    ),
]
async def _fetch_span_tag_keys(
    session: aiohttp.ClientSession,
    jaeger_url: str,
    span_name: str,
) -> "set[str] | None":
    """Return every tag key seen on recent traces for one span operation.

    Args:
        session: aiohttp client session.
        jaeger_url: Base URL for Jaeger API.
        span_name: Operation name to look up under the "rippled" service.

    Returns:
        The set of tag keys found on any span of the returned traces, or
        None when Jaeger returned no traces for the operation.

    Raises:
        Exception: Any HTTP, decoding, or payload-shape error; the caller
            turns it into a failed check.
    """
    params = {
        "service": "rippled",
        "operation": span_name,
        "limit": 5,
        "lookback": "1h",
    }
    async with session.get(f"{jaeger_url}/api/traces", params=params) as resp:
        # Surface HTTP errors as errors instead of letting an error body be
        # misreported as "no traces found".
        resp.raise_for_status()
        data = await resp.json()

    # "data" may be absent or null; both mean there is nothing to inspect.
    traces = data.get("data") or []
    if not traces:
        return None

    return {
        tag.get("key")
        for trace in traces
        for span in (trace.get("spans") or [])
        for tag in (span.get("tags") or [])
    }


async def validate_parity_span_attrs(
    session: aiohttp.ClientSession,
    jaeger_url: str,
    report: ValidationReport,
) -> None:
    """Validate span attributes required by external dashboard panels.

    For each (span, attribute) pair in PARITY_SPAN_ATTRS, checks that the
    attribute key exists on at least one span in the traces Jaeger returns
    for that operation. Jaeger is queried once per distinct span name, so
    all attributes of a span are judged against the same set of traces.

    One CheckResult (category "parity") is added per pair, in
    PARITY_SPAN_ATTRS order. A lookup error fails every pair of that span.

    Args:
        session: aiohttp client session.
        jaeger_url: Base URL for Jaeger API.
        report: ValidationReport to accumulate results.
    """
    logger.info("--- External Parity: Span Attribute Checks ---")

    # span name -> tag keys seen (None when Jaeger had no traces for it)
    tag_keys_by_span: dict[str, "set[str] | None"] = {}
    # span name -> error raised while looking the span up
    errors_by_span: dict[str, Exception] = {}

    for entry in PARITY_SPAN_ATTRS:
        span_name = entry["span"]
        attr_name = entry["attr"]
        check_name = f"parity.span_attr.{span_name}.{attr_name}"

        # Several entries share a span; fetch each span only once.
        if span_name not in tag_keys_by_span and span_name not in errors_by_span:
            try:
                tag_keys_by_span[span_name] = await _fetch_span_tag_keys(
                    session, jaeger_url, span_name
                )
            except Exception as exc:
                errors_by_span[span_name] = exc

        passed = False
        if span_name in errors_by_span:
            message = (
                f"{span_name}: attr check failed ({errors_by_span[span_name]})"
            )
        else:
            tag_keys = tag_keys_by_span[span_name]
            if tag_keys is None:
                message = (
                    f"{span_name}: no traces found, "
                    f"cannot verify attr {attr_name}"
                )
            elif attr_name in tag_keys:
                passed = True
                message = f"{span_name}: attribute '{attr_name}' present"
            else:
                message = f"{span_name}: attribute '{attr_name}' missing"

        report.add(
            CheckResult(
                name=check_name,
                category="parity",
                passed=passed,
                message=message,
            )
        )
async def validate_parity_value_sanity(
    session: aiohttp.ClientSession,
    prometheus_url: str,
    report: ValidationReport,
) -> None:
    """Validate that external-parity metric values fall within sane bounds.

    For each entry in PARITY_VALUE_SANITY, queries the current value from
    Prometheus and checks it against the entry's range: above ``lo``
    (strictly, when ``exclusive_lo`` is set) and, when ``hi`` is not None,
    at most ``hi``. Only the first series returned by the query is checked.
    Non-finite samples (NaN, +Inf, -Inf) always fail, even when the entry
    has no upper bound.

    One CheckResult (category "parity") is added per entry; query errors
    are reported as failed checks rather than raised.

    Args:
        session: aiohttp client session.
        prometheus_url: Prometheus API base URL.
        report: ValidationReport to accumulate results.
    """
    import math

    logger.info("--- External Parity: Value Sanity Checks ---")

    for entry in PARITY_VALUE_SANITY:
        name = entry["name"]
        query = entry["query"]
        lo = entry["lo"]
        hi = entry["hi"]
        exclusive_lo = entry.get("exclusive_lo", False)
        check_name = f"parity.value_sanity.{name}"

        try:
            params = {"query": query}
            async with session.get(
                f"{prometheus_url}/api/v1/query", params=params
            ) as resp:
                # Surface HTTP errors as errors instead of letting an error
                # body be misreported as "no data returned".
                resp.raise_for_status()
                data = await resp.json()

            results = (data.get("data") or {}).get("result") or []
            if not results:
                report.add(
                    CheckResult(
                        name=check_name,
                        category="parity",
                        passed=False,
                        message=f"{name}: no data returned from Prometheus",
                    )
                )
                continue

            # Use the first result's value; the sample is [timestamp, "value"].
            raw_value = results[0]["value"][1]
            value = float(raw_value)

            # An infinite sample would satisfy any open-ended range, and
            # NaN/Inf do not serialize to valid JSON, so reject them here
            # and keep only the raw string in the details.
            if not math.isfinite(value):
                report.add(
                    CheckResult(
                        name=check_name,
                        category="parity",
                        passed=False,
                        message=f"{name}: value {value} is not a finite number",
                        details={"raw_value": str(raw_value), "lo": lo, "hi": hi},
                    )
                )
                continue

            # Check bounds.
            above_lo = value > lo if exclusive_lo else value >= lo
            below_hi = hi is None or value <= hi
            in_range = above_lo and below_hi

            # Build human-readable bound description.
            lo_op = ">" if exclusive_lo else ">="
            bound_desc = f"{lo_op} {lo}"
            if hi is not None:
                bound_desc += f" and <= {hi}"

            if in_range:
                message = f"{name}: value {value} is within bounds ({bound_desc})"
            else:
                message = (
                    f"{name}: value {value} out of bounds "
                    f"(expected {bound_desc})"
                )

            report.add(
                CheckResult(
                    name=check_name,
                    category="parity",
                    passed=in_range,
                    message=message,
                    details={"value": value, "lo": lo, "hi": hi},
                )
            )

        except Exception as exc:
            report.add(
                CheckResult(
                    name=check_name,
                    category="parity",
                    passed=False,
                    message=f"{name}: sanity check failed ({exc})",
                )
            )
# ---------------------------------------------------------------------------
# Main validation orchestrator
# ---------------------------------------------------------------------------
@@ -825,6 +1049,8 @@ async def run_validation(
if not skip_loki:
await validate_log_trace_correlation(session, loki_url, jaeger_url, report)
await validate_dashboards(session, grafana_url, report)
await validate_parity_span_attrs(session, jaeger_url, report)
await validate_parity_value_sanity(session, prometheus_url, report)
report.end_time = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
return report