From 711ae43174dfda22ed5257e0555fdd678dbd3674 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Tue, 31 Mar 2026 12:07:55 +0100 Subject: [PATCH] feat(telemetry): add external dashboard parity validation checks (Task 10.8) Add ~28 validation checks for external dashboard parity: - 8 span attribute checks (server_info, tx.receive, consensus, peer spans) - 13 metric existence checks (validation agreement, validator health, peer quality, ledger economy, state tracking, counters, storage) - 3 dashboard load checks (validator-health, peer-quality, system-node-health) - 4 value sanity checks (agreement %, UNL expiry, latency, state value) Co-Authored-By: Claude Opus 4.6 (1M context) --- .../telemetry/workload/expected_metrics.json | 51 +++- .../telemetry/workload/validate_telemetry.py | 228 +++++++++++++++++- 2 files changed, 276 insertions(+), 3 deletions(-) diff --git a/docker/telemetry/workload/expected_metrics.json b/docker/telemetry/workload/expected_metrics.json index f108944f0e..d79d2c776f 100644 --- a/docker/telemetry/workload/expected_metrics.json +++ b/docker/telemetry/workload/expected_metrics.json @@ -75,8 +75,52 @@ "description": "Phase 9 fee escalation and load factor observable gauge (MetricsRegistry via OTLP).", "metrics": ["load_factor_metrics"] }, + "parity_validation_agreement": { + "description": "External dashboard parity: validation agreement percentages (push_metrics.py).", + "metrics": [ + "rippled_validation_agreement{metric=\"agreement_pct_1h\"}", + "rippled_validation_agreement{metric=\"agreement_pct_24h\"}" + ] + }, + "parity_validator_health": { + "description": "External dashboard parity: validator health indicators (push_metrics.py).", + "metrics": [ + "rippled_validator_health{metric=\"amendment_blocked\"}", + "rippled_validator_health{metric=\"unl_expiry_days\"}" + ] + }, + "parity_peer_quality": { + "description": "External dashboard parity: peer quality metrics (push_metrics.py).", + 
"metrics": [ + "rippled_peer_quality{metric=\"peer_latency_p90_ms\"}", + "rippled_peer_quality{metric=\"peers_insane_count\"}" + ] + }, + "parity_ledger_economy": { + "description": "External dashboard parity: ledger economy metrics (push_metrics.py).", + "metrics": [ + "rippled_ledger_economy{metric=\"base_fee_xrp\"}", + "rippled_ledger_economy{metric=\"transaction_rate\"}" + ] + }, + "parity_state_tracking": { + "description": "External dashboard parity: server state tracking (push_metrics.py).", + "metrics": ["rippled_state_tracking{metric=\"state_value\"}"] + }, + "parity_counters": { + "description": "External dashboard parity: monotonic counters (push_metrics.py).", + "metrics": [ + "rippled_ledgers_closed_total", + "rippled_validations_sent_total", + "rippled_state_changes_total" + ] + }, + "parity_storage": { + "description": "External dashboard parity: storage detail metrics (push_metrics.py).", + "metrics": ["rippled_storage_detail{metric=\"nudb_bytes\"}"] + }, "grafana_dashboards": { - "description": "All 10 Grafana dashboards that must render data.", + "description": "All 13 Grafana dashboards that must render data.", "uids": [ "rippled-rpc-perf", "rippled-transactions", @@ -87,7 +131,10 @@ "rippled-system-network", "rippled-system-rpc", "rippled-system-overlay-detail", - "rippled-system-ledger-sync" + "rippled-system-ledger-sync", + "rippled-validator-health", + "rippled-peer-quality", + "system-node-health" ] } } diff --git a/docker/telemetry/workload/validate_telemetry.py b/docker/telemetry/workload/validate_telemetry.py index c1ec57cdc4..d4ba030782 100644 --- a/docker/telemetry/workload/validate_telemetry.py +++ b/docker/telemetry/workload/validate_telemetry.py @@ -9,7 +9,10 @@ Validation categories: 1. Span validation — All 16+ span types present with required attributes 2. Metric validation — SpanMetrics, StatsD, and Phase 9 metrics are non-zero 3. Log-trace correlation — Loki logs contain trace_id/span_id fields - 4. 
# ---------------------------------------------------------------------------
# External Dashboard Parity Validation
# ---------------------------------------------------------------------------
# Validation category 5 ("External parity"): span attributes, metric
# existence, and value sanity for the external dashboards
# (validator-health, peer-quality, system-node-health).

# Span attributes that the external dashboards depend on.  Each entry pairs
# ONE span name ("span") with ONE attribute key ("attr") that must be present
# on that span for the dashboard panels to render; a span that needs several
# attributes is listed once per attribute.
PARITY_SPAN_ATTRS: list[dict[str, str]] = [
    {"span": "rpc.command.server_info", "attr": "xrpl.node.amendment_blocked"},
    {"span": "rpc.command.server_info", "attr": "xrpl.node.server_state"},
    {"span": "tx.receive", "attr": "xrpl.peer.version"},
    {"span": "consensus.validation.send", "attr": "xrpl.validation.ledger_hash"},
    {"span": "consensus.validation.send", "attr": "xrpl.validation.full"},
    {"span": "peer.validation.receive", "attr": "xrpl.peer.validation.ledger_hash"},
    {"span": "consensus.accept", "attr": "xrpl.consensus.validation_quorum"},
    {"span": "consensus.accept", "attr": "xrpl.consensus.proposers_validated"},
]

# Value sanity bounds for external-parity metrics.  Each entry specifies a
# Prometheus query and the acceptable range [lo, hi] for the returned value.
# Entry keys: "name" (suffix of the check name), "query" (PromQL instant
# query), "lo" / "hi" (inclusive bounds; hi=None means no upper bound) and the
# optional "exclusive_lo" (True makes the lower bound strict: value > lo).
PARITY_VALUE_SANITY: list[dict[str, Any]] = [
    {
        "name": "validation_agreement_pct_1h",
        "query": 'rippled_validation_agreement{metric="agreement_pct_1h"}',
        "lo": 0,
        "hi": 100,
    },
    {
        "name": "unl_expiry_days",
        "query": 'rippled_validator_health{metric="unl_expiry_days"}',
        "lo": 0,
        "hi": None,
        "exclusive_lo": True,
    },
    {
        "name": "peer_latency_p90_ms",
        "query": 'rippled_peer_quality{metric="peer_latency_p90_ms"}',
        "lo": 0,
        "hi": None,
        "exclusive_lo": True,
    },
    {
        "name": "state_value",
        "query": 'rippled_state_tracking{metric="state_value"}',
        "lo": 0,
        "hi": 7,
    },
]


def _traces_have_tag(traces: list[dict[str, Any]], key: str) -> bool:
    """Return True if any span in *traces* has a tag whose "key" is *key*.

    Missing or null "spans" / "tags" fields are treated as empty.
    """
    return any(
        tag.get("key") == key
        for trace in traces
        for span in trace.get("spans") or []
        for tag in span.get("tags") or []
    )


def _value_in_bounds(value: float, lo: float, hi: Any, exclusive_lo: bool) -> bool:
    """Return True if *value* lies within the configured bounds.

    The lower bound is strict when *exclusive_lo* is true; *hi* may be None
    for "no upper bound".  NaN fails every comparison and is thus rejected.
    """
    above_lo = value > lo if exclusive_lo else value >= lo
    return above_lo and (hi is None or value <= hi)


async def _fetch_jaeger_traces(
    session: aiohttp.ClientSession,
    jaeger_url: str,
    operation: str,
) -> list[dict[str, Any]]:
    """Fetch up to five traces of the last hour for *operation* from Jaeger.

    Raises whatever the HTTP client raises, including an error for a
    4xx/5xx response, so callers can tell "query failed" from "no traces".
    """
    params = {
        "service": "rippled",
        "operation": operation,
        "limit": 5,
        "lookback": "1h",
    }
    async with session.get(f"{jaeger_url}/api/traces", params=params) as resp:
        # Without this, an error response with a JSON body would be
        # misreported as "no traces found".
        resp.raise_for_status()
        data = await resp.json()
    # Treat a missing or null "data" field as "no traces".
    return data.get("data") or []


async def validate_parity_span_attrs(
    session: aiohttp.ClientSession,
    jaeger_url: str,
    report: ValidationReport,
) -> None:
    """Validate span attributes required by external dashboard panels.

    For each (span, attribute) pair in PARITY_SPAN_ATTRS, queries Jaeger
    for the span and checks that the attribute key exists on at least one
    span in the returned traces.  One CheckResult (category "parity") is
    added per pair; query errors are reported as failed checks, not raised.

    Args:
        session: aiohttp client session.
        jaeger_url: Base URL for Jaeger API.
        report: ValidationReport to accumulate results.
    """
    logger.info("--- External Parity: Span Attribute Checks ---")

    # Several entries share a span name; remember each successful query so
    # the same Jaeger request is not repeated for every attribute.
    traces_by_span: dict[str, list[dict[str, Any]]] = {}

    for entry in PARITY_SPAN_ATTRS:
        span_name = entry["span"]
        attr_name = entry["attr"]
        check_name = f"parity.span_attr.{span_name}.{attr_name}"

        try:
            if span_name not in traces_by_span:
                traces_by_span[span_name] = await _fetch_jaeger_traces(
                    session, jaeger_url, span_name
                )
            traces = traces_by_span[span_name]

            if not traces:
                passed = False
                message = (
                    f"{span_name}: no traces found, "
                    f"cannot verify attr {attr_name}"
                )
            elif _traces_have_tag(traces, attr_name):
                passed = True
                message = f"{span_name}: attribute '{attr_name}' present"
            else:
                passed = False
                message = f"{span_name}: attribute '{attr_name}' missing"
        except Exception as exc:
            # Best-effort validation: one failing query must not abort the
            # remaining checks, so record it as a failed check instead.
            passed = False
            message = f"{span_name}: attr check failed ({exc})"

        report.add(
            CheckResult(
                name=check_name,
                category="parity",
                passed=passed,
                message=message,
            )
        )


async def validate_parity_value_sanity(
    session: aiohttp.ClientSession,
    prometheus_url: str,
    report: ValidationReport,
) -> None:
    """Validate that external-parity metric values fall within sane bounds.

    For each entry in PARITY_VALUE_SANITY, queries the current value from
    Prometheus and checks it against the specified [lo, hi] range.  Every
    series returned by the query must be in range for the check to pass;
    the reported value is the first offending one, or the first series'
    value when all are in range.  Errors are reported as failed checks.

    Args:
        session: aiohttp client session.
        prometheus_url: Prometheus API base URL.
        report: ValidationReport to accumulate results.
    """
    logger.info("--- External Parity: Value Sanity Checks ---")

    for entry in PARITY_VALUE_SANITY:
        name = entry["name"]
        query = entry["query"]
        lo = entry["lo"]
        hi = entry["hi"]
        exclusive_lo = entry.get("exclusive_lo", False)
        check_name = f"parity.value_sanity.{name}"

        try:
            async with session.get(
                f"{prometheus_url}/api/v1/query", params={"query": query}
            ) as resp:
                # A rejected query must surface as an error, not as
                # "no data returned".
                resp.raise_for_status()
                data = await resp.json()
            results = (data.get("data") or {}).get("result") or []

            if not results:
                report.add(
                    CheckResult(
                        name=check_name,
                        category="parity",
                        passed=False,
                        message=f"{name}: no data returned from Prometheus",
                    )
                )
                continue

            # An instant-vector sample is [timestamp, "value"]; check every
            # series so one bad series cannot hide behind the first.
            values = [float(series["value"][1]) for series in results]
            offenders = [
                v for v in values if not _value_in_bounds(v, lo, hi, exclusive_lo)
            ]
            in_range = not offenders
            value = values[0] if in_range else offenders[0]

            # Build human-readable bound description.
            lo_op = ">" if exclusive_lo else ">="
            bound_desc = f"{lo_op} {lo}"
            if hi is not None:
                bound_desc += f" and <= {hi}"

            report.add(
                CheckResult(
                    name=check_name,
                    category="parity",
                    passed=in_range,
                    message=(
                        f"{name}: value {value} is within bounds ({bound_desc})"
                        if in_range
                        else f"{name}: value {value} out of bounds "
                        f"(expected {bound_desc})"
                    ),
                    details={"value": value, "lo": lo, "hi": hi, "values": values},
                )
            )
        except Exception as exc:
            # Best-effort validation: report the failure and keep going.
            report.add(
                CheckResult(
                    name=check_name,
                    category="parity",
                    passed=False,
                    message=f"{name}: sanity check failed ({exc})",
                )
            )


# ---------------------------------------------------------------------------
# Main validation orchestrator
# ---------------------------------------------------------------------------
# run_validation() (only partially visible in this patch) awaits the parity
# checks right after validate_dashboards(session, grafana_url, report):
#     await validate_parity_span_attrs(session, jaeger_url, report)
#     await validate_parity_value_sanity(session, prometheus_url, report)