mirror of
https://github.com/XRPLF/rippled.git
synced 2026-06-06 02:07:07 +00:00
The Phase 10 validation harness had drifted from the code's recording surface and the telemetry-validation CI job was failing before it could build. CI fix (telemetry-validation.yml): - Replace nonexistent local action ./.github/actions/print-env with the remote XRPLF/actions/print-build-env (the build-xrpld job failed in 56s on this). - Sync prepare-runner and upload-artifact action SHAs to the canonical workflow. Recording-surface reconciliation (docker/telemetry/workload/): - Migrate span attributes from dotted xrpl.<domain>.<field> to the bare/underscore form introduced by the 2026-05-13 span-attr naming redesign (tx_hash, peer_id, ledger_seq, consensus_mode, consensus_round, full_validation, quorum, ...). Dotted xrpl.ledger.hash is retained only on peer.validation.receive (shared constant), while consensus.validation.send uses bare ledger_hash. - Fix attribute placement: tx.apply carries tx_count/tx_failed (not ledger_seq); ledger.build carries ledger_seq/close_* (not tx_count/tx_failed). - Replace the phantom rpc.request span with the real WS root rpc.ws_message; drop the never-emitted duration_ms; rebuild the parent-child map accordingly. - Add the new spans the code emits: apply-pipeline stage spans (tx.preflight/preclaim/transactor with stage/tx_type/ter_result), txq.*, consensus sub-spans (round/establish/update_positions/check/phase.open), ledger.acquire, grpc.*, pathfind.*. Conditional spans are marked optional so they are skipped (not failed) when the workload does not exercise them. - validate_telemetry.py: service.name and Loki job label rippled -> xrpld; fix PARITY_SPAN_ATTRS (rename the 4 real attrs, drop the 3 that are metrics not span attrs); add optional-span handling that skips missing optional spans while still validating attributes when present. - expected_metrics.json: rippled_ -> xrpld_ on all beast::insight/overlay metrics, xrpld_job_count, the 15 on-disk xrpld-* dashboard UIDs, and the real bare spanmetrics dimension labels. - regression-metrics.json + baseline-timings.json: rpc.request -> rpc.ws_message. Metrics pipeline fix: - Switch node [insight] config from server=statsd/prefix=rippled to server=otel + /v1/metrics endpoint + prefix=xrpld across run-full-validation.sh, xrpld-validator.cfg.template, benchmark.sh and the workload compose. The collector has no StatsD receiver, so system metrics only reach Prometheus over OTLP. Synthetic load for new spans: - Add ripple_path_find to the RPC load generator (drives pathfind.* spans). - Add a high-TPS txq-burst workload phase to force fee escalation (drives txq.*). All facts verified against the *SpanNames.h headers and a live xrpld node + collector (Tempo service.name=xrpld, tx.preflight attrs [stage,ter_result,tx_type], 279 xrpld_ Prometheus metrics and zero rippled_). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
151 lines
5.6 KiB
JSON
151 lines
5.6 KiB
JSON
{
|
|
"description": "Expected metric inventory for xrpld telemetry validation. Metric names use the xrpld_ prefix (the [insight] prefix and OTel resource service name). Sourced from the live Grafana dashboards and MetricsRegistry.cpp.",
|
|
"spanmetrics": {
|
|
"description": "SpanMetrics-derived RED metrics from the OTel Collector spanmetrics connector.",
|
|
"metrics": [
|
|
"traces_span_metrics_calls_total",
|
|
"traces_span_metrics_duration_milliseconds_bucket",
|
|
"traces_span_metrics_duration_milliseconds_count",
|
|
"traces_span_metrics_duration_milliseconds_sum"
|
|
],
|
|
"required_labels": [
|
|
"span_name",
|
|
"status_code",
|
|
"service_name",
|
|
"span_kind"
|
|
],
|
|
"dimension_labels": [
|
|
"command",
|
|
"rpc_status",
|
|
"consensus_mode",
|
|
"local",
|
|
"proposal_trusted",
|
|
"validation_trusted",
|
|
"tx_type",
|
|
"ter_result",
|
|
"stage",
|
|
"txq_status",
|
|
"close_time_correct",
|
|
"consensus_state",
|
|
"suppressed"
|
|
],
|
|
"_dimension_labels_note": "Bare label names as configured in otel-collector-config.yaml spanmetrics dimensions. Informational only (not asserted by the validator)."
|
|
},
|
|
"statsd_gauges": {
|
|
"description": "beast::insight gauges exported via OTLP/HTTP to the collector (server=otel).",
|
|
"metrics": [
|
|
"xrpld_LedgerMaster_Validated_Ledger_Age",
|
|
"xrpld_LedgerMaster_Published_Ledger_Age",
|
|
"xrpld_State_Accounting_Full_duration",
|
|
"xrpld_Peer_Finder_Active_Inbound_Peers",
|
|
"xrpld_Peer_Finder_Active_Outbound_Peers",
|
|
"xrpld_job_count"
|
|
]
|
|
},
|
|
"statsd_counters": {
|
|
"description": "beast::insight counters exported via OTLP/HTTP. The OTel Prometheus exporter appends _total to monotonic counters.",
|
|
"metrics": ["xrpld_rpc_requests_total", "xrpld_ledger_fetches_total"]
|
|
},
|
|
"statsd_histograms": {
|
|
"description": "beast::insight timers/histograms exported via OTLP/HTTP.",
|
|
"metrics": ["xrpld_rpc_time", "xrpld_rpc_size"]
|
|
},
|
|
"overlay_traffic": {
|
|
"description": "Overlay traffic metrics (subset — full list has 45+ categories).",
|
|
"metrics": [
|
|
"xrpld_total_Bytes_In",
|
|
"xrpld_total_Bytes_Out",
|
|
"xrpld_total_Messages_In",
|
|
"xrpld_total_Messages_Out"
|
|
]
|
|
},
|
|
"phase9_nodestore": {
|
|
"description": "Phase 9 NodeStore I/O observable gauge (MetricsRegistry via OTLP). Single metric with 'metric' label distinguishing sub-metrics.",
|
|
"metrics": ["xrpld_nodestore_state"]
|
|
},
|
|
"phase9_cache": {
|
|
"description": "Phase 9 cache hit rate observable gauge (MetricsRegistry via OTLP). Single metric with 'metric' label.",
|
|
"metrics": ["xrpld_cache_metrics"]
|
|
},
|
|
"phase9_txq": {
|
|
"description": "Phase 9 transaction queue observable gauge (MetricsRegistry via OTLP). Single metric with 'metric' label.",
|
|
"metrics": ["xrpld_txq_metrics"]
|
|
},
|
|
"phase9_rpc_method": {
|
|
"description": "Phase 9 per-RPC-method counters (MetricsRegistry via OTLP).",
|
|
"metrics": ["xrpld_rpc_method_started_total"]
|
|
},
|
|
"phase9_objects": {
|
|
"description": "Phase 9 counted object instances observable gauge (MetricsRegistry via OTLP).",
|
|
"metrics": ["xrpld_object_count"]
|
|
},
|
|
"phase9_load": {
|
|
"description": "Phase 9 fee escalation and load factor observable gauge (MetricsRegistry via OTLP).",
|
|
"metrics": ["xrpld_load_factor_metrics"]
|
|
},
|
|
"parity_validation_agreement": {
|
|
"description": "External dashboard parity: validation agreement percentages (MetricsRegistry).",
|
|
"metrics": [
|
|
"xrpld_validation_agreement{metric=\"agreement_pct_1h\"}",
|
|
"xrpld_validation_agreement{metric=\"agreement_pct_24h\"}"
|
|
]
|
|
},
|
|
"parity_validator_health": {
|
|
"description": "External dashboard parity: validator health indicators (MetricsRegistry).",
|
|
"metrics": [
|
|
"xrpld_validator_health{metric=\"amendment_blocked\"}",
|
|
"xrpld_validator_health{metric=\"unl_expiry_days\"}"
|
|
]
|
|
},
|
|
"parity_peer_quality": {
|
|
"description": "External dashboard parity: peer quality metrics (MetricsRegistry).",
|
|
"metrics": [
|
|
"xrpld_peer_quality{metric=\"peer_latency_p90_ms\"}",
|
|
"xrpld_peer_quality{metric=\"peers_insane_count\"}"
|
|
]
|
|
},
|
|
"parity_ledger_economy": {
|
|
"description": "External dashboard parity: ledger economy metrics (MetricsRegistry).",
|
|
"metrics": [
|
|
"xrpld_ledger_economy{metric=\"base_fee_xrp\"}",
|
|
"xrpld_ledger_economy{metric=\"transaction_rate\"}"
|
|
]
|
|
},
|
|
"parity_state_tracking": {
|
|
"description": "External dashboard parity: server state tracking (MetricsRegistry).",
|
|
"metrics": ["xrpld_state_tracking{metric=\"state_value\"}"]
|
|
},
|
|
"parity_counters": {
|
|
"description": "External dashboard parity: monotonic counters (MetricsRegistry).",
|
|
"metrics": [
|
|
"xrpld_ledgers_closed_total",
|
|
"xrpld_validations_sent_total",
|
|
"xrpld_state_changes_total"
|
|
]
|
|
},
|
|
"parity_storage": {
|
|
"description": "External dashboard parity: storage detail metrics (MetricsRegistry).",
|
|
"metrics": ["xrpld_storage_detail{metric=\"nudb_bytes\"}"]
|
|
},
|
|
"grafana_dashboards": {
|
|
"description": "All Grafana dashboards that must render data (UIDs as provisioned on disk under docker/telemetry/grafana/dashboards/).",
|
|
"uids": [
|
|
"xrpld-rpc-perf",
|
|
"xrpld-rpc-perf-otel",
|
|
"xrpld-transactions",
|
|
"xrpld-consensus",
|
|
"xrpld-ledger-ops",
|
|
"xrpld-peer-net",
|
|
"xrpld-peer-quality",
|
|
"xrpld-fee-market",
|
|
"xrpld-job-queue",
|
|
"xrpld-validator-health",
|
|
"xrpld-system-node-health",
|
|
"xrpld-system-network",
|
|
"xrpld-system-rpc",
|
|
"xrpld-system-overlay-detail",
|
|
"xrpld-system-ledger-sync"
|
|
]
|
|
}
|
|
}
|