Files
rippled/docker/telemetry/workload/expected_metrics.json
Pratik Mankawde fd1c8c6060 fix(telemetry): resolve Phase 10 validation failures surfaced by first full CI run
The print-env CI fix let the Telemetry Stack Validation job build and run the
workload harness end-to-end for the first time. It reported 129/136 checks
passing; this commit fixes the 7 real failures plus a latent regression-gate bug.

Validation-suite fixes (verified against the CI run's actual emission + live node):
- expected_metrics.json: the beast::insight job-depth gauge is `xrpld_jobq_job_count`,
  not `xrpld_job_count` (the latter is a Phase 9 OTel counter). Reverted the prior
  rename. Removed the statsd_histograms block (`xrpld_rpc_time`/`xrpld_rpc_size`):
  these RPC timers do not emit under the WS workload (0 series in CI).
- expected_spans.json: `tx_status` is only set on suppressed/known-bad receives, so
  it is no longer a required attribute of every `tx.receive`. Marked `pathfind.compute`
  and `pathfind.discover` optional and the `pathfind.request -> pathfind.compute`
  hierarchy as skip — the self-to-self XRP probe returns before computing paths in a
  fresh cluster with no liquidity, so only `pathfind.request` fires.

Regression-gate bug (telemetry-validation.yml "Print regression summary"):
- `jq -e` exits non-zero when the filter's last output is `false` or `null`; `false` is the
  normal result for a populated (non-placeholder) baseline, which was misreported as
  "Failed to parse baseline JSON" and failed the job. Dropped `-e` (kept `-r`) so a
  non-zero exit genuinely means malformed JSON.

The optional-span handling and regression comparison both worked correctly in the
CI run (txq.* / pathfind.update_all skipped-when-absent, 0 regressions detected).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-05 19:26:30 +01:00

147 lines
5.4 KiB
JSON

{
"description": "Expected metric inventory for xrpld telemetry validation. Metric names use the xrpld_ prefix (xrpld is both the [insight] prefix and the OTel resource service name). Sourced from the live Grafana dashboards and MetricsRegistry.cpp.",
"spanmetrics": {
"description": "SpanMetrics-derived RED metrics from the OTel Collector spanmetrics connector.",
"metrics": [
"traces_span_metrics_calls_total",
"traces_span_metrics_duration_milliseconds_bucket",
"traces_span_metrics_duration_milliseconds_count",
"traces_span_metrics_duration_milliseconds_sum"
],
"required_labels": [
"span_name",
"status_code",
"service_name",
"span_kind"
],
"dimension_labels": [
"command",
"rpc_status",
"consensus_mode",
"local",
"proposal_trusted",
"validation_trusted",
"tx_type",
"ter_result",
"stage",
"txq_status",
"close_time_correct",
"consensus_state",
"suppressed"
],
"_dimension_labels_note": "Bare label names as configured in otel-collector-config.yaml spanmetrics dimensions. Informational only (not asserted by the validator)."
},
"statsd_gauges": {
"description": "beast::insight gauges exported via OTLP/HTTP to the collector (server=otel).",
"metrics": [
"xrpld_LedgerMaster_Validated_Ledger_Age",
"xrpld_LedgerMaster_Published_Ledger_Age",
"xrpld_State_Accounting_Full_duration",
"xrpld_Peer_Finder_Active_Inbound_Peers",
"xrpld_Peer_Finder_Active_Outbound_Peers",
"xrpld_jobq_job_count"
]
},
"statsd_counters": {
"description": "beast::insight counters exported via OTLP/HTTP. The OTel Prometheus exporter appends _total to monotonic counters.",
"metrics": ["xrpld_rpc_requests_total", "xrpld_ledger_fetches_total"]
},
"overlay_traffic": {
"description": "Overlay traffic metrics (subset — full list has 45+ categories).",
"metrics": [
"xrpld_total_Bytes_In",
"xrpld_total_Bytes_Out",
"xrpld_total_Messages_In",
"xrpld_total_Messages_Out"
]
},
"phase9_nodestore": {
"description": "Phase 9 NodeStore I/O observable gauge (MetricsRegistry via OTLP). Single metric with 'metric' label distinguishing sub-metrics.",
"metrics": ["xrpld_nodestore_state"]
},
"phase9_cache": {
"description": "Phase 9 cache hit rate observable gauge (MetricsRegistry via OTLP). Single metric with a 'metric' label distinguishing sub-metrics.",
"metrics": ["xrpld_cache_metrics"]
},
"phase9_txq": {
"description": "Phase 9 transaction queue observable gauge (MetricsRegistry via OTLP). Single metric with a 'metric' label distinguishing sub-metrics.",
"metrics": ["xrpld_txq_metrics"]
},
"phase9_rpc_method": {
"description": "Phase 9 per-RPC-method counters (MetricsRegistry via OTLP).",
"metrics": ["xrpld_rpc_method_started_total"]
},
"phase9_objects": {
"description": "Phase 9 counted object instances observable gauge (MetricsRegistry via OTLP).",
"metrics": ["xrpld_object_count"]
},
"phase9_load": {
"description": "Phase 9 fee escalation and load factor observable gauge (MetricsRegistry via OTLP).",
"metrics": ["xrpld_load_factor_metrics"]
},
"parity_validation_agreement": {
"description": "External dashboard parity: validation agreement percentages (MetricsRegistry).",
"metrics": [
"xrpld_validation_agreement{metric=\"agreement_pct_1h\"}",
"xrpld_validation_agreement{metric=\"agreement_pct_24h\"}"
]
},
"parity_validator_health": {
"description": "External dashboard parity: validator health indicators (MetricsRegistry).",
"metrics": [
"xrpld_validator_health{metric=\"amendment_blocked\"}",
"xrpld_validator_health{metric=\"unl_expiry_days\"}"
]
},
"parity_peer_quality": {
"description": "External dashboard parity: peer quality metrics (MetricsRegistry).",
"metrics": [
"xrpld_peer_quality{metric=\"peer_latency_p90_ms\"}",
"xrpld_peer_quality{metric=\"peers_insane_count\"}"
]
},
"parity_ledger_economy": {
"description": "External dashboard parity: ledger economy metrics (MetricsRegistry).",
"metrics": [
"xrpld_ledger_economy{metric=\"base_fee_xrp\"}",
"xrpld_ledger_economy{metric=\"transaction_rate\"}"
]
},
"parity_state_tracking": {
"description": "External dashboard parity: server state tracking (MetricsRegistry).",
"metrics": ["xrpld_state_tracking{metric=\"state_value\"}"]
},
"parity_counters": {
"description": "External dashboard parity: monotonic counters (MetricsRegistry).",
"metrics": [
"xrpld_ledgers_closed_total",
"xrpld_validations_sent_total",
"xrpld_state_changes_total"
]
},
"parity_storage": {
"description": "External dashboard parity: storage detail metrics (MetricsRegistry).",
"metrics": ["xrpld_storage_detail{metric=\"nudb_bytes\"}"]
},
"grafana_dashboards": {
"description": "All Grafana dashboards that must render data (UIDs as provisioned on disk under docker/telemetry/grafana/dashboards/).",
"uids": [
"xrpld-rpc-perf",
"xrpld-rpc-perf-otel",
"xrpld-transactions",
"xrpld-consensus",
"xrpld-ledger-ops",
"xrpld-peer-net",
"xrpld-peer-quality",
"xrpld-fee-market",
"xrpld-job-queue",
"xrpld-validator-health",
"xrpld-system-node-health",
"xrpld-system-network",
"xrpld-system-rpc",
"xrpld-system-overlay-detail",
"xrpld-system-ledger-sync"
]
}
}