mirror of
https://github.com/XRPLF/rippled.git
synced 2026-06-06 02:07:07 +00:00
The Phase 10 validation harness had drifted from the code's recording surface and the telemetry-validation CI job was failing before it could build.

CI fix (telemetry-validation.yml):
- Replace nonexistent local action ./.github/actions/print-env with the remote XRPLF/actions/print-build-env (the build-xrpld job failed in 56s on this).
- Sync prepare-runner and upload-artifact action SHAs to the canonical workflow.

Recording-surface reconciliation (docker/telemetry/workload/):
- Migrate span attributes from dotted xrpl.<domain>.<field> to the bare/underscore form introduced by the 2026-05-13 span-attr naming redesign (tx_hash, peer_id, ledger_seq, consensus_mode, consensus_round, full_validation, quorum, ...). Dotted xrpl.ledger.hash is retained only on peer.validation.receive (shared constant), while consensus.validation.send uses bare ledger_hash.
- Fix attribute placement: tx.apply carries tx_count/tx_failed (not ledger_seq); ledger.build carries ledger_seq/close_* (not tx_count/tx_failed).
- Replace the phantom rpc.request span with the real WS root rpc.ws_message; drop the never-emitted duration_ms; rebuild the parent-child map accordingly.
- Add the new spans the code emits: apply-pipeline stage spans (tx.preflight/preclaim/transactor with stage/tx_type/ter_result), txq.*, consensus sub-spans (round/establish/update_positions/check/phase.open), ledger.acquire, grpc.*, pathfind.*. Conditional spans are marked optional so they are skipped (not failed) when the workload does not exercise them.
- validate_telemetry.py: service.name and Loki job label rippled -> xrpld; fix PARITY_SPAN_ATTRS (rename the 4 real attrs, drop the 3 that are metrics not span attrs); add optional-span handling that skips missing optional spans while still validating attributes when present.
- expected_metrics.json: rippled_ -> xrpld_ on all beast::insight/overlay metrics, xrpld_job_count, the 15 on-disk xrpld-* dashboard UIDs, and the real bare spanmetrics dimension labels.
- regression-metrics.json + baseline-timings.json: rpc.request -> rpc.ws_message.

Metrics pipeline fix:
- Switch node [insight] config from server=statsd/prefix=rippled to server=otel + /v1/metrics endpoint + prefix=xrpld across run-full-validation.sh, xrpld-validator.cfg.template, benchmark.sh and the workload compose. The collector has no StatsD receiver, so system metrics only reach Prometheus over OTLP.

Synthetic load for new spans:
- Add ripple_path_find to the RPC load generator (drives pathfind.* spans).
- Add a high-TPS txq-burst workload phase to force fee escalation (drives txq.*).

All facts verified against the *SpanNames.h headers and a live xrpld node + collector (Tempo service.name=xrpld, tx.preflight attrs [stage,ter_result,tx_type], 279 xrpld_ Prometheus metrics and zero rippled_).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
420 lines
14 KiB
JSON
420 lines
14 KiB
JSON
{
|
|
"description": "Expected span inventory for xrpld telemetry validation. Attribute keys follow the 2026-05-13 span-attr naming redesign (bare/underscore form; dotted xrpl.* reserved for resource attributes). Sourced from the *SpanNames.h headers. Spans marked \"optional\": true are conditional — they only fire under traffic the harness may not produce (e.g. gRPC client, missing-ledger fetch, mode transitions) and are not failed when absent.",
|
|
"spans": [
|
|
{
|
|
"name": "rpc.ws_message",
|
|
"category": "rpc",
|
|
"parent": null,
|
|
"required_attributes": ["command"],
|
|
"config_flag": "trace_rpc",
|
|
"note": "WebSocket RPC root span. The load generator uses WS, so this is the RPC entry span (not rpc.http_request, which needs an HTTP/JSON-RPC client)."
|
|
},
|
|
{
|
|
"name": "rpc.process",
|
|
"category": "rpc",
|
|
"parent": "rpc.ws_message",
|
|
"required_attributes": [],
|
|
"config_flag": "trace_rpc"
|
|
},
|
|
{
|
|
"name": "rpc.command.*",
|
|
"category": "rpc",
|
|
"parent": "rpc.process",
|
|
"required_attributes": ["command", "version", "rpc_role", "rpc_status"],
|
|
"config_flag": "trace_rpc",
|
|
"note": "Wildcard — matches rpc.command.server_info, rpc.command.ledger, etc."
|
|
},
|
|
{
|
|
"name": "rpc.http_request",
|
|
"category": "rpc",
|
|
"parent": null,
|
|
"required_attributes": ["request_payload_size"],
|
|
"config_flag": "trace_rpc",
|
|
"optional": true,
|
|
"note": "HTTP/JSON-RPC root span. The harness load generator is WebSocket-only, so this does not fire."
|
|
},
|
|
{
|
|
"name": "tx.process",
|
|
"category": "transaction",
|
|
"parent": null,
|
|
"required_attributes": ["tx_hash", "local", "path"],
|
|
"config_flag": "trace_transactions"
|
|
},
|
|
{
|
|
"name": "tx.receive",
|
|
"category": "transaction",
|
|
"parent": null,
|
|
"required_attributes": ["tx_hash", "peer_id", "suppressed", "tx_status"],
|
|
"config_flag": "trace_transactions",
|
|
"note": "Cross-node span: parent context propagated from the sender's tx.process via protobuf. Also carries tx_type and peer_version."
|
|
},
|
|
{
|
|
"name": "tx.apply",
|
|
"category": "transaction",
|
|
"parent": "ledger.build",
|
|
"required_attributes": ["tx_count", "tx_failed"],
|
|
"config_flag": "trace_transactions",
|
|
"note": "Apply-step span inside BuildLedger. Carries tx_count/tx_failed (ledger_seq lives on the parent ledger.build span)."
|
|
},
|
|
{
|
|
"name": "tx.preflight",
|
|
"category": "transaction",
|
|
"parent": null,
|
|
"required_attributes": ["stage", "tx_type", "ter_result"],
|
|
"config_flag": "trace_transactions",
|
|
"note": "Apply-pipeline stage span (stage=preflight). Shares a deterministic trace_id (txID[0:16]) with tx.preclaim/tx.transactor."
|
|
},
|
|
{
|
|
"name": "tx.preclaim",
|
|
"category": "transaction",
|
|
"parent": null,
|
|
"required_attributes": ["stage", "tx_type", "ter_result"],
|
|
"config_flag": "trace_transactions",
|
|
"note": "Apply-pipeline stage span (stage=preclaim)."
|
|
},
|
|
{
|
|
"name": "tx.transactor",
|
|
"category": "transaction",
|
|
"parent": null,
|
|
"required_attributes": ["stage", "tx_type"],
|
|
"config_flag": "trace_transactions",
|
|
"note": "Apply-pipeline stage span (stage=apply). Also carries applied."
|
|
},
|
|
{
|
|
"name": "txq.enqueue",
|
|
"category": "transaction",
|
|
"parent": "tx.process",
|
|
"required_attributes": [
|
|
"tx_hash",
|
|
"tx_type",
|
|
"txq_status",
|
|
"fee_level_paid",
|
|
"required_fee_level"
|
|
],
|
|
"config_flag": "trace_transactions",
|
|
"optional": true,
|
|
"note": "Only fires when a tx is queued (fee below open-ledger level). Requires fee escalation — driven by the txq-burst workload phase."
|
|
},
|
|
{
|
|
"name": "txq.apply_direct",
|
|
"category": "transaction",
|
|
"parent": "txq.enqueue",
|
|
"required_attributes": [],
|
|
"config_flag": "trace_transactions",
|
|
"optional": true,
|
|
"note": "Child of txq.enqueue when the tx applies directly without queueing."
|
|
},
|
|
{
|
|
"name": "txq.batch_clear",
|
|
"category": "transaction",
|
|
"parent": "txq.enqueue",
|
|
"required_attributes": ["num_cleared"],
|
|
"config_flag": "trace_transactions",
|
|
"optional": true
|
|
},
|
|
{
|
|
"name": "txq.accept",
|
|
"category": "transaction",
|
|
"parent": null,
|
|
"required_attributes": ["queue_size", "ledger_changed"],
|
|
"config_flag": "trace_transactions",
|
|
"optional": true,
|
|
"note": "Ledger-close accept loop. Fires on the consensus thread; only meaningful when the queue is non-empty."
|
|
},
|
|
{
|
|
"name": "txq.accept.tx",
|
|
"category": "transaction",
|
|
"parent": "txq.accept",
|
|
"required_attributes": [
|
|
"tx_hash",
|
|
"ter_code",
|
|
"retries_remaining",
|
|
"txq_status"
|
|
],
|
|
"config_flag": "trace_transactions",
|
|
"optional": true
|
|
},
|
|
{
|
|
"name": "txq.cleanup",
|
|
"category": "transaction",
|
|
"parent": null,
|
|
"required_attributes": ["ledger_seq", "expired_count"],
|
|
"config_flag": "trace_transactions",
|
|
"optional": true
|
|
},
|
|
{
|
|
"name": "consensus.round",
|
|
"category": "consensus",
|
|
"parent": null,
|
|
"required_attributes": [
|
|
"consensus_ledger_id",
|
|
"ledger_seq",
|
|
"consensus_mode",
|
|
"consensus_round_id",
|
|
"consensus_phase"
|
|
],
|
|
"config_flag": "trace_consensus",
|
|
"note": "Root consensus span created per round. Also carries trace_strategy, previous_ledger_seq, previous_proposers, previous_round_time_ms."
|
|
},
|
|
{
|
|
"name": "consensus.phase.open",
|
|
"category": "consensus",
|
|
"parent": "consensus.round",
|
|
"required_attributes": [],
|
|
"config_flag": "trace_consensus"
|
|
},
|
|
{
|
|
"name": "consensus.proposal.send",
|
|
"category": "consensus",
|
|
"parent": "consensus.round",
|
|
"required_attributes": ["consensus_round"],
|
|
"config_flag": "trace_consensus",
|
|
"note": "Also carries is_bow_out."
|
|
},
|
|
{
|
|
"name": "consensus.ledger_close",
|
|
"category": "consensus",
|
|
"parent": "consensus.round",
|
|
"required_attributes": ["ledger_seq", "consensus_mode"],
|
|
"config_flag": "trace_consensus",
|
|
"note": "Also carries tx_count_open, close_time_resolution_ms."
|
|
},
|
|
{
|
|
"name": "consensus.establish",
|
|
"category": "consensus",
|
|
"parent": "consensus.round",
|
|
"required_attributes": [
|
|
"converge_percent",
|
|
"establish_count",
|
|
"proposers",
|
|
"disputes_count"
|
|
],
|
|
"config_flag": "trace_consensus"
|
|
},
|
|
{
|
|
"name": "consensus.update_positions",
|
|
"category": "consensus",
|
|
"parent": "consensus.round",
|
|
"required_attributes": [
|
|
"converge_percent",
|
|
"proposers",
|
|
"disputes_count"
|
|
],
|
|
"config_flag": "trace_consensus"
|
|
},
|
|
{
|
|
"name": "consensus.check",
|
|
"category": "consensus",
|
|
"parent": "consensus.round",
|
|
"required_attributes": [
|
|
"agree_count",
|
|
"disagree_count",
|
|
"threshold_percent",
|
|
"consensus_result"
|
|
],
|
|
"config_flag": "trace_consensus"
|
|
},
|
|
{
|
|
"name": "consensus.accept",
|
|
"category": "consensus",
|
|
"parent": "consensus.round",
|
|
"required_attributes": ["proposers", "round_time_ms", "quorum"],
|
|
"config_flag": "trace_consensus"
|
|
},
|
|
{
|
|
"name": "consensus.accept.apply",
|
|
"category": "consensus",
|
|
"parent": "consensus.accept",
|
|
"required_attributes": [
|
|
"ledger_seq",
|
|
"close_time",
|
|
"parent_close_time",
|
|
"close_time_self",
|
|
"close_time_vote_bins",
|
|
"resolution_direction"
|
|
],
|
|
"config_flag": "trace_consensus",
|
|
"note": "Also carries close_time_correct, close_resolution_ms, consensus_state, proposing, round_time_ms, tx_count."
|
|
},
|
|
{
|
|
"name": "consensus.validation.send",
|
|
"category": "consensus",
|
|
"parent": null,
|
|
"required_attributes": [
|
|
"ledger_seq",
|
|
"proposing",
|
|
"ledger_hash",
|
|
"full_validation"
|
|
],
|
|
"config_flag": "trace_consensus",
|
|
"note": "follows-from consensus.accept. ledger_hash is BARE here (consensus-owned attr), unlike peer.validation.receive which uses the shared dotted xrpl.ledger.hash. Also carries validation_sign_time."
|
|
},
|
|
{
|
|
"name": "consensus.proposal.receive",
|
|
"category": "consensus",
|
|
"parent": null,
|
|
"required_attributes": [],
|
|
"config_flag": "trace_consensus",
|
|
"note": "Context-propagated from the sending peer. No required local attributes."
|
|
},
|
|
{
|
|
"name": "consensus.validation.receive",
|
|
"category": "consensus",
|
|
"parent": null,
|
|
"required_attributes": [],
|
|
"config_flag": "trace_consensus",
|
|
"note": "Context-propagated from the sending peer. No required local attributes."
|
|
},
|
|
{
|
|
"name": "consensus.mode_change",
|
|
"category": "consensus",
|
|
"parent": null,
|
|
"required_attributes": ["mode_old", "mode_new"],
|
|
"config_flag": "trace_consensus",
|
|
"optional": true,
|
|
"note": "Only fires on an operating-mode transition; a steady cluster rarely changes mode after warmup."
|
|
},
|
|
{
|
|
"name": "ledger.build",
|
|
"category": "ledger",
|
|
"parent": null,
|
|
"required_attributes": [
|
|
"ledger_seq",
|
|
"close_time",
|
|
"close_time_correct",
|
|
"close_resolution_ms"
|
|
],
|
|
"config_flag": "trace_ledger",
|
|
"note": "tx_count/tx_failed live on the child tx.apply span, not here."
|
|
},
|
|
{
|
|
"name": "ledger.validate",
|
|
"category": "ledger",
|
|
"parent": null,
|
|
"required_attributes": ["ledger_seq", "validations"],
|
|
"config_flag": "trace_ledger"
|
|
},
|
|
{
|
|
"name": "ledger.store",
|
|
"category": "ledger",
|
|
"parent": null,
|
|
"required_attributes": ["ledger_seq"],
|
|
"config_flag": "trace_ledger"
|
|
},
|
|
{
|
|
"name": "ledger.acquire",
|
|
"category": "ledger",
|
|
"parent": null,
|
|
"required_attributes": [
|
|
"ledger_seq",
|
|
"acquire_reason",
|
|
"timeouts",
|
|
"peer_count",
|
|
"outcome"
|
|
],
|
|
"config_flag": "trace_ledger",
|
|
"optional": true,
|
|
"note": "Only fires when a node must fetch a missing ledger (InboundLedger). A healthy local cluster rarely back-fills history."
|
|
},
|
|
{
|
|
"name": "peer.proposal.receive",
|
|
"category": "peer",
|
|
"parent": null,
|
|
"required_attributes": ["peer_id", "proposal_trusted"],
|
|
"config_flag": "trace_peer"
|
|
},
|
|
{
|
|
"name": "peer.validation.receive",
|
|
"category": "peer",
|
|
"parent": null,
|
|
"required_attributes": [
|
|
"peer_id",
|
|
"validation_trusted",
|
|
"xrpl.ledger.hash",
|
|
"validation_full"
|
|
],
|
|
"config_flag": "trace_peer",
|
|
"note": "Uses the shared dotted xrpl.ledger.hash constant (intentionally dotted, unlike consensus.validation.send)."
|
|
},
|
|
{
|
|
"name": "pathfind.request",
|
|
"category": "pathfind",
|
|
"parent": null,
|
|
"required_attributes": [
|
|
"pathfind_source_account",
|
|
"pathfind_dest_account"
|
|
],
|
|
"config_flag": "trace_rpc",
|
|
"note": "Fires on ripple_path_find / path_find RPC. Driven by the ripple_path_find load in rpc_load_generator.py."
|
|
},
|
|
{
|
|
"name": "pathfind.compute",
|
|
"category": "pathfind",
|
|
"parent": "pathfind.request",
|
|
"required_attributes": ["pathfind_fast"],
|
|
"config_flag": "trace_rpc"
|
|
},
|
|
{
|
|
"name": "pathfind.discover",
|
|
"category": "pathfind",
|
|
"parent": "pathfind.compute",
|
|
"required_attributes": ["pathfind_search_level", "pathfind_num_paths"],
|
|
"config_flag": "trace_rpc"
|
|
},
|
|
{
|
|
"name": "pathfind.update_all",
|
|
"category": "pathfind",
|
|
"parent": null,
|
|
"required_attributes": ["pathfind_ledger_index", "pathfind_num_requests"],
|
|
"config_flag": "trace_rpc",
|
|
"optional": true,
|
|
"note": "Async recomputation at ledger close; only fires when there are active path_find subscriptions (the one-shot ripple_path_find load does not register one)."
|
|
},
|
|
{
|
|
"name": "grpc.*",
|
|
"category": "grpc",
|
|
"parent": null,
|
|
"required_attributes": ["method", "grpc_role", "grpc_status"],
|
|
"config_flag": "trace_rpc",
|
|
"optional": true,
|
|
"note": "Wildcard — grpc.<MethodName>. The harness has no gRPC client, so these do not fire. Tracked for completeness."
|
|
}
|
|
],
|
|
"parent_child_relationships": [
|
|
{
|
|
"parent": "rpc.ws_message",
|
|
"child": "rpc.process",
|
|
"description": "WebSocket message contains processing span",
|
|
"skip": true,
|
|
"skip_reason": "rpc.ws_message and rpc.process run on different threads (the WS handler posts a coroutine to JobQueue for processing). Span context is not propagated across the thread boundary. Requires a C++ fix to capture and forward the span context through the coroutine lambda."
|
|
},
|
|
{
|
|
"parent": "rpc.process",
|
|
"child": "rpc.command.*",
|
|
"description": "Processing span contains per-command span"
|
|
},
|
|
{
|
|
"parent": "ledger.build",
|
|
"child": "tx.apply",
|
|
"description": "Ledger build contains transaction application"
|
|
},
|
|
{
|
|
"parent": "consensus.round",
|
|
"child": "consensus.accept",
|
|
"description": "Consensus round contains the accept sub-span"
|
|
},
|
|
{
|
|
"parent": "consensus.accept",
|
|
"child": "consensus.accept.apply",
|
|
"description": "Accept contains the ledger-apply sub-span"
|
|
},
|
|
{
|
|
"parent": "pathfind.request",
|
|
"child": "pathfind.compute",
|
|
"description": "Pathfind request contains the compute sub-span"
|
|
}
|
|
],
|
|
"total_span_types": 40,
|
|
"total_unique_attributes": 58
|
|
}
|