diff --git a/OpenTelemetryPlan/Phase2_taskList.md b/OpenTelemetryPlan/Phase2_taskList.md index f473617cf8..1f572ecfa9 100644 --- a/OpenTelemetryPlan/Phase2_taskList.md +++ b/OpenTelemetryPlan/Phase2_taskList.md @@ -154,21 +154,53 @@ Node health (`amendment_blocked`, `server_state`) is not part of the telemetry s --- +## Task 2.10: RPC and PathFind Span Attribute Gap Fill + +**Status**: COMPLETE + +**Objective**: Wire up workflow-identifying attributes that enable filtering and grouping traces by request characteristics without drilling into child spans. + +**Attributes added**: + +| Span | Attribute | Type | Source | +| ------------------- | ---------------------------- | ------ | --------------------------------- | +| `rpc.http_request` | `request_payload_size` | int64 | `request.body().size()` | +| `rpc.process` | `is_batch` | bool | `method == "batch"` check | +| `rpc.process` | `batch_size` | int64 | `params.size()` (only when batch) | +| `rpc.ws_message` | `command` | string | `jv[command]` or `jv[method]` | +| `rpc.command.*` | `load_type` | string | `context.loadType.label()` | +| `pathfind.compute` | `pathfind_dest_amount` | string | `saDstAmount_.getFullText()` | +| `pathfind.compute` | `pathfind_dest_currency` | string | `to_string(saDstAmount_.asset())` | +| `pathfind.discover` | `pathfind_num_source_assets` | int64 | `sourceAssets.size()` | + +**New attr keys**: `RpcSpanNames.h` (`isBatch`, `batchSize`, `loadType`), `PathFindSpanNames.h` (`destAmount`, `destCurrency`, `numSourceAssets`). + +**Modified files**: + +- `src/xrpld/rpc/detail/RpcSpanNames.h` +- `src/xrpld/rpc/detail/PathFindSpanNames.h` +- `src/xrpld/rpc/detail/ServerHandler.cpp` +- `src/xrpld/rpc/detail/RPCHandler.cpp` +- `src/xrpld/rpc/detail/PathRequest.cpp` + +--- + ## Summary -| Task | Description | Status | Notes | -| ---- | ------------------------------------------- | ------------------- | ------------------------------------------------ | -| 2.1 | W3C Trace Context header extraction | Deferred → Phase 3 | No consumer in Phase 2; needs cross-node tracing | -| 2.2 | Per-category span creation | Complete (Phase 1c) | Superseded by TraceCategory enum + SpanGuard | -| 2.3 | Add shouldTraceLedger() interface method | Complete (Phase 1c) | Delivered in Phase 1c base branch | -| 2.4 | Unit tests for core telemetry | Complete | TelemetryConfig + SpanGuardFactory tests | -| 2.5 | Enhanced RPC span attributes (HTTP-level) | Deferred | Low value; span duration covers timing natively | -| 2.6 | Build verification and performance baseline | Complete | Verified in CI on Phase 1c | -| 2.7 | Grafana Tempo search filters | Complete | rpc-command, rpc-status, rpc-role filters | -| 2.8 | RPC span attribute enrichment (node health) | Dropped | Available via `server_info`/`server_state` RPC | -| 2.9 | PathFind RPC instrumentation | Complete | request, compute, update_all, discover | +| Task | Description | Status | Notes | +| ---- | ------------------------------------------- | ------------------- | --------------------------------------------------------- | +| 2.1 | W3C Trace Context header extraction | Deferred → Phase 3 | No consumer in Phase 2; needs cross-node tracing | +| 2.2 | Per-category span creation | Complete (Phase 1c) | Superseded by TraceCategory enum + SpanGuard | +| 2.3 | Add shouldTraceLedger() interface method | Complete (Phase 1c) | Delivered in Phase 1c base branch | +| 2.4 | Unit tests for core telemetry | Complete | TelemetryConfig + SpanGuardFactory tests | +| 2.5 | Enhanced RPC span attributes (HTTP-level) | Deferred | Low value; span duration covers timing natively | +| 2.6 | Build verification and performance baseline | Complete | Verified in CI on Phase 1c | +| 2.7 | Grafana Tempo search filters | Complete | rpc-command, rpc-status, rpc-role filters | +| 2.8 | RPC span attribute enrichment (node health) | Dropped | Available via `server_info`/`server_state` RPC | +| 2.9 | PathFind RPC instrumentation | Complete | request, compute, update_all, discover | +| 2.10 | RPC/PathFind span attribute gap fill | Complete | Batch detection, payload size, load cost, pathfind params | -**Delivered in this branch**: Tasks 2.4, 2.7, 2.9. +**Delivered in this branch**: Tasks 2.4, 2.7, 2.9, 2.10. **Deferred with rationale**: Tasks 2.1 (→Phase 3), 2.5 (low priority). **Dropped**: Task 2.8 (node health not duplicated on traces). **Superseded**: Task 2.2 (Phase 1c SpanGuard factory covers this). diff --git a/OpenTelemetryPlan/Phase3_taskList.md b/OpenTelemetryPlan/Phase3_taskList.md index f352a1f862..1d7d181f0e 100644 --- a/OpenTelemetryPlan/Phase3_taskList.md +++ b/OpenTelemetryPlan/Phase3_taskList.md @@ -466,6 +466,38 @@ This gives the best of both worlds: guaranteed cross-node correlation via determ --- +## Task 3.11: TX and TxQ Span Attribute Gap Fill + +**Status**: COMPLETE + +**Objective**: Add workflow-identifying attributes to transaction spans so operators can filter by transaction type and see outcomes without off-chain correlation. + +**Attributes added**: + +| Span | Attribute | Type | Source | +| --------------- | ---------------- | ------ | ------------------------------------------------------------------- | +| `tx.process` | `tx_type` | string | `TxFormats::getInstance().findByType(stx->getTxnType())->getName()` | +| `tx.process` | `fee` | int64 | `stx->getFieldAmount(sfFee).xrp().drops()` | +| `tx.process` | `sequence` | int64 | `stx->getSeqProxy().value()` | +| `tx.process` | `ter_result` | string | `transToken(e.result)` (set after batch application) | +| `tx.process` | `applied` | bool | `e.applied` (set after batch application) | +| `tx.receive` | `tx_type` | string | `TxFormats::getInstance().findByType(stx->getTxnType())->getName()` | +| `txq.enqueue` | `tx_type` | string | same pattern as above | +| `txq.accept.tx` | `txq_status` | string | `applied` / `failed` / `retried` | +| `txq.accept` | `ledger_changed` | bool | set at end of accept loop | + +**New attr keys**: `TxSpanNames.h` (`txType`, `fee`, `sequence`, `terResult`, `applied`), `TxQSpanNames.h` (`txType`). + +**Modified files**: + +- `src/xrpld/app/misc/TxSpanNames.h` +- `src/xrpld/app/misc/detail/TxQSpanNames.h` +- `src/xrpld/app/misc/NetworkOPs.cpp` +- `src/xrpld/overlay/detail/PeerImp.cpp` +- `src/xrpld/app/misc/detail/TxQ.cpp` + +--- + ## Summary | Task | Description | New Files | Modified Files | Depends On | @@ -480,6 +512,7 @@ This gives the best of both worlds: guaranteed cross-node correlation via determ | 3.8 | TX span peer version attribute | 0 | 1 | 3.3 | | 3.9 | Deterministic transaction trace ID | 0-1 | 3 | 3.2, 3.3 | | 3.10 | TxQ instrumentation (6 spans) | 1 | 1 | 3.4 | +| 3.11 | TX/TxQ span attribute gap fill | 0 | 5 | 3.3, 3.10 | **Parallel work**: Tasks 3.1 and 3.4 can start in parallel. Task 3.2 depends on 3.1. Tasks 3.3 and 3.5 depend on 3.2. Task 3.6 depends on 3.3 and 3.5. Task 3.8 depends on 3.3 (span must exist). Task 3.9 depends on 3.2 and 3.3. Task 3.10 depends on 3.4 (tx.process span must exist). diff --git a/OpenTelemetryPlan/Phase4_taskList.md b/OpenTelemetryPlan/Phase4_taskList.md index 4c93f7f1ee..6d0f169546 100644 --- a/OpenTelemetryPlan/Phase4_taskList.md +++ b/OpenTelemetryPlan/Phase4_taskList.md @@ -255,14 +255,38 @@ Phase 7's `ValidationTracker` builds metric-level aggregation (1h/24h agreement **Exit Criteria**: -- [ ] `consensus.validation.send` spans carry `xrpl.validation.ledger_hash` and `xrpl.validation.full` +- [x] `consensus.validation.send` spans carry `ledger_hash` and `full_validation` - [ ] `peer.validation.receive` spans carry `xrpl.peer.validation.ledger_hash` and `xrpl.peer.validation.full` - [ ] `consensus.accept` spans carry `validation_quorum` and `proposers_validated` -- [ ] Ledger hash attributes match between send and receive for the same ledger +- [x] Ledger hash attributes match between send and receive for the same ledger - [ ] No impact on consensus performance --- +## Task 4.9: Consensus Span Attribute Gap Fill + +**Status**: COMPLETE + +**Objective**: Add workflow-critical attributes to consensus spans that enable operators to understand consensus outcomes, identify bow-out proposals, and correlate validations to specific ledgers. + +**Attributes added**: + +| Span | Attribute | Type | Source | +| --------------------------- | ----------------- | ------ | ------------------------------------- | +| `consensus.proposal.send` | `is_bow_out` | bool | `proposal.isBowOut()` | +| `consensus.accept` | `consensus_state` | string | `result.state` (yes/moved_on/expired) | +| `consensus.accept` | `disputes_count` | int64 | `result.disputes.size()` | +| `consensus.validation.send` | `ledger_hash` | string | `ledger.ledger->header().hash` | + +**New attr keys**: `ConsensusSpanNames.h` (`isBowOut`, `ledgerHash`). + +**Modified files**: + +- `src/xrpld/consensus/ConsensusSpanNames.h` +- `src/xrpld/app/consensus/RCLConsensus.cpp` + +--- + ## Summary | Task | Description | Status | New Files | Modified Files | Depends On | @@ -275,6 +299,7 @@ Phase 7's `ValidationTracker` builds metric-level aggregation (1h/24h agreement | 4.6 | Transaction-consensus correlation | ✅ Done | 0 | 1 | 4.2, Phase 3 | | 4.7 | Build verification and testing | ✅ Done | 0 | 0 | 4.1-4.6 | | 4.8 | Validation span enrichment (ext. dashboard) | ❌ Not done | 0 | 2 | 4.4 | +| 4.9 | Consensus span attribute gap fill | ✅ Done | 0 | 2 | 4.1-4.5 | **Parallel work**: Tasks 4.2, 4.3, and 4.4 can run in parallel after 4.1 is complete. Task 4.5 depends on all three. Task 4.6 depends on 4.2 and Phase 3. Task 4.8 depends on 4.4 (validation spans must exist). @@ -282,11 +307,11 @@ Phase 7's `ValidationTracker` builds metric-level aggregation (1h/24h agreement | Span Name | Method | Key Attributes | | --------------------------- | ---------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `consensus.proposal.send` | `Adaptor::propose` | `xrpl.consensus.round` | +| `consensus.proposal.send` | `Adaptor::propose` | `xrpl.consensus.round`, `is_bow_out` | | `consensus.ledger_close` | `Adaptor::onClose` | `xrpl.ledger.seq`, `xrpl.consensus.mode` | -| `consensus.accept` | `Adaptor::onAccept` | `proposers`, `round_time_ms` | +| `consensus.accept` | `Adaptor::onAccept` | `proposers`, `round_time_ms`, `quorum`, `disputes_count`, `consensus_state` | | `consensus.accept.apply` | `Adaptor::doAccept` | `close_time`, `close_time_correct`, `close_resolution_ms`, `consensus_state`, `proposing`, `round_time_ms`, `xrpl.ledger.seq`, `parent_close_time`, `close_time_self`, `close_time_vote_bins`, `resolution_direction` | -| `consensus.validation.send` | `Adaptor::onAccept` (via validate) | `proposing` | +| `consensus.validation.send` | `Adaptor::onAccept` (via validate) | `proposing`, `ledger_hash`, `ledger_seq`, `full_validation`, `validation_sign_time` | #### Close Time Attributes (consensus.accept.apply) diff --git a/docs/telemetry-runbook.md b/docs/telemetry-runbook.md index 8779cf5a74..5b7c021f7f 100644 --- a/docs/telemetry-runbook.md +++ b/docs/telemetry-runbook.md @@ -65,32 +65,32 @@ All spans instrumented in xrpld, grouped by subsystem: ### RPC Spans (Phase 2) -| Span Name | Source File | Attributes | Description | -| -------------------- | ----------------- | ------------------------------------------------------------------------------ | ----------------------------------------------------- | -| `rpc.http_request` | ServerHandler.cpp | — | Top-level HTTP RPC request | -| `rpc.ws_upgrade` | ServerHandler.cpp | — | WebSocket upgrade handshake | -| `rpc.ws_message` | ServerHandler.cpp | — | WebSocket RPC message | -| `rpc.process` | ServerHandler.cpp | — | RPC processing (child of rpc.http_request/ws_message) | -| `rpc.command.` | RPCHandler.cpp | `command`, `version`, `rpc_role`, `rpc_status`, `duration_ms`, `error_message` | Per-command span (e.g., `rpc.command.server_info`) | +| Span Name | Source File | Attributes | Description | +| -------------------- | ----------------- | ----------------------------------------------------------- | ----------------------------------------------------- | +| `rpc.http_request` | ServerHandler.cpp | `request_payload_size` | Top-level HTTP RPC request | +| `rpc.ws_upgrade` | ServerHandler.cpp | — | WebSocket upgrade handshake | +| `rpc.ws_message` | ServerHandler.cpp | `command` | WebSocket RPC message | +| `rpc.process` | ServerHandler.cpp | `is_batch`, `batch_size` | RPC processing (child of rpc.http_request/ws_message) | +| `rpc.command.` | RPCHandler.cpp | `command`, `version`, `rpc_role`, `rpc_status`, `load_type` | Per-command span (e.g., `rpc.command.server_info`) | ### Transaction Spans (Phase 3) -| Span Name | Source File | Attributes | Description | -| ------------ | --------------- | ------------------------------------------------------------------------- | ------------------------------------- | -| `tx.process` | NetworkOPs.cpp | `xrpl.tx.hash`, `local`, `path` | Transaction submission and processing | -| `tx.receive` | PeerImp.cpp | `xrpl.peer.id`, `xrpl.tx.hash`, `peer_version`, `suppressed`, `tx_status` | Transaction received from peer relay | -| `tx.apply` | BuildLedger.cpp | `xrpl.ledger.seq`, `tx_count`, `tx_failed` | Transaction set applied per ledger | +| Span Name | Source File | Attributes | Description | +| ------------ | --------------- | -------------------------------------------------------------------------------------- | ------------------------------------- | +| `tx.process` | NetworkOPs.cpp | `xrpl.tx.hash`, `local`, `path`, `tx_type`, `fee`, `sequence`, `ter_result`, `applied` | Transaction submission and processing | +| `tx.receive` | PeerImp.cpp | `xrpl.peer.id`, `xrpl.tx.hash`, `tx_type`, `peer_version`, `suppressed`, `tx_status` | Transaction received from peer relay | +| `tx.apply` | BuildLedger.cpp | `xrpl.ledger.seq`, `tx_count`, `tx_failed` | Transaction set applied per ledger | ### Transaction Queue Spans (Phase 3) -| Span Name | Source File | Attributes | Description | -| ------------------ | ----------- | ----------------------------------------------- | -------------------------------------------------- | -| `txq.enqueue` | TxQ.cpp | `xrpl.tx.hash` | Transaction enqueue decision (child of tx.process) | -| `txq.apply_direct` | TxQ.cpp | -- | Direct apply attempt (bypassing queue) | -| `txq.batch_clear` | TxQ.cpp | -- | Batch clear of queued transactions for an account | -| `txq.accept` | TxQ.cpp | `queue_size` | Ledger-close accept loop over queued transactions | -| `txq.accept_tx` | TxQ.cpp | `xrpl.tx.hash`, `retries_remaining`, `ter_code` | Per-transaction apply during accept | -| `txq.cleanup` | TxQ.cpp | `xrpl.ledger.seq` | Post-close cleanup of expired queue entries | +| Span Name | Source File | Attributes | Description | +| ------------------ | ----------- | ------------------------------------------------------------- | -------------------------------------------------- | +| `txq.enqueue` | TxQ.cpp | `xrpl.tx.hash`, `tx_type` | Transaction enqueue decision (child of tx.process) | +| `txq.apply_direct` | TxQ.cpp | -- | Direct apply attempt (bypassing queue) | +| `txq.batch_clear` | TxQ.cpp | -- | Batch clear of queued transactions for an account | +| `txq.accept` | TxQ.cpp | `queue_size`, `ledger_changed` | Ledger-close accept loop over queued transactions | +| `txq.accept_tx` | TxQ.cpp | `xrpl.tx.hash`, `retries_remaining`, `ter_code`, `txq_status` | Per-transaction apply during accept | +| `txq.cleanup` | TxQ.cpp | `xrpl.ledger.seq` | Post-close cleanup of expired queue entries | ### Consensus Spans (Phase 4) @@ -98,14 +98,14 @@ All spans instrumented in xrpld, grouped by subsystem: | ------------------------------ | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------- | | `consensus.round` | RCLConsensus.cpp | `xrpl.consensus.ledger_id`, `xrpl.ledger.seq`, `xrpl.consensus.mode`, `trace_strategy`, `xrpl.consensus.round_id` | Root span for a consensus round (deterministic or random trace ID) | | `consensus.phase.open` | Consensus.h | -- | Open phase duration (child of round) | -| `consensus.proposal.send` | RCLConsensus.cpp | `xrpl.consensus.round` | Consensus proposal broadcast | +| `consensus.proposal.send` | RCLConsensus.cpp | `xrpl.consensus.round`, `is_bow_out` | Consensus proposal broadcast | | `consensus.ledger_close` | RCLConsensus.cpp | `xrpl.ledger.seq`, `xrpl.consensus.mode` | Ledger close event | | `consensus.establish` | Consensus.h | `converge_percent`, `establish_count`, `proposers` | Establish phase duration (child of round) | | `consensus.update_positions` | Consensus.h | `converge_percent`, `proposers`, `disputes_count` | Position update and dispute resolution (see Events below) | | `consensus.check` | Consensus.h | `agree_count`, `disagree_count`, `converge_percent`, `have_close_time_consensus`, `threshold_percent`, `consensus_result` | Consensus threshold check | -| `consensus.accept` | RCLConsensus.cpp | `proposers`, `round_time_ms`, `quorum` | Ledger accepted by consensus | +| `consensus.accept` | RCLConsensus.cpp | `proposers`, `round_time_ms`, `quorum`, `disputes_count`, `consensus_state` | Ledger accepted by consensus | | `consensus.accept.apply` | RCLConsensus.cpp | `xrpl.ledger.seq`, `close_time`, `close_time_correct`, `close_resolution_ms`, `consensus_state`, `proposing`, `round_time_ms`, `parent_close_time`, `close_time_self`, `close_time_vote_bins`, `resolution_direction`, `tx_count` | Ledger application with close time details (see Events below) | -| `consensus.validation.send` | RCLConsensus.cpp | `xrpl.ledger.seq`, `proposing` | Validation sent after accept (follows-from link) | +| `consensus.validation.send` | RCLConsensus.cpp | `xrpl.ledger.seq`, `proposing`, `ledger_hash`, `full_validation`, `validation_sign_time` | Validation sent after accept (follows-from link) | | `consensus.mode_change` | RCLConsensus.cpp | `mode_old`, `mode_new` | Consensus mode transition | | `consensus.proposal.receive` | PeerImp.cpp | `trusted`, `xrpl.consensus.round` | Proposal received from peer (extracts parent context from TraceContext when present; falls back to standalone span for older peers) | | `consensus.validation.receive` | PeerImp.cpp | `trusted`, `xrpl.ledger.seq` | Validation received from peer (extracts parent context from TraceContext when present; falls back to standalone span for older peers) | @@ -139,7 +139,7 @@ All spans instrumented in xrpld, grouped by subsystem: {name="consensus.update_positions"} >> {event:name="dispute.resolve"} ``` -### Ledger Spans (Phase 5) +### Ledger Spans (Phase 6) | Span Name | Source File | Attributes | Description | | ----------------- | -------------------- | ------------------------------------------ | ----------------------------- | @@ -147,13 +147,143 @@ All spans instrumented in xrpld, grouped by subsystem: | `ledger.validate` | LedgerMaster.cpp:915 | `xrpl.ledger.seq`, `validations` | Ledger promoted to validated | | `ledger.store` | LedgerMaster.cpp:409 | `xrpl.ledger.seq` | Ledger stored in history | -### Peer Spans (Phase 5) +### Peer Spans (Phase 6) | Span Name | Source File | Attributes | Description | | ------------------------- | ---------------- | ------------------------------------ | ----------------------------- | | `peer.proposal.receive` | PeerImp.cpp:1667 | `xrpl.peer.id`, `proposal_trusted` | Proposal received from peer | | `peer.validation.receive` | PeerImp.cpp:2264 | `xrpl.peer.id`, `validation_trusted` | Validation received from peer | +--- + +## Insights and Sample Queries + +This section shows what questions you can answer using the span attributes, with example Tempo TraceQL queries. + +### Transaction Workflow Analysis + +``` +# Find all AMM transactions (AMMDeposit, AMMWithdraw, AMMCreate, etc.) +{name="tx.process"} | tx_type =~ "AMM.*" + +# Find Payment transactions that failed +{name="tx.process"} | tx_type = "Payment" && ter_result != "tesSUCCESS" + +# Compare latency of different transaction types +{name="tx.process"} | tx_type = "OfferCreate" +{name="tx.process"} | tx_type = "Payment" + +# Find high-fee transactions (fee > 1 XRP = 1000000 drops) +{name="tx.process"} | fee > 1000000 + +# Find transactions that were not applied +{name="tx.process"} | applied = false + +# Trace a specific transaction by type across the network +{name=~"tx\\..*"} | tx_type = "NFTokenMint" +``` + +### Transaction Queue Health + +``` +# Find transactions rejected from the queue +{name="txq.accept_tx"} | txq_status = "failed" + +# Which transaction types get queued most often? +{name="txq.enqueue"} | tx_type = "Payment" +{name="txq.enqueue"} | tx_type = "OfferCreate" + +# Find ledger closes that applied queued transactions +{name="txq.accept"} | ledger_changed = true + +# Find transactions that exhausted retries +{name="txq.accept_tx"} | txq_status = "retried" && retries_remaining = 0 +``` + +### RPC Debugging + +``` +# Find batch RPC requests +{name="rpc.process"} | is_batch = true + +# Find large RPC payloads (>100KB) +{name="rpc.http_request"} | request_payload_size > 100000 + +# Find resource-heavy RPC commands (by load_type) +{name=~"rpc.command.*"} | load_type = "exception_rpc" + +# Find a specific WebSocket command +{name="rpc.ws_message"} | command = "subscribe" + +# Find slow pathfinding with many source assets +{name="pathfind.discover"} | pathfind_num_source_assets > 10 +``` + +### PathFinding Performance + +``` +# Find pathfinding for specific currencies +{name="pathfind.compute"} | pathfind_dest_currency = "USD" + +# Find expensive pathfinding (many source assets to explore) +{name="pathfind.discover"} | pathfind_num_source_assets > 20 + +# Find large pathfinding requests +{name="pathfind.compute"} | duration > 1s +``` + +### Consensus Health + +``` +# Find rounds where consensus timed out (expired) +{name="consensus.accept"} | consensus_state = "expired" + +# Find rounds where we moved on without full agreement +{name="consensus.accept"} | consensus_state = "moved_on" + +# Find rounds with many disputes +{name="consensus.accept"} | disputes_count > 5 + +# Find bow-out proposals (node resigned from round) +{name="consensus.proposal.send"} | is_bow_out = true + +# Correlate validation with its ledger +{name="consensus.validation.send"} | ledger_hash = "" + +# Find rounds where validators disagreed on close time +{name="consensus.accept.apply"} | close_time_correct = false +``` + +### Cross-Subsystem Correlation + +``` +# Follow a transaction from receive through queue to ledger +{name=~"tx\\..*|txq\\..*"} | tx_type = "Payment" && duration > 500ms + +# Find all NFT-related activity +{name=~"tx\\..*|txq\\..*"} | tx_type =~ "NFToken.*" + +# Find consensus rounds with slow transactions +{name="consensus.accept"} | round_time_ms > 5000 +``` + +### Where to Look (Quick Reference) + +| Question | Span | Key Attributes | +| ----------------------------------- | --------------------------- | ------------------------------ | +| "Which tx type is slowest?" | `tx.process` | `tx_type` + duration | +| "Why was my tx rejected?" | `tx.process` | `ter_result`, `applied` | +| "Is the TxQ backing up?" | `txq.accept` | `queue_size`, `ledger_changed` | +| "Why was my tx dropped from queue?" | `txq.accept_tx` | `txq_status`, `ter_code` | +| "Are batch requests a problem?" | `rpc.process` | `is_batch`, `batch_size` | +| "Which RPC is expensive?" | `rpc.command.*` | `load_type`, duration | +| "Did consensus stall?" | `consensus.check` | `consensus_stalled` | +| "Was consensus outcome normal?" | `consensus.accept` | `consensus_state` | +| "Did a validator bow out?" | `consensus.proposal.send` | `is_bow_out` | +| "Which ledger was validated?" | `consensus.validation.send` | `ledger_hash` | + +--- + ## Cross-Node Trace Propagation xrpld propagates trace context across nodes via protobuf `TraceContext` fields diff --git a/src/tests/libxrpl/telemetry/TraceContextPropagator.cpp b/src/tests/libxrpl/telemetry/TraceContextPropagator.cpp index d566be3353..efcb172845 100644 --- a/src/tests/libxrpl/telemetry/TraceContextPropagator.cpp +++ b/src/tests/libxrpl/telemetry/TraceContextPropagator.cpp @@ -2,6 +2,7 @@ #ifdef XRPL_ENABLE_TELEMETRY +#include #include #include @@ -44,8 +45,9 @@ TEST(TraceContextPropagator, round_trip) 0x10}; std::uint8_t spanIdBuf[8] = {0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, 0x11, 0x22}; - trace::TraceId const traceId(opentelemetry::nostd::span(traceIdBuf, 16)); - trace::SpanId const spanId(opentelemetry::nostd::span(spanIdBuf, 8)); + trace::TraceId const traceId( + opentelemetry::nostd::span(traceIdBuf, 16)); + trace::SpanId const spanId(opentelemetry::nostd::span(spanIdBuf, 8)); trace::TraceFlags const flags(trace::TraceFlags::kIsSampled); trace::SpanContext const spanCtx(traceId, spanId, flags, true); @@ -60,7 +62,7 @@ TEST(TraceContextPropagator, round_trip) EXPECT_EQ(proto.trace_id().size(), 16u); EXPECT_TRUE(proto.has_span_id()); EXPECT_EQ(proto.span_id().size(), 8u); - EXPECT_EQ(proto.trace_flags(), static_cast(trace::TraceFlags::kIsSampled)); + EXPECT_EQ(proto.trace_flags(), static_cast(trace::TraceFlags::kIsSampled)); EXPECT_EQ(std::memcmp(proto.trace_id().data(), traceIdBuf, 16), 0); EXPECT_EQ(std::memcmp(proto.span_id().data(), spanIdBuf, 8), 0); @@ -133,8 +135,8 @@ TEST(TraceContextPropagator, flags_preservation) // Test with flags NOT sampled (flags = 0) trace::TraceFlags const flags(0); trace::SpanContext const spanCtx( - trace::TraceId(opentelemetry::nostd::span(traceIdBuf, 16)), - trace::SpanId(opentelemetry::nostd::span(spanIdBuf, 8)), + trace::TraceId(opentelemetry::nostd::span(traceIdBuf, 16)), + trace::SpanId(opentelemetry::nostd::span(spanIdBuf, 8)), flags, true); diff --git a/src/xrpld/app/consensus/RCLConsensus.cpp b/src/xrpld/app/consensus/RCLConsensus.cpp index ded60634e2..1919064c75 100644 --- a/src/xrpld/app/consensus/RCLConsensus.cpp +++ b/src/xrpld/app/consensus/RCLConsensus.cpp @@ -237,6 +237,7 @@ RCLConsensus::Adaptor::propose(RCLCxPeerPos::Proposal const& proposal) telemetry::consensus::span::op::proposalSend); span.setAttribute( telemetry::consensus::span::attr::round, static_cast(proposal.proposeSeq())); + span.setAttribute(telemetry::consensus::span::attr::isBowOut, proposal.isBowOut()); JLOG(j_.trace()) << (proposal.isBowOut() ? "We bow out: " : "We propose: ") << xrpl::to_string(proposal.prevLedger()) << " -> " @@ -511,6 +512,21 @@ RCLConsensus::Adaptor::makeAcceptSpan(Result const& result) span->setAttribute( cs::attr::roundTimeMs, static_cast(result.roundTime.read().count())); span->setAttribute(cs::attr::quorum, static_cast(app_.getValidators().quorum())); + span->setAttribute(cs::attr::disputesCount, static_cast(result.disputes.size())); + char const* stateStr = [&] { + switch (result.state) + { + case ConsensusState::Yes: + return "yes"; + case ConsensusState::MovedOn: + return "moved_on"; + case ConsensusState::Expired: + return "expired"; + default: + return "no"; + } + }(); + span->setAttribute(cs::attr::consensusState, stateStr); // Capture the accept span's context so createValidationSpan() — which // runs on the jtACCEPT worker thread — can link the validation.send @@ -934,6 +950,8 @@ RCLConsensus::Adaptor::validate(RCLCxLedger const& ledger, RCLTxSet const& txns, { namespace cs = telemetry::consensus::span; valSpan->setAttribute(cs::attr::ledgerSeq, static_cast(ledger.seq())); + valSpan->setAttribute( + cs::attr::ledgerHash, to_string(ledger.ledger->header().hash).c_str()); valSpan->setAttribute(cs::attr::proposing, proposing); // proposing implies a full validation (vfFullValidation is set on // the STValidation only when proposing — see below). diff --git a/src/xrpld/app/misc/NetworkOPs.cpp b/src/xrpld/app/misc/NetworkOPs.cpp index e9924c5d02..bfbde07fba 100644 --- a/src/xrpld/app/misc/NetworkOPs.cpp +++ b/src/xrpld/app/misc/NetworkOPs.cpp @@ -1335,6 +1335,15 @@ NetworkOPsImp::processTransaction( auto span = std::make_shared(txProcessSpan(transaction->getID())); span->setAttribute(tx_span::attr::txHash, to_string(transaction->getID()).c_str()); span->setAttribute(tx_span::attr::local, bLocal); + if (auto const& stx = transaction->getSTransaction()) + { + if (auto const* fmt = TxFormats::getInstance().findByType(stx->getTxnType())) + span->setAttribute(tx_span::attr::txType, fmt->getName().c_str()); + span->setAttribute( + tx_span::attr::fee, static_cast(stx->getFieldAmount(sfFee).xrp().drops())); + span->setAttribute( + tx_span::attr::sequence, static_cast(stx->getSeqProxy().value())); + } auto ev = jobQueue_.makeLoadEvent(JtTxnProc, "ProcessTXN"); @@ -1555,6 +1564,12 @@ NetworkOPsImp::apply(std::unique_lock& batchLock) auto newOL = registry_.get().getOpenLedger().current(); for (TransactionStatus const& e : transactions) { + if (e.span && *e.span) + { + e.span->setAttribute( + telemetry::tx_span::attr::terResult, transToken(e.result).c_str()); + e.span->setAttribute(telemetry::tx_span::attr::applied, e.applied); + } e.transaction->clearSubmitResult(); if (e.applied) diff --git a/src/xrpld/app/misc/TxSpanNames.h b/src/xrpld/app/misc/TxSpanNames.h index 965b15ddf4..443171c5c9 100644 --- a/src/xrpld/app/misc/TxSpanNames.h +++ b/src/xrpld/app/misc/TxSpanNames.h @@ -55,6 +55,16 @@ inline constexpr auto suppressed = makeStr("suppressed"); inline constexpr auto txStatus = makeStr("tx_status"); /// "peer_version" — version of peer that sent the tx. inline constexpr auto peerVersion = makeStr("peer_version"); +/// "tx_type" — transaction type name (e.g., "Payment", "OfferCreate"). +inline constexpr auto txType = makeStr("tx_type"); +/// "fee" — transaction fee in drops. +inline constexpr auto fee = makeStr("fee"); +/// "sequence" — transaction sequence number. +inline constexpr auto sequence = makeStr("sequence"); +/// "ter_result" — engine result code after application. +inline constexpr auto terResult = makeStr("ter_result"); +/// "applied" — whether the transaction was applied to the ledger. +inline constexpr auto applied = makeStr("applied"); } // namespace attr // ===== Attribute values ==================================================== diff --git a/src/xrpld/app/misc/detail/TxQ.cpp b/src/xrpld/app/misc/detail/TxQ.cpp index f30c0762d9..352bef6bd9 100644 --- a/src/xrpld/app/misc/detail/TxQ.cpp +++ b/src/xrpld/app/misc/detail/TxQ.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -741,6 +742,8 @@ TxQ::apply( auto span = SpanGuard::span(TraceCategory::Transactions, txq_span::prefix::txq, txq_span::op::enqueue); span.setAttribute(txq_span::attr::txHash, to_string(tx->getTransactionID()).c_str()); + if (auto const* fmt = TxFormats::getInstance().findByType(tx->getTxnType())) + span.setAttribute(txq_span::attr::txType, fmt->getName().c_str()); NumberSO const stNumberSO{view.rules().enabled(fixUniversalNumber)}; @@ -1477,6 +1480,7 @@ TxQ::accept(Application& app, OpenView& view) if (didApply) { + txSpan.setAttribute(txq_span::attr::txqStatus, txq_span::val::applied); // Remove the candidate from the queue JLOG(j_.debug()) << "Queued transaction " << candidateIter->txID << " applied successfully with " << transToken(txnResult) @@ -1497,12 +1501,14 @@ TxQ::accept(Application& app, OpenView& view) { account.dropPenalty = true; } + txSpan.setAttribute(txq_span::attr::txqStatus, txq_span::val::failed); JLOG(j_.debug()) << "Queued transaction " << candidateIter->txID << " failed with " << transToken(txnResult) << ". Remove from queue."; candidateIter = eraseAndAdvance(candidateIter); } else { + txSpan.setAttribute(txq_span::attr::txqStatus, txq_span::val::retried); JLOG(j_.debug()) << "Queued transaction " << candidateIter->txID << " failed with " << transToken(txnResult) << ". Leave in queue." << " Applied: " << didApply << ". Flags: " << candidateIter->flags; @@ -1598,6 +1604,7 @@ TxQ::accept(Application& app, OpenView& view) } } XRPL_ASSERT(byFee_.size() == startingSize, "xrpl::TxQ::accept : byFee size match"); + span.setAttribute(txq_span::attr::ledgerChanged, ledgerChanged); return ledgerChanged; } diff --git a/src/xrpld/app/misc/detail/TxQSpanNames.h b/src/xrpld/app/misc/detail/TxQSpanNames.h index 4268a8f5b4..9292ba1e7c 100644 --- a/src/xrpld/app/misc/detail/TxQSpanNames.h +++ b/src/xrpld/app/misc/detail/TxQSpanNames.h @@ -93,6 +93,8 @@ inline constexpr auto terCode = makeStr("ter_code"); inline constexpr auto retriesRemaining = makeStr("retries_remaining"); /// "num_cleared" — entries cleared in batch. inline constexpr auto numCleared = makeStr("num_cleared"); +/// "tx_type" — transaction type name (e.g., "Payment", "OfferCreate"). +inline constexpr auto txType = makeStr("tx_type"); } // namespace attr // ===== Attribute values ==================================================== diff --git a/src/xrpld/consensus/ConsensusSpanNames.h b/src/xrpld/consensus/ConsensusSpanNames.h index 74a8a74cde..e9b08b8439 100644 --- a/src/xrpld/consensus/ConsensusSpanNames.h +++ b/src/xrpld/consensus/ConsensusSpanNames.h @@ -189,6 +189,11 @@ inline constexpr auto traceStrategy = makeStr("trace_strategy"); inline constexpr auto modeOld = makeStr("mode_old"); inline constexpr auto modeNew = makeStr("mode_new"); +/// "is_bow_out" — whether this proposal is a bow-out (resigning from round). +inline constexpr auto isBowOut = makeStr("is_bow_out"); +/// "ledger_hash" — full hash of the ledger being validated/accepted. +inline constexpr auto ledgerHash = makeStr("ledger_hash"); + /// Transaction/dispute attrs used in consensus accept spans. inline constexpr auto txId = makeStr("tx_id"); inline constexpr auto disputeOurVote = makeStr("dispute_our_vote"); diff --git a/src/xrpld/overlay/detail/PeerImp.cpp b/src/xrpld/overlay/detail/PeerImp.cpp index 33dcf38957..9252ef93af 100644 --- a/src/xrpld/overlay/detail/PeerImp.cpp +++ b/src/xrpld/overlay/detail/PeerImp.cpp @@ -55,6 +55,7 @@ #include #include #include +#include #include #include #include @@ -1333,6 +1334,8 @@ PeerImp::handleTransaction( auto span = std::make_shared(txReceiveSpan(txID, *m)); span->setAttribute(tx_span::attr::txHash, to_string(txID).c_str()); span->setAttribute(tx_span::attr::peerId, static_cast(id_)); + if (auto const* fmt = TxFormats::getInstance().findByType(stx->getTxnType())) + span->setAttribute(tx_span::attr::txType, fmt->getName().c_str()); if (auto const version = getVersion(); !version.empty()) span->setAttribute(tx_span::attr::peerVersion, version.c_str()); // Set defaults for conditional attributes so they are always present diff --git a/src/xrpld/rpc/detail/PathFindSpanNames.h b/src/xrpld/rpc/detail/PathFindSpanNames.h index 17a1c12858..3a12937fbc 100644 --- a/src/xrpld/rpc/detail/PathFindSpanNames.h +++ b/src/xrpld/rpc/detail/PathFindSpanNames.h @@ -84,6 +84,12 @@ inline constexpr auto numPaths = makeStr("pathfind_num_paths"); inline constexpr auto numRequests = makeStr("pathfind_num_requests"); /// "pathfind_ledger_index" — pathfind target ledger index. inline constexpr auto ledgerIndex = makeStr("pathfind_ledger_index"); +/// "pathfind_dest_amount" — requested destination amount as string. +inline constexpr auto destAmount = makeStr("pathfind_dest_amount"); +/// "pathfind_dest_currency" — destination currency code. +inline constexpr auto destCurrency = makeStr("pathfind_dest_currency"); +/// "pathfind_num_source_assets" — candidate source assets count. +inline constexpr auto numSourceAssets = makeStr("pathfind_num_source_assets"); } // namespace attr } // namespace xrpl::telemetry::pathfind_span diff --git a/src/xrpld/rpc/detail/PathRequest.cpp b/src/xrpld/rpc/detail/PathRequest.cpp index 68769df333..c23bb7cba5 100644 --- a/src/xrpld/rpc/detail/PathRequest.cpp +++ b/src/xrpld/rpc/detail/PathRequest.cpp @@ -594,6 +594,8 @@ PathRequest::findPaths( auto span = SpanGuard::span( TraceCategory::Rpc, pathfind_span::prefix::pathfind, pathfind_span::op::discover); span.setAttribute(pathfind_span::attr::searchLevel, static_cast(level)); + span.setAttribute( + pathfind_span::attr::numSourceAssets, static_cast(sourceAssets.size())); std::int64_t totalPaths = 0; for (auto const& asset : sourceAssets) @@ -740,6 +742,8 @@ PathRequest::doUpdate( auto span = SpanGuard::span( TraceCategory::Rpc, pathfind_span::prefix::pathfind, pathfind_span::op::compute); span.setAttribute(pathfind_span::attr::fast, fast); + span.setAttribute(pathfind_span::attr::destAmount, saDstAmount_.getFullText().c_str()); + span.setAttribute(pathfind_span::attr::destCurrency, to_string(saDstAmount_.asset()).c_str()); JLOG(journal_.debug()) << iIdentifier_ << " update " << (fast ? "fast" : "normal"); diff --git a/src/xrpld/rpc/detail/RPCHandler.cpp b/src/xrpld/rpc/detail/RPCHandler.cpp index d1fb9e5c93..91ec2f65d8 100644 --- a/src/xrpld/rpc/detail/RPCHandler.cpp +++ b/src/xrpld/rpc/detail/RPCHandler.cpp @@ -185,6 +185,7 @@ callMethod(JsonContext& context, Method method, std::string const& name, Object& JLOG(context.j.debug()) << "RPC call " << name << " completed in " << ((end - start).count() / 1000000000.0) << "seconds"; perfLog.rpcFinish(name, curId); + span.setAttribute(rpc_span::attr::loadType, context.loadType.label().c_str()); // Status::operator bool() returns true when there IS an error // (code_ != OK), so the ternary correctly maps error->error, ok->success. span.setAttribute( diff --git a/src/xrpld/rpc/detail/RpcSpanNames.h b/src/xrpld/rpc/detail/RpcSpanNames.h index bce164cd1e..e7bae84c2f 100644 --- a/src/xrpld/rpc/detail/RpcSpanNames.h +++ b/src/xrpld/rpc/detail/RpcSpanNames.h @@ -144,6 +144,12 @@ inline constexpr auto rpcRole = makeStr("rpc_role"); inline constexpr auto rpcStatus = makeStr("rpc_status"); /// "request_payload_size" — bytes of inbound request payload. inline constexpr auto requestPayloadSize = makeStr("request_payload_size"); +/// "is_batch" — whether request is a JSON-RPC batch. +inline constexpr auto isBatch = makeStr("is_batch"); +/// "batch_size" — number of sub-requests in a batch. +inline constexpr auto batchSize = makeStr("batch_size"); +/// "load_type" — resource cost category after execution. +inline constexpr auto loadType = makeStr("load_type"); } // namespace attr // ===== Attribute values ==================================================== diff --git a/src/xrpld/rpc/detail/ServerHandler.cpp b/src/xrpld/rpc/detail/ServerHandler.cpp index cc7d606d69..36832b4080 100644 --- a/src/xrpld/rpc/detail/ServerHandler.cpp +++ b/src/xrpld/rpc/detail/ServerHandler.cpp @@ -63,6 +63,7 @@ #include #include #include +#include #include #include #include @@ -428,6 +429,15 @@ ServerHandler::processSession( json::Value const& jv) { auto span = SpanGuard::span(TraceCategory::Rpc, rpc_span::prefix::rpc, rpc_span::op::wsMessage); + if (jv.isMember(jss::command) && jv[jss::command].isString()) + { + span.setAttribute(rpc_span::attr::command, jv[jss::command].asString().c_str()); + } + else if (jv.isMember(jss::method) && jv[jss::method].isString()) + { + span.setAttribute(rpc_span::attr::command, jv[jss::method].asString().c_str()); + } + auto is = std::static_pointer_cast(session->appDefined); if (is->getConsumer().disconnect(journal_)) { @@ -576,9 +586,12 @@ ServerHandler::processSession( auto span = SpanGuard::span(TraceCategory::Rpc, rpc_span::prefix::rpc, rpc_span::op::httpRequest); + auto const requestBody = ::xrpl::buffersToString(session->request().body().data()); + span.setAttribute(rpc_span::attr::requestPayloadSize, static_cast(requestBody.size())); + processRequest( session->port(), - ::xrpl::buffersToString(session->request().body().data()), + requestBody, session->remoteAddress().atPort(0), makeOutput(*session), coro, @@ -657,6 +670,9 @@ ServerHandler::processRequest( } size = jsonOrig[jss::params].size(); } + span.setAttribute(rpc_span::attr::isBatch, batch); + if (batch) + span.setAttribute(rpc_span::attr::batchSize, static_cast(size)); json::Value reply(batch ? json::ValueType::Array : json::ValueType::Object); auto const start(std::chrono::high_resolution_clock::now());