diff --git a/.github/scripts/levelization/results/loops.txt b/.github/scripts/levelization/results/loops.txt index 66906f48c6..16e62bb0a7 100644 --- a/.github/scripts/levelization/results/loops.txt +++ b/.github/scripts/levelization/results/loops.txt @@ -20,7 +20,7 @@ Loop: xrpld.app xrpld.shamap xrpld.shamap > xrpld.app Loop: xrpld.app xrpld.telemetry - xrpld.telemetry == xrpld.app + xrpld.telemetry ~= xrpld.app Loop: xrpld.overlay xrpld.rpc xrpld.rpc ~= xrpld.overlay diff --git a/OpenTelemetryPlan/Phase3_taskList.md b/OpenTelemetryPlan/Phase3_taskList.md index 94de0e9682..18146dff02 100644 --- a/OpenTelemetryPlan/Phase3_taskList.md +++ b/OpenTelemetryPlan/Phase3_taskList.md @@ -166,27 +166,54 @@ ## Task 3.6: Context Propagation in Transaction Relay +**Status**: COMPLETE + **Objective**: Ensure trace context flows correctly when transactions are relayed between peers, creating linked spans across nodes. -**What to do**: +**What was done**: -- Verify the relay path injects trace context: - - When `PeerImp` relays a transaction, the `TMTransaction` message should carry `trace_context` - - When a remote peer receives it, the context is extracted and used as parent +- **TX send side**: `NetworkOPs::apply()` now injects the tx.process span's trace + context into the outgoing `TMTransaction` protobuf before relay, using + `telemetry::injectSpanContext()`. The receiving node's `txReceiveSpan()` (already + wired in PeerImp) extracts the parent span_id and creates the tx.receive span + as a child of the sender's tx.process span. -- Test context propagation: - - Manually verify with 2+ node setup that trace IDs match across nodes - - Confirm parent-child span relationships are correct in Tempo +- **Proposal send/receive**: `RCLConsensus::Adaptor::propose()` injects the + current thread's active span context into the `TMProposeSet` protobuf via + `telemetry::injectToProtobuf()`. 
PeerImp creates a + `consensus.proposal.receive` span that extracts the sender's trace context + as parent (via `ConsensusReceiveTracing.h`). -- Handle edge cases: - - Missing trace context (older peers): create new root span - - Corrupted trace context: log warning, create new root span - - Sampled-out traces: respect trace flags +- **Validation send/receive**: `RCLConsensus::Adaptor::validate()` injects + the current thread's active span context into the `TMValidation` protobuf. + PeerImp creates a `consensus.validation.receive` span that extracts the + sender's trace context as parent. + +- **Edge cases**: Missing trace context (older peers) degrades gracefully to + standalone spans. Invalid/corrupted context is treated as absent. Trace + flags are propagated and respected. + +**New infrastructure**: + +- `SpanGuard::getTraceBytes()` — extracts raw trace_id/span_id/trace_flags + from a span without exposing OTel types. Safe to call from any thread. +- `PropagationHelpers.h` — `injectSpanContext(SpanGuard const&, proto)` bridge + between SpanGuard and protobuf TraceContext. +- `TraceContextPropagator.h` (existing header) — `injectToProtobuf(ctx, proto)`, now used for + same-thread injection via OTel RuntimeContext (used in propose/validate). +- `ConsensusReceiveTracing.h` — `proposalReceiveSpan()` and + `validationReceiveSpan()` helper functions that create receive spans with + optional parent context extraction from incoming protobuf messages. 
**Key modified files**: -- `src/xrpld/overlay/detail/PeerImp.cpp` -- `src/xrpld/overlay/detail/OverlayImpl.cpp` (if relay method needs context param) +- `src/xrpld/app/misc/NetworkOPs.cpp` — tx relay injection +- `src/xrpld/app/consensus/RCLConsensus.cpp` — proposal/validation send injection +- `src/xrpld/overlay/detail/PeerImp.cpp` — proposal/validation receive spans +- `include/xrpl/telemetry/SpanGuard.h` — `TraceBytes` struct, `getTraceBytes()` +- `src/libxrpl/telemetry/SpanGuard.cpp` — `getTraceBytes()` implementation +- `src/xrpld/telemetry/PropagationHelpers.h` — inject helpers (new file) +- `src/xrpld/telemetry/ConsensusReceiveTracing.h` — receive span helpers (new file) **Reference**: @@ -390,7 +417,7 @@ This gives the best of both worlds: guaranteed cross-node correlation via determ - [ ] `tx.receive` and `tx.process` spans have deterministic trace_id = `txHash[0:16]` - [ ] All nodes handling the same transaction produce spans under the same trace_id -- [ ] Protobuf `span_id` propagation still works when available (parent-child ordering) +- [x] Protobuf `span_id` propagation still works when available (parent-child ordering) - [ ] Missing protobuf context (old peer) degrades gracefully to sibling spans, not lost traces - [ ] `xrpl.tx.trace_strategy` attribute set to `"deterministic"` on all tx spans - [ ] Trace queryable by tx hash (truncate hash → trace_id → direct lookup in Tempo) @@ -458,9 +485,9 @@ This gives the best of both worlds: guaranteed cross-node correlation via determ **Exit Criteria** (from [06-implementation-phases.md §6.11.3](./06-implementation-phases.md)): -- [ ] Transaction traces span across nodes -- [ ] Trace context in Protocol Buffer messages +- [x] Transaction traces span across nodes +- [x] Trace context in Protocol Buffer messages - [ ] HashRouter deduplication visible in traces - [ ] <5% overhead on transaction throughput -- [ ] Deterministic trace_id: same trace_id for same tx across all nodes -- [ ] Protobuf span_id 
propagation preserves parent-child ordering when available +- [x] Deterministic trace_id: same trace_id for same tx across all nodes +- [x] Protobuf span_id propagation preserves parent-child ordering when available diff --git a/include/xrpl/telemetry/SpanGuard.h b/include/xrpl/telemetry/SpanGuard.h index 438d766335..097eae2312 100644 --- a/include/xrpl/telemetry/SpanGuard.h +++ b/include/xrpl/telemetry/SpanGuard.h @@ -20,6 +20,7 @@ | + hashSpan(cat, name, hash) [static] | | + hashSpan(cat, name, hash, parent) [static] | | + captureContext() : SpanContext | + | + getTraceBytes() : TraceBytes | | + setAttribute(key, value) | | + setOk() / setError(desc) | | + addEvent(name) | @@ -116,6 +117,7 @@ exposed — all interaction goes through the public methods. */ +#include #include #include #include @@ -133,6 +135,26 @@ namespace xrpl::telemetry { */ enum class TraceCategory { Rpc, Transactions, Consensus, Peer, Ledger }; +/** Raw trace context bytes for cross-node propagation. + + Holds the binary trace_id, span_id, and trace_flags extracted from + an active span. Used by protocol-layer code to inject trace context + into outgoing protobuf messages without depending on OTel types. + + @see SpanGuard::getTraceBytes(), injectSpanContext() in PropagationHelpers.h +*/ +struct TraceBytes +{ + /// 16-byte W3C trace identifier. + std::array traceId{}; + /// 8-byte span identifier of the current span. + std::array spanId{}; + /// W3C trace flags (bit 0 = sampled). + std::uint8_t traceFlags{0}; + /// True only when the fields above were copied from an active span; otherwise they keep their zero defaults. + bool valid{false}; +}; + /** Key-value pair for span event attributes. Used by addEvent(name, attrs) to attach structured metadata to events. */ @@ -295,6 +317,18 @@ public: [[nodiscard]] SpanContext captureContext() const; + /** Extract raw trace context bytes from this span for propagation. + + Unlike captureContext() which captures the thread-local runtime + context, this method reads the span's own SpanContext directly. 
+ Safe to call from any thread that holds a reference to this guard. + + @return A TraceBytes struct with valid=true if the span is active + and has a valid context, or valid=false otherwise. + */ + [[nodiscard]] TraceBytes + getTraceBytes() const; + // --- Attribute setters (explicit overloads, no OTel types) --------- /** Set a string attribute. No-op on a null guard. */ @@ -431,6 +465,11 @@ public: { return {}; } + [[nodiscard]] TraceBytes + getTraceBytes() const + { + return {}; // no-op variant: default TraceBytes has valid=false, so callers skip injection + } // NOLINTEND(readability-convert-member-functions-to-static) void diff --git a/include/xrpl/telemetry/TraceContextPropagator.h b/include/xrpl/telemetry/TraceContextPropagator.h index 26c9651c00..d0fb7d576d 100644 --- a/include/xrpl/telemetry/TraceContextPropagator.h +++ b/include/xrpl/telemetry/TraceContextPropagator.h @@ -4,8 +4,14 @@ Provides serialization/deserialization of OTel trace context to/from Protocol Buffer TraceContext messages (P2P cross-node propagation). + injectToProtobuf() is called by RCLConsensus when sending TMProposeSet and + TMValidation; TMTransaction injection goes through PropagationHelpers.h instead. Only compiled when XRPL_ENABLE_TELEMETRY is defined. + + @see PropagationHelpers.h (SpanGuard-based inject helper used for TMTransaction), + TxTracing.h (transaction receive-side extraction), + ConsensusReceiveTracing.h (proposal/validation receive-side). 
*/ #ifdef XRPL_ENABLE_TELEMETRY diff --git a/src/libxrpl/telemetry/SpanGuard.cpp b/src/libxrpl/telemetry/SpanGuard.cpp index c8673e6b08..c3e5353d8f 100644 --- a/src/libxrpl/telemetry/SpanGuard.cpp +++ b/src/libxrpl/telemetry/SpanGuard.cpp @@ -311,6 +311,26 @@ SpanGuard::captureContext() const return SpanContext(std::make_shared(ctx)); } +TraceBytes +SpanGuard::getTraceBytes() const +{ + if (!impl_ || !impl_->span) + return {}; /* no span attached: report valid=false */ + + auto const& spanCtx = impl_->span->GetContext(); + if (!spanCtx.IsValid()) + return {}; /* span context is not valid: nothing to propagate */ + + TraceBytes result; + auto const& tid = spanCtx.trace_id(); + std::memcpy(result.traceId.data(), tid.Id().data(), result.traceId.size()); /* length comes from the destination array, not a literal; NOTE(review): confirm <cstring> is included in this file */ + auto const& sid = spanCtx.span_id(); + std::memcpy(result.spanId.data(), sid.Id().data(), result.spanId.size()); + result.traceFlags = spanCtx.trace_flags().flags(); + result.valid = true; + return result; +} + // ===== Attribute setters =================================================== void diff --git a/src/xrpld/app/consensus/RCLConsensus.cpp b/src/xrpld/app/consensus/RCLConsensus.cpp index 3901fab87e..a09409ee64 100644 --- a/src/xrpld/app/consensus/RCLConsensus.cpp +++ b/src/xrpld/app/consensus/RCLConsensus.cpp @@ -63,9 +63,14 @@ #include #include #include +#include #include +#ifdef XRPL_ENABLE_TELEMETRY +#include +#endif + #include #include @@ -269,6 +274,16 @@ RCLConsensus::Adaptor::propose(RCLCxPeerPos::Proposal const& proposal) app_.getHashRouter().addSuppression(suppression); + // Inject the current thread's active span context (e.g. the + // consensus round span from Phase 4) so receiving peers can link + // their proposal.receive span as a child of this trace. NOTE(review): mutable_trace_context() marks the field present even with no active span; confirm injectToProtobuf() then leaves it empty. 
+#ifdef XRPL_ENABLE_TELEMETRY + { + auto ctx = opentelemetry::context::RuntimeContext::GetCurrent(); + telemetry::injectToProtobuf(ctx, *prop.mutable_trace_context()); + } +#endif + app_.getOverlay().broadcast(prop); } @@ -982,6 +997,14 @@ RCLConsensus::Adaptor::validate(RCLCxLedger const& ledger, RCLTxSet const& txns, // Broadcast to all our peers: protocol::TMValidation val; val.set_validation(serialized.data(), serialized.size()); + // Inject the current thread's active span context so receiving + // peers can link their validation.receive span as a child. NOTE(review): mutable_trace_context() marks the field present even with no active span; confirm injectToProtobuf() then leaves it empty. +#ifdef XRPL_ENABLE_TELEMETRY + { + auto ctx = opentelemetry::context::RuntimeContext::GetCurrent(); + telemetry::injectToProtobuf(ctx, *val.mutable_trace_context()); + } +#endif app_.getOverlay().broadcast(val); // Publish to all our subscribers: diff --git a/src/xrpld/app/misc/NetworkOPs.cpp b/src/xrpld/app/misc/NetworkOPs.cpp index 17972c8fa6..ff7d24dd26 100644 --- a/src/xrpld/app/misc/NetworkOPs.cpp +++ b/src/xrpld/app/misc/NetworkOPs.cpp @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -1703,6 +1704,10 @@ NetworkOPsImp::apply(std::unique_lock& batchLock) tx.set_receivetimestamp( registry_.get().getTimeKeeper().now().time_since_epoch().count()); tx.set_deferred(e.result == terQUEUED); + // Inject the tx.process span's trace context so the + // receiving node can link its tx.receive span as a child. The guard below keeps trace_context unset when the entry has no span or the span tests false. + if (e.span && *e.span) + telemetry::injectSpanContext(*e.span, *tx.mutable_trace_context()); // FIXME: This should be when we received it registry_.get().getOverlay().relay(e.transaction->getID(), tx, *toSkip); e.transaction->setBroadcast(); diff --git a/src/xrpld/app/misc/TxSpanNames.h b/src/xrpld/app/misc/TxSpanNames.h index c4d79ca960..2cfd6527d0 100644 --- a/src/xrpld/app/misc/TxSpanNames.h +++ b/src/xrpld/app/misc/TxSpanNames.h @@ -5,14 +5,14 @@ * Used by PeerImp (overlay) and NetworkOPs (app) for transaction * lifecycle spans. Built on StaticStr/join() from SpanNames.h. 
* - * Span hierarchy: + * Span hierarchy (cross-node propagation): * - * Node A (sender) Node B (receiver) - * +------------------+ +------------------+ - * | tx.process | protobuf | tx.receive | - * | injectTo | ---------> | extractFrom | - * | Protobuf() | trace_ctx | Protobuf() | - * +------------------+ +------------------+ + * Node A (sender) Node B (receiver) + * +---------------------+ +---------------------+ + * | tx.process | protobuf | tx.receive | + * | injectSpanContext | ---------> | txReceiveSpan() | + * | (PropagationHelp.) | trace_ctx | extracts parent | + * +---------------------+ +---------------------+ */ #include diff --git a/src/xrpld/overlay/detail/PeerImp.cpp b/src/xrpld/overlay/detail/PeerImp.cpp index 3dd83399c9..075e9c4273 100644 --- a/src/xrpld/overlay/detail/PeerImp.cpp +++ b/src/xrpld/overlay/detail/PeerImp.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -1959,10 +1960,9 @@ PeerImp::onMessage(std::shared_ptr const& m) app_.getTimeKeeper().closeTime(), calcNodeID(app_.getValidatorManifests().getMasterKey(publicKey))}); - auto span = std::make_shared(telemetry::SpanGuard::span( - telemetry::TraceCategory::Consensus, - telemetry::seg::consensus, - telemetry::cons_span::op::proposalReceive)); + // Create the receive span, parented to the sender's trace context when the + // message carries one (standalone span otherwise). shared_ptr keeps it alive across the job boundary. + auto span = std::make_shared(telemetry::proposalReceiveSpan(set)); span->setAttribute(telemetry::cons_span::attr::trusted, isTrusted); span->setAttribute(telemetry::cons_span::attr::round, static_cast(set.proposeseq())); @@ -2545,10 +2545,9 @@ PeerImp::onMessage(std::shared_ptr const& m) return; } - auto span = std::make_shared(telemetry::SpanGuard::span( - telemetry::TraceCategory::Consensus, - telemetry::seg::consensus, - telemetry::cons_span::op::validationReceive)); + // Create the receive span, parented to the sender's trace context when the + // message carries one (standalone span otherwise). 
shared_ptr keeps it alive across the job boundary. + auto span = std::make_shared(telemetry::validationReceiveSpan(*m)); span->setAttribute(telemetry::cons_span::attr::trusted, isTrusted); if (val->isFieldPresent(sfLedgerSequence)) { diff --git a/src/xrpld/telemetry/ConsensusReceiveTracing.h b/src/xrpld/telemetry/ConsensusReceiveTracing.h new file mode 100644 index 0000000000..a53f2685f8 --- /dev/null +++ b/src/xrpld/telemetry/ConsensusReceiveTracing.h @@ -0,0 +1,127 @@ +#pragma once + +/** Helper functions for creating consensus receive trace spans. + * + * Encapsulates the logic for creating SpanGuard instances for incoming + * proposal and validation messages with optional protobuf parent + * extraction. When the incoming message carries a TraceContext with a + * 16-byte trace_id and an 8-byte span_id, the receive span is created as a child of the + * sender's span, enabling cross-node trace correlation. + * + * Dependency diagram: + * + * protocol::TMProposeSet / TMValidation + * | + * v + * proposalReceiveSpan() / validationReceiveSpan() + * | + * +--- has trace_context? ----+ + * | yes | no + * v v + * SpanGuard::hashSpan() with SpanGuard::span() + * extracted parent context (standalone span) + * + * When XRPL_ENABLE_TELEMETRY is not defined, the functions return + * no-op SpanGuard instances and never read the message's trace_context. + * + * Usage: + * @code + * // In PeerImp::onMessage(TMProposeSet): + * auto span = telemetry::proposalReceiveSpan(*m); + * span.setAttribute(...); + * @endcode + * + * @note The span names are duplicated here as literals (detail:: constants and the + * fallback SpanGuard::span() arguments). NOTE(review): PeerImp previously used + * telemetry::seg::consensus and telemetry::cons_span::op::*Receive; consider reusing those. + */ + +#include +#include + +namespace xrpl { +namespace telemetry { + +// Inline span name constants for consensus receive spans. +// Phase 4 will provide these via ConsensusSpanNames.h; these are +// temporary definitions for the propagation infrastructure. 
+namespace detail { +inline constexpr std::string_view proposalReceiveName = "consensus.proposal.receive"; +inline constexpr std::string_view validationReceiveName = "consensus.validation.receive"; +} // namespace detail + +/** Create a "consensus.proposal.receive" span for an incoming proposal. + * + * If the message carries a TraceContext with a 16-byte trace_id and an + * 8-byte span_id, the receive span is created with the sender's context as parent. + * Otherwise a standalone span is created. + * + * @param msg The incoming TMProposeSet protobuf message. + * @return An active SpanGuard, or a null guard if tracing is disabled. + */ +inline SpanGuard +proposalReceiveSpan([[maybe_unused]] protocol::TMProposeSet const& msg) +{ +#ifdef XRPL_ENABLE_TELEMETRY + if (msg.has_trace_context()) + { + auto const& tc = msg.trace_context(); + if (tc.has_span_id() && tc.span_id().size() == 8 && tc.has_trace_id() && + tc.trace_id().size() == 16) + { + // Create a child span using the sender's trace_id and + // span_id as parent. Use hashSpan with the sender's + // trace_id so the receiving span shares the same trace. Absent trace_flags are passed as 0. + return SpanGuard::hashSpan( + TraceCategory::Consensus, + detail::proposalReceiveName, + reinterpret_cast(tc.trace_id().data()), + tc.trace_id().size(), + reinterpret_cast(tc.span_id().data()), + tc.span_id().size(), + tc.has_trace_flags() ? static_cast(tc.trace_flags()) + : std::uint8_t{0}); + } + } +#endif + // No usable propagated context (absent, wrong-sized ids, or telemetry compiled out) — create a standalone span. + return SpanGuard::span(TraceCategory::Consensus, "consensus", "proposal.receive"); +} + +/** Create a "consensus.validation.receive" span for an incoming validation. + * + * If the message carries a TraceContext with a 16-byte trace_id and an + * 8-byte span_id, the receive span is created with the sender's context as parent. + * Otherwise a standalone span is created. + * + * @param msg The incoming TMValidation protobuf message. + * @return An active SpanGuard, or a null guard if tracing is disabled. 
+ */ +inline SpanGuard +validationReceiveSpan([[maybe_unused]] protocol::TMValidation const& msg) +{ +#ifdef XRPL_ENABLE_TELEMETRY + if (msg.has_trace_context()) + { + auto const& tc = msg.trace_context(); + if (tc.has_span_id() && tc.span_id().size() == 8 && tc.has_trace_id() && + tc.trace_id().size() == 16) + { + return SpanGuard::hashSpan( + TraceCategory::Consensus, + detail::validationReceiveName, + reinterpret_cast(tc.trace_id().data()), + tc.trace_id().size(), + reinterpret_cast(tc.span_id().data()), + tc.span_id().size(), + tc.has_trace_flags() ? static_cast(tc.trace_flags()) + : std::uint8_t{0}); + } + } +#endif + // No usable propagated context (absent, wrong-sized ids, or telemetry compiled out) — create a standalone span. + return SpanGuard::span(TraceCategory::Consensus, "consensus", "validation.receive"); +} + +} // namespace telemetry +} // namespace xrpl diff --git a/src/xrpld/telemetry/PropagationHelpers.h b/src/xrpld/telemetry/PropagationHelpers.h new file mode 100644 index 0000000000..c051026b74 --- /dev/null +++ b/src/xrpld/telemetry/PropagationHelpers.h @@ -0,0 +1,62 @@ +#pragma once + +/** Helpers for injecting trace context into protobuf messages. + * + * Bridges the gap between SpanGuard (which hides OTel types) and the + * protobuf TraceContext message used for cross-node propagation. + * + * Dependency diagram: + * + * SpanGuard::getTraceBytes() protocol::TraceContext (proto) + * \ / + * +--- TraceBytes -----+ + * | | + * injectSpanContext(span, proto) + * + * @note When XRPL_ENABLE_TELEMETRY is disabled, getTraceBytes() returns + * {.valid=false}, so injectSpanContext returns early without writing to proto. + * + * Usage: + * @code + * // Send side — inject from a SpanGuard reference (NetworkOPs first checks the span, so trace_context stays unset without one): + * protocol::TMTransaction tx; + * // ... populate tx fields ... + * injectSpanContext(mySpanGuard, *tx.mutable_trace_context()); + * overlay.relay(txID, tx, toSkip); + * @endcode + * + * @see ConsensusReceiveTracing.h for receive-side extraction helpers. 
+ * @see TraceContextPropagator.h for low-level OTel context serialization. + */ + +#include +#include + +namespace xrpl { +namespace telemetry { + +/** Inject trace context from an active SpanGuard into a protobuf + * TraceContext message for cross-node propagation. + * + * Reads the span's trace_id, span_id, and trace_flags via + * getTraceBytes() and writes them into the protobuf fields. + * Works only from the guard passed in, so the caller need not be on the thread that started the span. + * No-op if the span is null or inactive: proto is then left unmodified. + * + * @param span The active SpanGuard whose context to propagate. + * @param proto The protobuf TraceContext to populate (trace_id, span_id and trace_flags are overwritten). + */ +inline void +injectSpanContext(SpanGuard const& span, protocol::TraceContext& proto) +{ + auto const bytes = span.getTraceBytes(); + if (!bytes.valid) + return; + + proto.set_trace_id(bytes.traceId.data(), bytes.traceId.size()); + proto.set_span_id(bytes.spanId.data(), bytes.spanId.size()); + proto.set_trace_flags(bytes.traceFlags); +} + +} // namespace telemetry +} // namespace xrpl