diff --git a/OpenTelemetryPlan/03-implementation-strategy.md b/OpenTelemetryPlan/03-implementation-strategy.md index f607ef5f3b..19a5114770 100644 --- a/OpenTelemetryPlan/03-implementation-strategy.md +++ b/OpenTelemetryPlan/03-implementation-strategy.md @@ -12,10 +12,10 @@ The telemetry implementation follows rippled's existing code organization patter ``` include/xrpl/ ├── telemetry/ -│ ├── Telemetry.h # Main telemetry interface +│ ├── Telemetry.h # Main telemetry interface (global singleton) │ ├── TelemetryConfig.h # Configuration structures │ ├── TraceContext.h # Context propagation utilities -│ ├── SpanGuard.h # RAII span management with discard() +│ ├── SpanGuard.h # RAII span management with factory methods + discard() │ ├── DiscardFlag.h # Thread-local discard flag │ └── SpanAttributes.h # Attribute helper functions @@ -25,11 +25,6 @@ src/libxrpl/ │ ├── TelemetryConfig.cpp # Config parsing │ ├── TraceContext.cpp # Context serialization │ └── NullTelemetry.cpp # No-op implementation - -src/xrpld/ -├── telemetry/ -│ ├── TracingInstrumentation.h # Instrumentation macros -│ └── TracingInstrumentation.cpp ``` --- @@ -315,20 +310,20 @@ flowchart TD ### 3.7.3 Conditional Instrumentation -```cpp -// Compile-time feature flag -#ifndef XRPL_ENABLE_TELEMETRY -// Zero-cost when disabled -#define XRPL_TRACE_SPAN(t, n) ((void)0) -#endif +SpanGuard's static factory methods handle both compile-time and runtime +checks internally. When `XRPL_ENABLE_TELEMETRY` is not defined, the +entire SpanGuard class compiles to a no-op stub with empty method bodies. +When it is defined, the factory methods check the global Telemetry +instance and the relevant component filter before creating a span: -// Runtime component filtering -if (telemetry.shouldTracePeer()) -{ - XRPL_TRACE_SPAN(telemetry, "peer.message.receive"); - // ... instrumentation -} -// No overhead when component tracing disabled +```cpp +// SpanGuard factory methods handle all conditional logic internally. 
+// When XRPL_ENABLE_TELEMETRY is not defined, these are no-ops. +// When defined, they check Telemetry::getInstance() and the +// component filter (e.g. shouldTracePeer()) at runtime. +auto span = telemetry::SpanGuard::peerSpan("peer.message.receive"); +span.setAttribute("xrpl.peer.id", peerId); +// No overhead when telemetry is disabled at compile time or runtime ``` --- @@ -351,7 +346,7 @@ This section provides a detailed assessment of how intrusive the OpenTelemetry i | Component | Files Modified | Lines Added | Lines Changed | Architectural Impact | | --------------------- | -------------- | ----------- | ------------- | -------------------- | -| **Core Telemetry** | 5 new files | ~800 | 0 | None (new module) | +| **Core Telemetry** | 7 new files | ~800 | 0 | None (new module) | | **Application Init** | 2 files | ~30 | ~5 | Minimal | | **RPC Layer** | 3 files | ~80 | ~20 | Minimal | | **Transaction Relay** | 4 files | ~120 | ~40 | Low | @@ -361,7 +356,7 @@ This section provides a detailed assessment of how intrusive the OpenTelemetry i | **PathFinding** | 2 | ~80 | ~5 | Minimal | | **TxQ/Fee** | 2 | ~60 | ~5 | Minimal | | **Validator/Amend** | 3 | ~40 | ~5 | Minimal | -| **Total** | **~28 files** | **~1,490** | **~120** | **Low** | +| **Total** | **~27 files** | **~1,490** | **~120** | **Low** | ### 3.9.2 Detailed File Impact @@ -381,16 +376,15 @@ pie title Code Changes by Component #### New Files (No Impact on Existing Code) -| File | Lines | Purpose | -| ---------------------------------------------- | ----- | --------------------------------------- | -| `include/xrpl/telemetry/Telemetry.h` | ~160 | Main interface | -| `include/xrpl/telemetry/SpanGuard.h` | ~120 | RAII wrapper + discard | -| `include/xrpl/telemetry/DiscardFlag.h` | ~28 | Thread-local discard flag | -| `include/xrpl/telemetry/TraceContext.h` | ~80 | Context propagation | -| `src/xrpld/telemetry/TracingInstrumentation.h` | ~60 | Macros | -| `src/libxrpl/telemetry/Telemetry.cpp` | ~400 | 
Implementation + FilteringSpanProcessor | -| `src/libxrpl/telemetry/TelemetryConfig.cpp` | ~60 | Config parsing | -| `src/libxrpl/telemetry/NullTelemetry.cpp` | ~40 | No-op implementation | +| File | Lines | Purpose | +| ------------------------------------------- | ----- | ----------------------------------------------------- | +| `include/xrpl/telemetry/Telemetry.h` | ~160 | Main interface (global singleton) | +| `include/xrpl/telemetry/SpanGuard.h` | ~250 | RAII wrapper + factory methods + discard + no-op stub | +| `include/xrpl/telemetry/DiscardFlag.h` | ~28 | Thread-local discard flag | +| `include/xrpl/telemetry/TraceContext.h` | ~80 | Context propagation | +| `src/libxrpl/telemetry/Telemetry.cpp` | ~400 | Implementation + FilteringSpanProcessor | +| `src/libxrpl/telemetry/TelemetryConfig.cpp` | ~60 | Config parsing | +| `src/libxrpl/telemetry/NullTelemetry.cpp` | ~40 | No-op implementation | #### Modified Files (Existing Rippled Code) @@ -493,18 +487,24 @@ void ServerHandler::onRequest(...) { send(result); } -// After (only ~10 lines added) +// After (only ~4 lines added) void ServerHandler::onRequest(...) { - XRPL_TRACE_RPC(app_.getTelemetry(), "rpc.request"); // +1 line - XRPL_TRACE_SET_ATTR("xrpl.rpc.command", command); // +1 line + auto span = telemetry::SpanGuard::rpcSpan("rpc.request"); // +1 line + span.setAttribute("xrpl.rpc.command", command); // +1 line auto result = processRequest(req); - XRPL_TRACE_SET_ATTR("xrpl.rpc.status", status); // +1 line + span.setAttribute("xrpl.rpc.status", status); // +1 line send(result); } ``` +SpanGuard factory methods (`rpcSpan`, `txSpan`, `consensusSpan`, etc.) +access the global `Telemetry` instance internally and check the relevant +component filter (`shouldTraceRpc()`, etc.) before creating a span. The +public SpanGuard header has zero `opentelemetry/` includes -- all OTel +types are hidden behind the pimpl idiom. 
+ **Consensus Instrumentation (Medium Intrusiveness):** ```cpp @@ -515,11 +515,11 @@ void RCLConsensusAdaptor::startRound(...) { // After (context storage required) void RCLConsensusAdaptor::startRound(...) { - XRPL_TRACE_CONSENSUS(app_.getTelemetry(), "consensus.round"); - XRPL_TRACE_SET_ATTR("xrpl.consensus.ledger.seq", seq); + auto span = telemetry::SpanGuard::consensusSpan("consensus.round"); + span.setAttribute("xrpl.consensus.ledger.seq", seq); // Store context for child spans in phase transitions - currentRoundContext_ = _xrpl_guard_->context(); // New member variable + currentRoundContext_ = span.context(); // New member variable // ... existing logic unchanged } diff --git a/OpenTelemetryPlan/04-code-samples.md b/OpenTelemetryPlan/04-code-samples.md index 5dfdbc32c1..afc896b4e3 100644 --- a/OpenTelemetryPlan/04-code-samples.md +++ b/OpenTelemetryPlan/04-code-samples.md @@ -181,271 +181,227 @@ setup_Telemetry( --- -## 4.2 RAII Span Guard +## 4.2 RAII Span Guard with Factory Methods + +SpanGuard is a self-contained RAII wrapper that creates, activates, and +ends trace spans. It uses the pimpl idiom to hide all OpenTelemetry +types -- the public header has **zero `opentelemetry/` includes**. +Callers never interact with OTel SDK types directly. + +SpanGuard provides static factory methods (`rpcSpan()`, `txSpan()`, +`consensusSpan()`, etc.) that access the global `Telemetry` singleton +internally. Each factory checks both the runtime enable flag and the +relevant component filter before creating a span. + +When `XRPL_ENABLE_TELEMETRY` is **not** defined, the entire SpanGuard +class compiles to a no-op stub with empty inline method bodies, giving +zero runtime cost when telemetry is compiled out. ```cpp // include/xrpl/telemetry/SpanGuard.h +// +// Public API -- no opentelemetry/ includes. +// OTel types are hidden behind the pimpl (Impl struct, defined in the +// #ifdef XRPL_ENABLE_TELEMETRY section at the bottom of the header). 
#pragma once -#include -#include -#include - +#include #include -#include namespace xrpl { namespace telemetry { -/** - * RAII guard for OpenTelemetry spans. - * - * Automatically ends the span on destruction and makes it the current - * span in the thread-local context. - */ +#ifdef XRPL_ENABLE_TELEMETRY + class SpanGuard { - opentelemetry::nostd::shared_ptr span_; - opentelemetry::trace::Scope scope_; + struct Impl; // pimpl -- defined in .cpp or + std::unique_ptr impl_; // in the guarded section below public: - /** - * Construct guard with span. - * The span becomes the current span in thread-local context. - * - * @note If span is nullptr (e.g., telemetry disabled), the guard - * becomes a no-op. All methods safely check for null before access. - */ - explicit SpanGuard( - opentelemetry::nostd::shared_ptr span) - : span_(span ? std::move(span) : nullptr) - , scope_(span_ ? opentelemetry::trace::Scope(span_) - : opentelemetry::trace::Scope( - opentelemetry::nostd::shared_ptr< - opentelemetry::trace::Span>(nullptr))) - { - } + // ═══════════════════════════════════════════════════════════════════ + // FACTORY METHODS (access global Telemetry internally) + // ═══════════════════════════════════════════════════════════════════ - // Non-copyable, non-movable + /** Create a span for RPC request handling. + * Returns a no-op guard if telemetry is disabled or + * shouldTraceRpc() is false. + */ + static SpanGuard rpcSpan(std::string_view name); + + /** Create a span for transaction processing. */ + static SpanGuard txSpan(std::string_view name); + + /** Create a span for consensus rounds. */ + static SpanGuard consensusSpan(std::string_view name); + + /** Create a span for peer-to-peer messages. */ + static SpanGuard peerSpan(std::string_view name); + + /** Create a span for ledger operations. */ + static SpanGuard ledgerSpan(std::string_view name); + + /** Create an uncategorized span (always created when enabled). 
*/ + static SpanGuard span(std::string_view name); + + // ═══════════════════════════════════════════════════════════════════ + // INSTANCE METHODS + // ═══════════════════════════════════════════════════════════════════ + + SpanGuard(); // constructs a no-op guard + ~SpanGuard(); + SpanGuard(SpanGuard&& other) noexcept; + SpanGuard& operator=(SpanGuard&&) = delete; SpanGuard(SpanGuard const&) = delete; SpanGuard& operator=(SpanGuard const&) = delete; - SpanGuard(SpanGuard&&) = delete; - SpanGuard& operator=(SpanGuard&&) = delete; - ~SpanGuard() - { - if (span_) - span_->End(); - } + /** Mark the span status as OK. */ + void setOk(); - /** Access the underlying span */ - opentelemetry::trace::Span& span() { return *span_; } - opentelemetry::trace::Span const& span() const { return *span_; } + /** Set an explicit status code. */ + void setStatus(int code, std::string_view description = ""); - /** Set span status to OK */ - void setOk() - { - span_->SetStatus(opentelemetry::trace::StatusCode::kOk); - } - - /** Set span status with code and description */ - void setStatus( - opentelemetry::trace::StatusCode code, - std::string_view description = "") - { - span_->SetStatus(code, std::string(description)); - } - - /** Set an attribute on the span */ + /** Set a key-value attribute on the span. */ template - void setAttribute(std::string_view key, T&& value) - { - span_->SetAttribute( - opentelemetry::nostd::string_view(key.data(), key.size()), - std::forward(value)); - } + void setAttribute(std::string_view key, T&& value); - /** Add an event to the span */ - void addEvent(std::string_view name) - { - span_->AddEvent(std::string(name)); - } + /** Add an event to the span timeline. */ + void addEvent(std::string_view name); - /** Record an exception on the span */ - void recordException(std::exception const& e) - { - span_->RecordException(e); - span_->SetStatus( - opentelemetry::trace::StatusCode::kError, - e.what()); - } + /** Record an exception and set error status. 
*/ + void recordException(std::exception const& e); - /** Get the current trace context */ - opentelemetry::context::Context context() const - { - return opentelemetry::context::RuntimeContext::GetCurrent(); - } + /** Get the current trace context (for cross-thread propagation). */ + // Returns an opaque context handle. + auto context() const; + + /** Discard this span -- dropped before export. */ + void discard(); }; -/** - * No-op span guard for when tracing is disabled. - * Provides the same interface but does nothing. - */ -class NullSpanGuard +#else // XRPL_ENABLE_TELEMETRY not defined -- zero-cost stub + +class SpanGuard { public: - NullSpanGuard() = default; + // Factory methods -- all return no-op guards + static SpanGuard rpcSpan(std::string_view) { return {}; } + static SpanGuard txSpan(std::string_view) { return {}; } + static SpanGuard consensusSpan(std::string_view) { return {}; } + static SpanGuard peerSpan(std::string_view) { return {}; } + static SpanGuard ledgerSpan(std::string_view) { return {}; } + static SpanGuard span(std::string_view) { return {}; } + // Instance methods -- all no-ops void setOk() {} - void setStatus(opentelemetry::trace::StatusCode, std::string_view = "") {} + void setStatus(int, std::string_view = "") {} template void setAttribute(std::string_view, T&&) {} void addEvent(std::string_view) {} void recordException(std::exception const&) {} - - /** Return a default empty context (matches SpanGuard interface) */ - opentelemetry::context::Context context() const - { - return opentelemetry::context::Context{}; - } + void discard() {} }; +#endif // XRPL_ENABLE_TELEMETRY + } // namespace telemetry } // namespace xrpl ``` --- -## 4.3 Instrumentation Macros +## 4.3 SpanGuard API Reference + +The previous macro-based approach (`TracingInstrumentation.h` with +`XRPL_TRACE_*` macros) has been replaced by SpanGuard's static factory +methods. 
This eliminates preprocessor macros from instrumentation call +sites and provides a cleaner, type-safe API. + +### 4.3.1 Factory Methods + +Each factory method accesses the global `Telemetry::getInstance()` +singleton internally and checks the corresponding component filter. +If telemetry is disabled (compile-time or runtime) or the component +filter is off, the factory returns a no-op guard whose methods +do nothing. + +| Factory Method | Component Filter | Typical Span Names | +| -------------------------------- | --------------------------- | ------------------------------------ | +| `SpanGuard::rpcSpan(name)` | `shouldTraceRpc()` | `rpc.request`, `rpc.command.submit` | +| `SpanGuard::txSpan(name)` | `shouldTraceTransactions()` | `tx.receive`, `tx.validate` | +| `SpanGuard::consensusSpan(name)` | `shouldTraceConsensus()` | `consensus.round`, `consensus.phase` | +| `SpanGuard::peerSpan(name)` | `shouldTracePeer()` | `peer.message.receive` | +| `SpanGuard::ledgerSpan(name)` | `shouldTraceLedger()` | `ledger.close`, `ledger.accept` | +| `SpanGuard::span(name)` | (always, if enabled) | `job.execute`, custom spans | + +### 4.3.2 Usage Pattern ```cpp -// src/xrpld/telemetry/TracingInstrumentation.h -#pragma once - -#include #include -namespace xrpl { -namespace telemetry { +void ServerHandler::onRequest(...) +{ + // Factory creates a span if RPC tracing is enabled, no-op otherwise. + // No Telemetry& reference needed -- accessed via global singleton. 
+ auto span = telemetry::SpanGuard::rpcSpan("rpc.request"); + span.setAttribute("xrpl.rpc.command", command); -// ═══════════════════════════════════════════════════════════════════════════ -// INSTRUMENTATION MACROS -// ═══════════════════════════════════════════════════════════════════════════ + auto result = processRequest(req); -#ifdef XRPL_ENABLE_TELEMETRY + span.setAttribute("xrpl.rpc.status", result.status()); + span.setOk(); + // span ended automatically when it goes out of scope +} +``` -// Start a span that is automatically ended when guard goes out of scope -#define XRPL_TRACE_SPAN(telemetry, name) \ - auto _xrpl_span_ = (telemetry).startSpan(name); \ - ::xrpl::telemetry::SpanGuard _xrpl_guard_(_xrpl_span_) +### 4.3.3 Compile-Time Disabled Behavior -// Start a span with specific kind -#define XRPL_TRACE_SPAN_KIND(telemetry, name, kind) \ - auto _xrpl_span_ = (telemetry).startSpan(name, kind); \ - ::xrpl::telemetry::SpanGuard _xrpl_guard_(_xrpl_span_) +When `XRPL_ENABLE_TELEMETRY` is **not** defined, SpanGuard compiles to +a zero-cost no-op stub. 
All factory methods return a default-constructed +guard, and all instance methods have empty bodies: -// Conditional span based on component -#define XRPL_TRACE_TX(telemetry, name) \ - std::optional<::xrpl::telemetry::SpanGuard> _xrpl_guard_; \ - if ((telemetry).shouldTraceTransactions()) { \ - _xrpl_guard_.emplace((telemetry).startSpan(name)); \ - } +```cpp +// When XRPL_ENABLE_TELEMETRY is not defined: +class SpanGuard +{ +public: + static SpanGuard rpcSpan(std::string_view) { return {}; } + static SpanGuard txSpan(std::string_view) { return {}; } + static SpanGuard consensusSpan(std::string_view) { return {}; } + static SpanGuard peerSpan(std::string_view) { return {}; } + static SpanGuard ledgerSpan(std::string_view) { return {}; } + static SpanGuard span(std::string_view) { return {}; } -#define XRPL_TRACE_CONSENSUS(telemetry, name) \ - std::optional<::xrpl::telemetry::SpanGuard> _xrpl_guard_; \ - if ((telemetry).shouldTraceConsensus()) { \ - _xrpl_guard_.emplace((telemetry).startSpan(name)); \ - } + void setOk() {} + void setStatus(int, std::string_view = "") {} + template + void setAttribute(std::string_view, T&&) {} + void addEvent(std::string_view) {} + void recordException(std::exception const&) {} + void discard() {} +}; +``` -#define XRPL_TRACE_RPC(telemetry, name) \ - std::optional<::xrpl::telemetry::SpanGuard> _xrpl_guard_; \ - if ((telemetry).shouldTraceRpc()) { \ - _xrpl_guard_.emplace((telemetry).startSpan(name)); \ - } +The compiler optimizes away all calls to these empty methods, producing +the same binary as if no instrumentation code were present. 
-#define XRPL_TRACE_PEER(telemetry, name) \ - std::optional<::xrpl::telemetry::SpanGuard> _xrpl_guard_; \ - if ((telemetry).shouldTracePeer()) { \ - _xrpl_guard_.emplace((telemetry).startSpan(name)); \ - } +### 4.3.4 Discard Support -#define XRPL_TRACE_LEDGER(telemetry, name) \ - std::optional<::xrpl::telemetry::SpanGuard> _xrpl_guard_; \ - if ((telemetry).shouldTraceLedger()) { \ - _xrpl_guard_.emplace((telemetry).startSpan(name)); \ - } +SpanGuard supports discarding a span before it is exported. This is +useful for filtering out uninteresting spans (e.g. transactions that +fail preflight checks) after the span has been started: -#define XRPL_TRACE_PATHFIND(telemetry, name) \ - std::optional<::xrpl::telemetry::SpanGuard> _xrpl_guard_; \ - if ((telemetry).shouldTracePathfind()) { \ - _xrpl_guard_.emplace((telemetry).startSpan(name)); \ - } - -#define XRPL_TRACE_TXQ(telemetry, name) \ - std::optional<::xrpl::telemetry::SpanGuard> _xrpl_guard_; \ - if ((telemetry).shouldTraceTxQ()) { \ - _xrpl_guard_.emplace((telemetry).startSpan(name)); \ - } - -#define XRPL_TRACE_VALIDATOR(telemetry, name) \ - std::optional<::xrpl::telemetry::SpanGuard> _xrpl_guard_; \ - if ((telemetry).shouldTraceValidator()) { \ - _xrpl_guard_.emplace((telemetry).startSpan(name)); \ - } - -#define XRPL_TRACE_AMENDMENT(telemetry, name) \ - std::optional<::xrpl::telemetry::SpanGuard> _xrpl_guard_; \ - if ((telemetry).shouldTraceAmendment()) { \ - _xrpl_guard_.emplace((telemetry).startSpan(name)); \ - } - -// Set attribute on current span (if exists). -// Works with both std::optional (from conditional macros) -// and bare SpanGuard (from XRPL_TRACE_SPAN). Uses 'if constexpr'-like -// dispatch via a helper that checks for .has_value(). 
-#define XRPL_TRACE_SET_ATTR(key, value) \ - do { \ - if constexpr (requires { _xrpl_guard_.has_value(); }) { \ - if (_xrpl_guard_.has_value()) \ - _xrpl_guard_->setAttribute(key, value); \ - } else { \ - _xrpl_guard_.setAttribute(key, value); \ - } \ - } while(0) - -// Record exception on current span -#define XRPL_TRACE_EXCEPTION(e) \ - do { \ - if constexpr (requires { _xrpl_guard_.has_value(); }) { \ - if (_xrpl_guard_.has_value()) \ - _xrpl_guard_->recordException(e); \ - } else { \ - _xrpl_guard_.recordException(e); \ - } \ - } while(0) - -#else // XRPL_ENABLE_TELEMETRY not defined - -#define XRPL_TRACE_SPAN(telemetry, name) ((void)0) -#define XRPL_TRACE_SPAN_KIND(telemetry, name, kind) ((void)0) -#define XRPL_TRACE_TX(telemetry, name) ((void)0) -#define XRPL_TRACE_CONSENSUS(telemetry, name) ((void)0) -#define XRPL_TRACE_RPC(telemetry, name) ((void)0) -#define XRPL_TRACE_PEER(telemetry, name) ((void)0) -#define XRPL_TRACE_LEDGER(telemetry, name) ((void)0) -#define XRPL_TRACE_PATHFIND(telemetry, name) ((void)0) -#define XRPL_TRACE_TXQ(telemetry, name) ((void)0) -#define XRPL_TRACE_VALIDATOR(telemetry, name) ((void)0) -#define XRPL_TRACE_AMENDMENT(telemetry, name) ((void)0) -#define XRPL_TRACE_SET_ATTR(key, value) ((void)0) -#define XRPL_TRACE_EXCEPTION(e) ((void)0) - -#endif // XRPL_ENABLE_TELEMETRY - -} // namespace telemetry -} // namespace xrpl +```cpp +auto span = telemetry::SpanGuard::txSpan("tx.process"); +auto result = preflight(tx); +if (result != tesSUCCESS) +{ + // Span is dropped before entering the batch export queue. 
+ span.discard(); + return result; +} ``` --- @@ -644,7 +600,7 @@ TraceContextPropagator::inject( ```cpp // src/xrpld/overlay/detail/PeerImp.cpp (modified) -#include +#include void PeerImp::handleTransaction( @@ -749,7 +705,7 @@ PeerImp::handleTransaction( ```cpp // src/xrpld/app/consensus/RCLConsensus.cpp (modified) -#include +#include void RCLConsensusAdaptor::startRound( @@ -759,20 +715,18 @@ RCLConsensusAdaptor::startRound( hash_set const& peers, bool proposing) { - XRPL_TRACE_CONSENSUS(app_.getTelemetry(), "consensus.round"); + auto span = telemetry::SpanGuard::consensusSpan("consensus.round"); - XRPL_TRACE_SET_ATTR("xrpl.consensus.ledger.prev", to_string(prevLedgerHash)); - XRPL_TRACE_SET_ATTR("xrpl.consensus.ledger.seq", + span.setAttribute("xrpl.consensus.ledger.prev", to_string(prevLedgerHash)); + span.setAttribute("xrpl.consensus.ledger.seq", static_cast(prevLedger.seq() + 1)); - XRPL_TRACE_SET_ATTR("xrpl.consensus.proposers", + span.setAttribute("xrpl.consensus.proposers", static_cast(peers.size())); - XRPL_TRACE_SET_ATTR("xrpl.consensus.mode", + span.setAttribute("xrpl.consensus.mode", proposing ? "proposing" : "observing"); // Store trace context for use in phase transitions - currentRoundContext_ = _xrpl_guard_.has_value() - ? _xrpl_guard_->context() - : opentelemetry::context::Context{}; + currentRoundContext_ = span.context(); // ... existing implementation ... 
} @@ -844,34 +798,22 @@ RCLConsensusAdaptor::peerProposal( ```cpp // src/xrpld/rpc/detail/ServerHandler.cpp (modified) -#include +#include void ServerHandler::onRequest( http_request_type&& req, std::function&& send) { - // Extract trace context from HTTP headers (W3C Trace Context) - auto parentCtx = telemetry::TraceContextPropagator::extractFromHeaders( - [&req](std::string_view name) -> std::optional { - // Beast's find() accepts a string_view for custom header lookup - auto it = req.find(name); - if (it != req.end()) - return std::string(it->value()); - return std::nullopt; - }); - - // Start request span - auto span = app_.getTelemetry().startSpan( - "rpc.request", - parentCtx, - opentelemetry::trace::SpanKind::kServer); - telemetry::SpanGuard guard(span); + // SpanGuard::rpcSpan() accesses the global Telemetry instance + // and checks shouldTraceRpc() internally. Returns a no-op guard + // if tracing is disabled. + auto span = telemetry::SpanGuard::rpcSpan("rpc.request"); // Add HTTP attributes - guard.setAttribute("http.method", std::string(req.method_string())); - guard.setAttribute("http.target", std::string(req.target())); - guard.setAttribute("http.user_agent", + span.setAttribute("http.method", std::string(req.method_string())); + span.setAttribute("http.target", std::string(req.target())); + span.setAttribute("http.user_agent", std::string(req[boost::beast::http::field::user_agent])); auto const startTime = std::chrono::steady_clock::now(); @@ -885,8 +827,8 @@ ServerHandler::onRequest( if (!reader.parse(body, jv)) { - guard.setStatus( - opentelemetry::trace::StatusCode::kError, + span.setStatus( + /* kError */ 2, "Invalid JSON"); sendError(send, "Invalid JSON"); return; @@ -899,13 +841,12 @@ ServerHandler::onRequest( ? 
jv["method"].asString() : "unknown"; - guard.setAttribute("xrpl.rpc.command", command); + span.setAttribute("xrpl.rpc.command", command); // Create child span for command execution - auto cmdSpan = app_.getTelemetry().startSpan( - "rpc.command." + command); { - telemetry::SpanGuard cmdGuard(cmdSpan); + auto cmdSpan = telemetry::SpanGuard::rpcSpan( + "rpc.command." + command); // Execute RPC command auto result = processRequest(jv); @@ -913,42 +854,42 @@ ServerHandler::onRequest( // Record result attributes if (result.isMember("status")) { - cmdGuard.setAttribute("xrpl.rpc.status", + cmdSpan.setAttribute("xrpl.rpc.status", result["status"].asString()); } if (result["status"].asString() == "error") { - cmdGuard.setStatus( - opentelemetry::trace::StatusCode::kError, + cmdSpan.setStatus( + /* kError */ 2, result.isMember("error_message") ? result["error_message"].asString() : "RPC error"); } else { - cmdGuard.setOk(); + cmdSpan.setOk(); } } auto const duration = std::chrono::steady_clock::now() - startTime; - guard.setAttribute("http.duration_ms", + span.setAttribute("http.duration_ms", std::chrono::duration(duration).count()); // Inject trace context into response headers http_response_type resp; telemetry::TraceContextPropagator::injectToHeaders( - guard.context(), + span.context(), [&resp](std::string_view name, std::string_view value) { resp.set(std::string(name), std::string(value)); }); - guard.setOk(); + span.setOk(); send(std::move(resp)); } catch (std::exception const& e) { - guard.recordException(e); + span.recordException(e); JLOG(journal_.error()) << "RPC request failed: " << e.what(); sendError(send, e.what()); } @@ -959,92 +900,40 @@ ServerHandler::onRequest( > **Architecture note**: `JobQueue` and its inner `Workers` class do not > hold an `Application&` or `ServiceRegistry&`. They receive a -> `perf::PerfLog*` at construction. 
To instrument job execution, a -> `telemetry::Telemetry&` must be threaded into `JobQueue`'s constructor -> alongside the existing `PerfLog&`, or the trace context can be -> captured/restored without starting new spans inside the worker itself. +> `perf::PerfLog*` at construction. Because SpanGuard's factory methods +> access the global `Telemetry` instance directly, no `Telemetry&` +> reference needs to be threaded into `JobQueue`. > > The approach below captures trace context at job-creation time and > restores it when the job executes, so that any spans created _inside_ > the job body automatically become children of the original caller's -> trace. This requires adding a `telemetry::Telemetry&` to `JobQueue`. +> trace. (The simplified snippet below shows only the `job.execute` span; the context capture/restore step is not shown.) ```cpp -// include/xrpl/core/JobQueue.h (modified) +// src/libxrpl/core/detail/JobQueue.cpp (modified -- processTask) -#ifdef XRPL_ENABLE_TELEMETRY -#include -#endif - -class JobQueue : private Workers::Callback -{ - // ... existing members ... - - // Telemetry reference for job execution spans (added alongside - // the existing perf::PerfLog& member). - telemetry::Telemetry& telemetry_; - -#ifdef XRPL_ENABLE_TELEMETRY - // Per-job trace context captured at addJob() time and restored - // on the worker thread when the job runs. - struct JobContext - { - opentelemetry::context::Context traceCtx; - }; -#endif - -public: - JobQueue( - int threadCount, - beast::insight::Collector::ptr const& collector, - beast::Journal journal, - Logs& logs, - perf::PerfLog& perfLog, - telemetry::Telemetry& telemetry); // New parameter - // ... -}; -``` - -```cpp -// src/libxrpl/core/detail/JobQueue.cpp (modified — processTask) +#include void JobQueue::processTask(int instance) { // ... existing job dequeue logic ... -#ifdef XRPL_ENABLE_TELEMETRY - // Restore the trace context that was captured when the job was - // enqueued. Any spans created inside the job body will become - // children of the original caller's trace. 
- auto token = opentelemetry::context::RuntimeContext::Attach( - job.traceContext()); - - // Start an execution span if telemetry is enabled at runtime - std::optional guard; - if (telemetry_.isEnabled()) - { - guard.emplace(telemetry_.startSpan("job.execute")); - guard->setAttribute("xrpl.job.type", to_string(job.type())); - guard->setAttribute("xrpl.job.worker", - static_cast(instance)); - } -#endif + // SpanGuard::span() uses the global Telemetry instance -- + // no Telemetry& member needed on JobQueue. + auto span = telemetry::SpanGuard::span("job.execute"); + span.setAttribute("xrpl.job.type", to_string(job.type())); + span.setAttribute("xrpl.job.worker", + static_cast(instance)); try { job.execute(); -#ifdef XRPL_ENABLE_TELEMETRY - if (guard) - guard->setOk(); -#endif + span.setOk(); } catch (std::exception const& e) { -#ifdef XRPL_ENABLE_TELEMETRY - if (guard) - guard->recordException(e); -#endif + span.recordException(e); JLOG(journal_.error()) << "Job execution failed: " << e.what(); } } diff --git a/OpenTelemetryPlan/POC_taskList.md b/OpenTelemetryPlan/POC_taskList.md index decb9f1738..0a80b4795f 100644 --- a/OpenTelemetryPlan/POC_taskList.md +++ b/OpenTelemetryPlan/POC_taskList.md @@ -12,7 +12,7 @@ | [01-architecture-analysis.md](./01-architecture-analysis.md) | RPC request flow (§1.5), key trace points (§1.6), instrumentation priority (§1.7) | | [02-design-decisions.md](./02-design-decisions.md) | SDK selection (§2.1), exporter config (§2.2), span naming (§2.3), attribute schema (§2.4), coexistence with PerfLog/Insight (§2.6) | | [03-implementation-strategy.md](./03-implementation-strategy.md) | Directory structure (§3.1), key principles (§3.2), performance overhead (§3.3-3.6), conditional compilation (§3.7.3), code intrusiveness (§3.9) | -| [04-code-samples.md](./04-code-samples.md) | Telemetry interface (§4.1), SpanGuard (§4.2), macros (§4.3), RPC instrumentation (§4.5.3) | +| [04-code-samples.md](./04-code-samples.md) | Telemetry interface (§4.1), 
SpanGuard factory methods (§4.2-4.3), RPC instrumentation (§4.5.3) | | [05-configuration-reference.md](./05-configuration-reference.md) | rippled config (§5.1), config parser (§5.2), Application integration (§5.3), CMake (§5.4), Collector config (§5.5), Docker Compose (§5.6), Grafana (§5.8) | | [06-implementation-phases.md](./06-implementation-phases.md) | Phase 1 core tasks (§6.2), Phase 2 RPC tasks (§6.3), quick wins (§6.10), definition of done (§6.11) | | [07-observability-backends.md](./07-observability-backends.md) | Tempo dev setup (§7.1), Grafana dashboards (§7.6), alert rules (§7.6.3) | @@ -147,9 +147,11 @@ - Config parser: `Telemetry::Setup setup_Telemetry(Section const&, std::string const& nodePublicKey, std::string const& version);` - Create `include/xrpl/telemetry/SpanGuard.h`: - - RAII guard that takes an `nostd::shared_ptr`, creates a `Scope`, and calls `span->End()` in destructor. - - Convenience: `setAttribute()`, `setOk()`, `setStatus()`, `addEvent()`, `recordException()`, `context()` - - See [04-code-samples.md](./04-code-samples.md) §4.2 for the full implementation. + - RAII guard with static factory methods (`rpcSpan()`, `txSpan()`, `consensusSpan()`, etc.) that access the global `Telemetry::getInstance()` singleton internally. + - Uses pimpl idiom to hide all OTel types -- the public header has zero `opentelemetry/` includes. + - Convenience instance methods: `setAttribute()`, `setOk()`, `setStatus()`, `addEvent()`, `recordException()`, `context()`, `discard()` + - When `XRPL_ENABLE_TELEMETRY` is not defined, the entire class compiles to a no-op stub. + - See [04-code-samples.md](./04-code-samples.md) §4.2-4.3 for the full API reference. - Create `src/libxrpl/telemetry/NullTelemetry.cpp`: - Implements `Telemetry` with all no-ops. 
@@ -167,7 +169,7 @@ **Reference**: - [04-code-samples.md §4.1](./04-code-samples.md) — Full `Telemetry` interface with `Setup` struct, lifecycle, tracer access, span creation, and component filtering methods -- [04-code-samples.md §4.2](./04-code-samples.md) — Full `SpanGuard` RAII implementation and `NullSpanGuard` no-op class +- [04-code-samples.md §4.2-4.3](./04-code-samples.md) — SpanGuard with factory methods, pimpl design, no-op stub, and discard support - [03-implementation-strategy.md §3.1](./03-implementation-strategy.md) — Directory structure: `include/xrpl/telemetry/` for headers, `src/libxrpl/telemetry/` for implementation - [03-implementation-strategy.md §3.7.3](./03-implementation-strategy.md) — Conditional instrumentation and zero-cost compile-time disabled pattern @@ -287,47 +289,37 @@ --- -## Task 5: Create Instrumentation Macros +## Task 5: Add SpanGuard Factory Methods -**Objective**: Define convenience macros that make instrumenting code one-liners, and that compile to zero-cost no-ops when telemetry is disabled. +**Objective**: Add static factory methods to SpanGuard that provide type-safe, one-liner instrumentation and compile to zero-cost no-ops when telemetry is disabled. This replaces the earlier macro-based approach (`TracingInstrumentation.h` has been removed). 
**What to do**: -- Create `src/xrpld/telemetry/TracingInstrumentation.h`: - - When `XRPL_ENABLE_TELEMETRY` is defined: +- Update `include/xrpl/telemetry/SpanGuard.h`: + - Add static factory methods that access the global `Telemetry::getInstance()` singleton and check the relevant component filter before creating a span: ```cpp - #define XRPL_TRACE_SPAN(telemetry, name) \ - auto _xrpl_span_ = (telemetry).startSpan(name); \ - ::xrpl::telemetry::SpanGuard _xrpl_guard_(_xrpl_span_) - - #define XRPL_TRACE_RPC(telemetry, name) \ - std::optional<::xrpl::telemetry::SpanGuard> _xrpl_guard_; \ - if ((telemetry).shouldTraceRpc()) { \ - _xrpl_guard_.emplace((telemetry).startSpan(name)); \ - } - - #define XRPL_TRACE_SET_ATTR(key, value) \ - if (_xrpl_guard_.has_value()) { \ - _xrpl_guard_->setAttribute(key, value); \ - } - - #define XRPL_TRACE_EXCEPTION(e) \ - if (_xrpl_guard_.has_value()) { \ - _xrpl_guard_->recordException(e); \ - } + // Each factory checks the global Telemetry instance internally. + // No Telemetry& reference needed at the call site. + auto span = telemetry::SpanGuard::rpcSpan("rpc.request"); + span.setAttribute("xrpl.rpc.command", command); + span.setAttribute("xrpl.rpc.status", status); ``` - - When `XRPL_ENABLE_TELEMETRY` is NOT defined, all macros expand to `((void)0)` + - Factory methods: `rpcSpan()`, `txSpan()`, `consensusSpan()`, `peerSpan()`, `ledgerSpan()`, `span()` + - Use the pimpl idiom to hide all OTel types from the public header (zero `opentelemetry/` includes) + - When `XRPL_ENABLE_TELEMETRY` is NOT defined, the entire class compiles to a no-op stub with empty inline method bodies -**Key new file**: +- No separate `TracingInstrumentation.h` file is needed. All instrumentation call sites use `#include <xrpl/telemetry/SpanGuard.h>` directly. 
-- `src/xrpld/telemetry/TracingInstrumentation.h` +**Key modified file**: + +- `include/xrpl/telemetry/SpanGuard.h` **Reference**: -- [04-code-samples.md §4.3](./04-code-samples.md) — Full macro definitions for `XRPL_TRACE_SPAN`, `XRPL_TRACE_RPC`, `XRPL_TRACE_CONSENSUS`, `XRPL_TRACE_SET_ATTR`, `XRPL_TRACE_EXCEPTION` with both enabled and disabled branches -- [03-implementation-strategy.md §3.7.3](./03-implementation-strategy.md) — Conditional instrumentation pattern: compile-time `#ifndef` and runtime `shouldTrace*()` checks +- [04-code-samples.md §4.3](./04-code-samples.md) — SpanGuard API reference: factory methods, usage patterns, compile-time disabled behavior, and discard support +- [03-implementation-strategy.md §3.7.3](./03-implementation-strategy.md) — Conditional instrumentation pattern: factory methods handle compile-time and runtime checks internally - [03-implementation-strategy.md §3.9.7](./03-implementation-strategy.md) — Before/after code examples showing minimal intrusiveness (~1-3 lines per instrumentation point) --- @@ -341,17 +333,17 @@ **What to do**: - Edit `src/xrpld/rpc/detail/ServerHandler.cpp`: - - `#include` the `TracingInstrumentation.h` header + - `#include ` - In `ServerHandler::onRequest(Session& session)`: - - At the top of the method, add: `XRPL_TRACE_RPC(app_.getTelemetry(), "rpc.request");` - - After the RPC command name is extracted, set attribute: `XRPL_TRACE_SET_ATTR("xrpl.rpc.command", command);` - - After the response status is known, set: `XRPL_TRACE_SET_ATTR("http.status_code", static_cast(statusCode));` - - Wrap error paths with: `XRPL_TRACE_EXCEPTION(e);` + - At the top of the method, add: `auto span = telemetry::SpanGuard::rpcSpan("rpc.request");` + - After the RPC command name is extracted, set attribute: `span.setAttribute("xrpl.rpc.command", command);` + - After the response status is known, set: `span.setAttribute("http.status_code", static_cast(statusCode));` + - Wrap error paths with: `span.recordException(e);` - In 
`ServerHandler::processRequest(...)`: - - Add a child span: `XRPL_TRACE_RPC(app_.getTelemetry(), "rpc.process");` - - Set method attribute: `XRPL_TRACE_SET_ATTR("xrpl.rpc.method", request_method);` + - Add a child span: `auto span = telemetry::SpanGuard::rpcSpan("rpc.process");` + - Set method attribute: `span.setAttribute("xrpl.rpc.method", request_method);` - In `ServerHandler::onWSMessage(...)` (WebSocket path): - - Add: `XRPL_TRACE_RPC(app_.getTelemetry(), "rpc.ws.message");` + - Add: `auto span = telemetry::SpanGuard::rpcSpan("rpc.ws.message");` - The goal is to see spans like: ``` @@ -366,7 +358,7 @@ **Reference**: -- [04-code-samples.md §4.5.3](./04-code-samples.md) — Complete `ServerHandler::onRequest()` instrumented code sample with W3C header extraction, span creation, attribute setting, and error handling +- [04-code-samples.md §4.5.3](./04-code-samples.md) — Complete `ServerHandler::onRequest()` instrumented code sample using SpanGuard factory methods - [01-architecture-analysis.md §1.5](./01-architecture-analysis.md) — RPC request flow diagram: HTTP request -> attributes -> jobqueue.enqueue -> rpc.command -> response - [01-architecture-analysis.md §1.6](./01-architecture-analysis.md) — Key trace points table: `rpc.request` in `ServerHandler.cpp::onRequest()` (Priority: High) - [02-design-decisions.md §2.3](./02-design-decisions.md) — Span naming convention: `rpc.request`, `rpc.command.*` @@ -382,15 +374,15 @@ **What to do**: - Edit `src/xrpld/rpc/detail/RPCHandler.cpp`: - - `#include` the `TracingInstrumentation.h` header + - `#include <xrpl/telemetry/SpanGuard.h>` - In `doCommand(RPC::JsonContext& context, Json::Value& result)`: - - At the top: `XRPL_TRACE_RPC(context.app.getTelemetry(), "rpc.command." + context.method);` + - At the top: `auto span = telemetry::SpanGuard::rpcSpan("rpc.command." 
+ context.method);` - Set attributes: - - `XRPL_TRACE_SET_ATTR("xrpl.rpc.command", context.method);` - - `XRPL_TRACE_SET_ATTR("xrpl.rpc.version", static_cast(context.apiVersion));` - - `XRPL_TRACE_SET_ATTR("xrpl.rpc.role", (context.role == Role::ADMIN) ? "admin" : "user");` - - On success: `XRPL_TRACE_SET_ATTR("xrpl.rpc.status", "success");` - - On error: `XRPL_TRACE_SET_ATTR("xrpl.rpc.status", "error");` and set the error message + - `span.setAttribute("xrpl.rpc.command", context.method);` + - `span.setAttribute("xrpl.rpc.version", static_cast(context.apiVersion));` + - `span.setAttribute("xrpl.rpc.role", (context.role == Role::ADMIN) ? "admin" : "user");` + - On success: `span.setAttribute("xrpl.rpc.status", "success");` + - On error: `span.setAttribute("xrpl.rpc.status", "error");` and set the error message - After this, traces in Tempo/Grafana should look like: ``` @@ -553,7 +545,7 @@ | 2 | Core Telemetry interface + NullImpl | 3 | 0 | 1 | | 3 | OTel-backed Telemetry implementation | 2 | 1 | 1, 2 | | 4 | Application lifecycle integration | 0 | 3 | 2, 3 | -| 5 | Instrumentation macros | 1 | 0 | 2 | +| 5 | SpanGuard factory methods | 0 | 1 | 2 | | 6 | Instrument RPC ServerHandler | 0 | 1 | 4, 5 | | 7 | Instrument RPC command execution | 0 | 1 | 4, 5 | | 8 | End-to-end verification | 0 | 0 | 0-7 | @@ -631,6 +623,6 @@ Issues encountered during POC implementation that inform future work: | Conan package only builds OTLP HTTP exporter, not gRPC | Switched from gRPC to HTTP exporter (`localhost:4318/v1/traces`) | HTTP exporter is the default; gRPC requires custom Conan profile | | CMake target `opentelemetry-cpp::api` etc. 
don't exist in Conan package | Use umbrella target `opentelemetry-cpp::opentelemetry-cpp` | Conan targets differ from upstream CMake targets | | OTel Collector `logging` exporter deprecated | Renamed to `debug` exporter | Use `debug` in all collector configs going forward | -| Macro parameter `telemetry` collided with `::xrpl::telemetry::` namespace | Renamed macro params to `_tel_obj_`, `_span_name_` | Avoid common words as macro parameter names | +| Macro parameter `telemetry` collided with `::xrpl::telemetry::` namespace | Replaced macros with SpanGuard factory methods (no macros needed) | Factory methods avoid macro hygiene issues entirely | | `opentelemetry::trace::Scope` creates new context on move | Store scope as member, create once in constructor | SpanGuard move semantics need care with Scope lifecycle | | `TracerProviderFactory::Create` returns `unique_ptr`, not `nostd::shared_ptr` | Use `std::shared_ptr` member, wrap in `nostd::shared_ptr` for global provider | OTel SDK factory return types don't match API provider types | diff --git a/cfg/xrpld-example.cfg b/cfg/xrpld-example.cfg index f623bfb555..e41c2f1daf 100644 --- a/cfg/xrpld-example.cfg +++ b/cfg/xrpld-example.cfg @@ -1552,31 +1552,42 @@ validators.txt # # endpoint=http://localhost:4318/v1/traces # -# The OpenTelemetry Collector endpoint (OTLP/HTTP). Default: http://localhost:4318/v1/traces. +# The OTLP/HTTP exporter endpoint. The server sends trace data as +# protobuf-encoded HTTP POST requests to this URL. +# Default: http://localhost:4318/v1/traces. # # sampling_ratio=1.0 # -# Head-based sampling ratio: the fraction of traces to keep, decided at -# span creation time (before the trace completes). Values in [0.0, 1.0]. +# Head-based sampling ratio using TraceIdRatioBasedSampler. The decision +# to record or drop a trace is made at span creation time, before the +# span starts, based on the trace ID. Values in [0.0, 1.0]. # 1.0 = trace everything, 0.1 = sample ~10% of traces. Default: 1.0. 
+# For tail-based (post-hoc) filtering — where you decide to drop a span +# after inspecting its content — use SpanGuard::discard() in code. # # trace_rpc=1 # -# Enable RPC request tracing. Default: 1. +# Enable tracing for JSON-RPC and WebSocket API request handling — +# command parsing, execution, and response serialization. Default: 1. # # trace_transactions=1 # -# Enable transaction lifecycle tracing. Default: 1. +# Enable tracing for the transaction lifecycle — submission, validation, +# application to ledgers, and final disposition. Default: 1. # # trace_consensus=1 # -# Enable consensus round tracing. Default: 1. +# Enable tracing for the consensus round lifecycle — proposals, +# validations, mode changes, and ledger acceptance. Default: 1. # # trace_peer=0 # -# Enable peer message tracing (high volume). Default: 0. +# Enable tracing for peer-to-peer protocol messages — overlay message +# send/receive, peer handshakes, and routing. High volume; disabled +# by default. Default: 0. # # trace_ledger=1 # -# Enable ledger close/accept tracing. Default: 1. +# Enable tracing for ledger close and accept operations — ledger +# building, state hashing, and write-back to the node store. Default: 1. # diff --git a/cspell.config.yaml b/cspell.config.yaml index f43d6a634e..1708e4bc26 100644 --- a/cspell.config.yaml +++ b/cspell.config.yaml @@ -203,6 +203,7 @@ words: - permdex - perminute - permissioned + - pimpl - pointee - populator - preauth diff --git a/docker/telemetry/docker-compose.yml b/docker/telemetry/docker-compose.yml index ce0f2f3a30..bf74dad9be 100644 --- a/docker/telemetry/docker-compose.yml +++ b/docker/telemetry/docker-compose.yml @@ -67,9 +67,14 @@ services: networks: - xrpld-telemetry +# Named volume for Tempo trace storage (WAL and compacted blocks). +# Data persists across container restarts. 
Remove with: +# docker compose -f docker/telemetry/docker-compose.yml down -v volumes: tempo-data: +# Isolated bridge network so services communicate by container name +# (e.g., the collector reaches Tempo at http://tempo:4317). networks: xrpld-telemetry: driver: bridge diff --git a/include/xrpl/telemetry/SpanGuard.h b/include/xrpl/telemetry/SpanGuard.h index acc422f862..4d0ce632c2 100644 --- a/include/xrpl/telemetry/SpanGuard.h +++ b/include/xrpl/telemetry/SpanGuard.h @@ -2,19 +2,35 @@ /** RAII guard for OpenTelemetry trace spans. - Wraps an OTel Span and Scope together. On construction, the span is - activated on the current thread's context (via Scope). On destruction, - the span is ended and the previous context is restored. + Wraps an OTel Span and Scope behind the pimpl idiom so that no + opentelemetry headers are exposed in this public header. When + XRPL_ENABLE_TELEMETRY is not defined, SpanGuard is an empty class + with all-inline no-op methods — zero overhead, zero dependencies. 
Dependency diagram: - +------------------------------------+ - | SpanGuard | - +------------------------------------+ - | - span_ : shared_ptr | - | - scope_ : Scope | - +------------------------------------+ - | uses + +-----------------------------------------+ + | SpanGuard | + +-----------------------------------------+ + | - impl_ : unique_ptr (pimpl) | + +-----------------------------------------+ + | + rpcSpan(name) : SpanGuard [static] | + | + txSpan(name) : SpanGuard [static] | + | + consensusSpan(name) [static] | + | + peerSpan(name) [static] | + | + ledgerSpan(name) [static] | + | + span(name) [static] | + | + childSpan(name) : SpanGuard | + | + linkedSpan(name) : SpanGuard | + | + captureContext() : SpanContext | + | + setAttribute(key, value) | + | + setOk() / setError(desc) | + | + addEvent(name) | + | + recordException(e) | + | + discard() | + | + operator bool() | + +-----------------------------------------+ + | hides (pimpl) +-------+-------+ | | +--------+ +-------------+ @@ -23,223 +39,393 @@ | | | movable) | +--------+ +-------------+ - Used by the XRPL_TRACE_* macros in TracingInstrumentation.h. Can also - be stored in std::optional for conditional tracing (move-constructible). - - Only compiled when XRPL_ENABLE_TELEMETRY is defined. + Static factory methods access the global Telemetry instance + internally (via Telemetry::getInstance()), check whether tracing + is enabled for the requested subsystem, and return either an + active guard or a null (no-op) guard. Callers never need a + Telemetry reference. Usage examples: - 1. Basic RAII tracing: + 1. Basic RPC tracing (factory method): @code - { - SpanGuard guard(telemetry.startSpan("rpc.command.submit")); - guard.setAttribute("xrpl.rpc.command", "submit"); - // ... 
span is active on this thread's context - } // span ended, previous context restored + auto span = SpanGuard::rpcSpan("rpc.command.submit"); + span.setAttribute("xrpl.rpc.command", "submit"); + span.setAttribute("xrpl.rpc.status", "success"); + // span ended automatically on scope exit @endcode - 2. Conditional tracing with std::optional: + 2. Error recording: @code - std::optional guard; - if (telemetry.isEnabled() && telemetry.shouldTraceRpc()) - guard.emplace(telemetry.startSpan("rpc.request")); - // ... guard may or may not hold a span - @endcode - - 3. Error recording: - @code - SpanGuard guard(telemetry.startSpan("rpc.command.submit")); + auto span = SpanGuard::rpcSpan("rpc.command.submit"); try { - // ... do work - guard.setOk(); + doWork(); + span.setOk(); } catch (std::exception const& e) { - guard.recordException(e); // sets status to error + span.recordException(e); } @endcode - @note Thread safety: A SpanGuard must only be used on the thread where - it was constructed (the Scope binds to the thread-local context stack). - Use context() to propagate the trace to other threads. + 3. Cross-thread context propagation: + @code + // Thread A: create span and capture context + auto span = SpanGuard::consensusSpan("consensus.round"); + auto ctx = span.captureContext(); - @note Limitation: Move assignment is deleted because re-scoping a span - mid-flight would corrupt the context stack. Only move construction is - supported (for std::optional emplacement). + // Thread B: create child with captured context + auto child = SpanGuard::childSpan("consensus.accept", ctx); + @endcode + + 4. Conditional check (rarely needed — methods are no-ops on null): + @code + auto span = SpanGuard::rpcSpan("rpc.request"); + if (span) { + // expensive attribute computation only when active + span.setAttribute("xrpl.rpc.payload_size", computeSize()); + } + @endcode + + 5. 
Tail-based filtering via discard(): + @code + auto span = SpanGuard::txSpan("tx.process"); + auto result = preflight(tx); + if (result != tesSUCCESS) { + span.discard(); // drop span, never exported + return result; + } + @endcode + + @note Thread safety: A SpanGuard must only be used on the thread + where it was constructed (the internal Scope binds to the + thread-local context stack). Use captureContext() to propagate + the trace to other threads. + + @note Move semantics: Move construction transfers ownership of + the pimpl pointer — no double-Scope issues. Move assignment is + deleted to prevent re-scoping mid-flight. + + @note Known limitations: + - Attributes cannot be removed per the OTel spec; use + setAttribute with an empty value as a convention. + - Raw access to the underlying OTel Span (the former non-static + span() accessor) is intentionally not exposed — all interaction + goes through the public methods. The static span(name) factory + is unrelated: it returns a SpanGuard, not the raw Span. */ -#ifdef XRPL_ENABLE_TELEMETRY - -#include - -#include -#include -#include -#include - +#include #include +#include #include namespace xrpl { namespace telemetry { +/** Opaque wrapper for an OTel context snapshot. + + Used to propagate trace context across threads. Created by + SpanGuard::captureContext(), consumed by SpanGuard::childSpan() + or SpanGuard::linkedSpan() with an explicit parent/link context. +*/ +class SpanContext +{ + friend class SpanGuard; + +#ifdef XRPL_ENABLE_TELEMETRY + struct Impl; + std::shared_ptr impl_; + explicit SpanContext(std::shared_ptr impl); +#endif + +public: + SpanContext() = default; + + /** @return true if this context holds a valid trace context. */ + bool + isValid() const; +}; + +// --------------------------------------------------------------------------- +// Real implementation (pimpl, compiled in SpanGuard.cpp) +// --------------------------------------------------------------------------- +#ifdef XRPL_ENABLE_TELEMETRY + /** RAII wrapper that activates a span on construction and ends it on - destruction. 
Non-copyable but move-constructible so it can be held - in std::optional for conditional tracing. + destruction. All OTel types are hidden behind the Impl pointer. + Non-copyable, move-constructible. */ class SpanGuard { - /** The OTel span being guarded. Set to nullptr after move. */ - opentelemetry::nostd::shared_ptr span_; + struct Impl; + std::unique_ptr impl_; - /** Scope that activates span_ on the current thread's context stack. */ - opentelemetry::trace::Scope scope_; + explicit SpanGuard(std::unique_ptr impl); public: - /** Construct a guard that activates @p span on the current context. + /** Construct a null (no-op) guard. All methods are safe to call. */ + SpanGuard(); + ~SpanGuard(); - @param span The span to guard. Ended in the destructor. - */ - explicit SpanGuard(opentelemetry::nostd::shared_ptr span) - : span_(std::move(span)), scope_(span_) - { - } - - /** Non-copyable. Move-constructible to support std::optional. - - The move constructor creates a new Scope from the transferred span, - because Scope is not movable. - */ + SpanGuard(SpanGuard&& other) noexcept; + SpanGuard& + operator=(SpanGuard&&) = delete; SpanGuard(SpanGuard const&) = delete; SpanGuard& operator=(SpanGuard const&) = delete; - SpanGuard(SpanGuard&& other) noexcept : span_(std::move(other.span_)), scope_(span_) - { - other.span_ = nullptr; - } - SpanGuard& - operator=(SpanGuard&&) = delete; - ~SpanGuard() - { - if (span_) - span_->End(); - } + // --- Static factory methods ---------------------------------------- + // Each checks the global Telemetry instance and the corresponding + // shouldTrace*() flag. Returns a null guard if tracing is off. - /** @return A mutable reference to the underlying span. */ - opentelemetry::trace::Span& - span() - { - return *span_; - } + /** Create an unconditional span (always created if telemetry is on). */ + static SpanGuard + span(std::string_view name); - /** @return A const reference to the underlying span. 
*/ - opentelemetry::trace::Span const& - span() const - { - return *span_; - } + /** Create a span guarded by shouldTraceRpc(). */ + static SpanGuard + rpcSpan(std::string_view name); - /** Mark the span status as OK. */ + /** Create a span guarded by shouldTraceTransactions(). */ + static SpanGuard + txSpan(std::string_view name); + + /** Create a span guarded by shouldTraceConsensus(). */ + static SpanGuard + consensusSpan(std::string_view name); + + /** Create a span guarded by shouldTracePeer(). */ + static SpanGuard + peerSpan(std::string_view name); + + /** Create a span guarded by shouldTraceLedger(). */ + static SpanGuard + ledgerSpan(std::string_view name); + + // --- Child / linked span creation ---------------------------------- + + /** Create a child span parented to this guard's active context. + @param name Span name for the child. + @return A new guard, or null if this guard is inactive. + */ + SpanGuard + childSpan(std::string_view name) const; + + /** Create a child span parented to an explicit captured context. + @param name Span name for the child. + @param parentCtx Context captured via captureContext(). + @return A new guard, or null if parentCtx is invalid. + */ + static SpanGuard + childSpan(std::string_view name, SpanContext const& parentCtx); + + /** Create a span linked (follows-from) to this guard's span. + The new span is NOT a child — it starts a new sub-tree but + carries a causal link to this span. + @param name Span name for the linked span. + @return A new guard, or null if this guard is inactive. + */ + SpanGuard + linkedSpan(std::string_view name) const; + + /** Create a span linked to an explicit captured context. + @param name Span name for the linked span. + @param linkCtx Context to link from. + @return A new guard, or null if linkCtx is invalid. 
+ */ + static SpanGuard + linkedSpan(std::string_view name, SpanContext const& linkCtx); + + // --- Context capture ----------------------------------------------- + + /** Snapshot the current thread's OTel context for cross-thread use. + @return An opaque SpanContext, or an invalid one if null guard. + */ + SpanContext + captureContext() const; + + // --- Attribute setters (explicit overloads, no OTel types) --------- + + /** Set a string attribute. No-op on a null guard. */ void - setOk() - { - span_->SetStatus(opentelemetry::trace::StatusCode::kOk); - } + setAttribute(std::string_view key, std::string_view value); - /** Set an explicit status code on the span. + /** Set a string attribute (C-string overload). No-op on a null guard. */ + void + setAttribute(std::string_view key, char const* value); - @param code The OTel status code. - @param description Optional human-readable status description. + /** Set an integer attribute. No-op on a null guard. */ + void + setAttribute(std::string_view key, std::int64_t value); + + /** Set a floating-point attribute. No-op on a null guard. */ + void + setAttribute(std::string_view key, double value); + + /** Set a boolean attribute. No-op on a null guard. */ + void + setAttribute(std::string_view key, bool value); + + // --- Status / events ----------------------------------------------- + + /** Mark the span status as OK. No-op on a null guard. */ + void + setOk(); + + /** Mark the span status as error. No-op on a null guard. + @param description Optional human-readable error description. */ void - setStatus(opentelemetry::trace::StatusCode code, std::string_view description = "") - { - span_->SetStatus(code, std::string(description)); - } - - /** Set a key-value attribute on the span. - - @param key Attribute name (e.g. "xrpl.rpc.command"). - @param value Attribute value (string, int, bool, etc.). 
- */ - template - void - setAttribute(std::string_view key, T&& value) - { - span_->SetAttribute( - opentelemetry::nostd::string_view(key.data(), key.size()), std::forward(value)); - } - - /** Add a named event to the span's timeline. + setError(std::string_view description = ""); + /** Add a named event to the span's timeline. No-op on a null guard. @param name Event name. */ void - addEvent(std::string_view name) - { - span_->AddEvent(std::string(name)); - } + addEvent(std::string_view name); /** Record an exception as a span event following OTel semantic conventions, and mark the span status as error. - + No-op on a null guard. @param e The exception to record. */ void - recordException(std::exception const& e) - { - span_->AddEvent( - "exception", - {{"exception.type", "std::exception"}, {"exception.message", std::string(e.what())}}); - span_->SetStatus(opentelemetry::trace::StatusCode::kError, e.what()); - } - - /** Return the current OTel context. - - Useful for creating child spans on a different thread by passing - this context to Telemetry::startSpan(name, parentContext). - */ - opentelemetry::context::Context - context() const - { - return opentelemetry::context::RuntimeContext::GetCurrent(); - } + recordException(std::exception const& e); /** Mark this span for discard and end it immediately. - - Sets the tl_discardCurrentSpan thread-local flag before calling - End(). The OTel SDK calls FilteringSpanProcessor::OnEnd() - synchronously on the same thread, where the flag is checked and - cleared. The span is dropped before entering the batch export - queue — never sent over the network or stored. - - After calling discard(), the guard is inert — the destructor will - not call End() again. 
- - Typical usage: - @code - SpanGuard guard(telemetry.startSpan("tx.process")); - auto result = preflight(tx); - if (result != tesSUCCESS) - { - guard.discard(); - return result; - } - @endcode + The FilteringSpanProcessor drops the span before it enters the + batch export queue. After discard(), the guard is inert. */ void + discard(); + + /** @return true if this guard holds an active span. */ + explicit + operator bool() const; +}; + +// --------------------------------------------------------------------------- +// No-op stub (all inline, zero overhead, no OTel dependency) +// --------------------------------------------------------------------------- +#else // XRPL_ENABLE_TELEMETRY not defined + +class SpanGuard +{ +public: + SpanGuard() = default; + ~SpanGuard() = default; + SpanGuard(SpanGuard&&) noexcept = default; + SpanGuard& + operator=(SpanGuard&&) = delete; + SpanGuard(SpanGuard const&) = delete; + SpanGuard& + operator=(SpanGuard const&) = delete; + + static SpanGuard + span(std::string_view) + { + return {}; + } + static SpanGuard + rpcSpan(std::string_view) + { + return {}; + } + static SpanGuard + txSpan(std::string_view) + { + return {}; + } + static SpanGuard + consensusSpan(std::string_view) + { + return {}; + } + static SpanGuard + peerSpan(std::string_view) + { + return {}; + } + static SpanGuard + ledgerSpan(std::string_view) + { + return {}; + } + + SpanGuard + childSpan(std::string_view) const + { + return {}; + } + static SpanGuard + childSpan(std::string_view, SpanContext const&) + { + return {}; + } + SpanGuard + linkedSpan(std::string_view) const + { + return {}; + } + static SpanGuard + linkedSpan(std::string_view, SpanContext const&) + { + return {}; + } + + SpanContext + captureContext() const + { + return {}; + } + + void + setAttribute(std::string_view, std::string_view) + { + } + void + setAttribute(std::string_view, char const*) + { + } + void + setAttribute(std::string_view, std::int64_t) + { + } + void + 
setAttribute(std::string_view, double) + { + } + void + setAttribute(std::string_view, bool) + { + } + + void + setOk() + { + } + void + setError(std::string_view = "") + { + } + void + addEvent(std::string_view) + { + } + void + recordException(std::exception const&) + { + } + void discard() { - if (span_) - { - tl_discardCurrentSpan = true; - span_->End(); - span_ = nullptr; - } + } + + explicit + operator bool() const + { + return false; } }; +#endif // XRPL_ENABLE_TELEMETRY + } // namespace telemetry } // namespace xrpl - -#endif // XRPL_ENABLE_TELEMETRY diff --git a/include/xrpl/telemetry/Telemetry.h b/include/xrpl/telemetry/Telemetry.h index 0599e783ae..6e412fba4e 100644 --- a/include/xrpl/telemetry/Telemetry.h +++ b/include/xrpl/telemetry/Telemetry.h @@ -89,7 +89,34 @@ namespace telemetry { class Telemetry { + /** Global singleton pointer, set by start()/stop() in the active + implementation. Allows SpanGuard factory methods to access the + Telemetry instance without callers passing it explicitly. + @see setInstance(), getInstance() + */ + inline static Telemetry* instance_ = nullptr; + public: + /** Get the global Telemetry instance. + @return Pointer to the active instance, or nullptr if not started. + */ + static Telemetry* + getInstance() + { + return instance_; + } + + /** Set the global Telemetry instance. + Called by start()/stop() in concrete implementations. + Tests can call this with a mock to override the global instance. + @param t Pointer to the Telemetry instance, or nullptr to clear. + */ + static void + setInstance(Telemetry* t) + { + instance_ = t; + } + /** Configuration parsed from the [telemetry] section of xrpld.cfg. All fields have sensible defaults so the section can be minimal @@ -119,7 +146,12 @@ public: /** Path to a CA certificate bundle for TLS verification. */ std::string tlsCertPath; - /** Head-based sampling ratio in [0.0, 1.0]. 1.0 = trace everything. */ + /** Head-based sampling ratio in [0.0, 1.0]. 1.0 = trace everything. 
+ This is a head-based (pre-decision) sampler using + TraceIdRatioBasedSampler — the decision to record or drop a + trace is made before the root span starts. For post-hoc + (tail-based) filtering, see SpanGuard::discard(). + */ double samplingRatio = 1.0; /** Maximum number of spans per batch export. */ @@ -224,7 +256,12 @@ public: OpenTelemetry's context propagation. @param name Span name (typically "rpc.command."). - @param kind The span kind (defaults to kInternal). + @param kind The span kind (defaults to kInternal). Possible values: + - kInternal: default, in-process operation + - kServer: incoming synchronous request (e.g. RPC) + - kClient: outgoing synchronous request + - kProducer: async message send (e.g. peer broadcast) + - kConsumer: async message receive @return A shared pointer to the new Span. */ virtual opentelemetry::nostd::shared_ptr diff --git a/src/libxrpl/telemetry/NullTelemetry.cpp b/src/libxrpl/telemetry/NullTelemetry.cpp index 64c8f5e491..6d57f77c69 100644 --- a/src/libxrpl/telemetry/NullTelemetry.cpp +++ b/src/libxrpl/telemetry/NullTelemetry.cpp @@ -39,11 +39,13 @@ public: void start() override { + Telemetry::setInstance(this); } void stop() override { + Telemetry::setInstance(nullptr); } bool diff --git a/src/libxrpl/telemetry/SpanGuard.cpp b/src/libxrpl/telemetry/SpanGuard.cpp new file mode 100644 index 0000000000..bf1cc73d8b --- /dev/null +++ b/src/libxrpl/telemetry/SpanGuard.cpp @@ -0,0 +1,332 @@ +/** Pimpl implementation for SpanGuard and SpanContext. + + All OpenTelemetry SDK types are confined to this translation unit. + The public SpanGuard.h header contains only standard-library types + and forward-declares the Impl struct. + + Static factory methods (rpcSpan, txSpan, etc.) access the global + Telemetry instance via Telemetry::getInstance(), check the relevant + shouldTrace*() flag, and return either an active guard with a real + Span+Scope or a null guard whose methods are all no-ops. 
+ + The Impl struct holds the OTel Span (shared_ptr) and Scope. + Scope is non-movable, but since Impl lives behind a unique_ptr, + SpanGuard's move constructor simply transfers the pointer — no + double-Scope issues. + + @see SpanGuard (SpanGuard.h), Telemetry (Telemetry.h), + FilteringSpanProcessor (Telemetry.cpp) +*/ + +#ifdef XRPL_ENABLE_TELEMETRY + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace xrpl { +namespace telemetry { + +namespace otel_trace = opentelemetry::trace; + +// ===== SpanContext::Impl =================================================== + +struct SpanContext::Impl +{ + opentelemetry::context::Context ctx; + + explicit Impl(opentelemetry::context::Context c) : ctx(std::move(c)) + { + } +}; + +SpanContext::SpanContext(std::shared_ptr impl) : impl_(std::move(impl)) +{ +} + +bool +SpanContext::isValid() const +{ + return impl_ != nullptr; +} + +// ===== SpanGuard::Impl ==================================================== + +struct SpanGuard::Impl +{ + /** The OTel span being guarded. Set to nullptr after discard(). */ + opentelemetry::nostd::shared_ptr span; + + /** Scope that activates span on the current thread's context stack. 
*/ + otel_trace::Scope scope; + + explicit Impl(opentelemetry::nostd::shared_ptr<otel_trace::Span> s) + : span(std::move(s)), scope(span) + { + } + + ~Impl() + { + if (span) + span->End(); + } + + Impl(Impl const&) = delete; + Impl& + operator=(Impl const&) = delete; + Impl(Impl&&) = delete; + Impl& + operator=(Impl&&) = delete; +}; + +// ===== SpanGuard core lifecycle ============================================ + +SpanGuard::SpanGuard() = default; +SpanGuard::~SpanGuard() = default; +SpanGuard::SpanGuard(SpanGuard&&) noexcept = default; + +SpanGuard::SpanGuard(std::unique_ptr<Impl> impl) : impl_(std::move(impl)) +{ +} + +SpanGuard:: +operator bool() const +{ + return impl_ != nullptr; +} + +// ===== Static factory methods ============================================== + +SpanGuard +SpanGuard::span(std::string_view name) +{ + auto* tel = Telemetry::getInstance(); + if (!tel || !tel->isEnabled()) + return {}; + return SpanGuard(std::make_unique<Impl>(tel->startSpan(name))); +} + +SpanGuard +SpanGuard::rpcSpan(std::string_view name) +{ + auto* tel = Telemetry::getInstance(); + if (!tel || !tel->isEnabled() || !tel->shouldTraceRpc()) + return {}; + return SpanGuard(std::make_unique<Impl>(tel->startSpan(name))); +} + +SpanGuard +SpanGuard::txSpan(std::string_view name) +{ + auto* tel = Telemetry::getInstance(); + if (!tel || !tel->isEnabled() || !tel->shouldTraceTransactions()) + return {}; + return SpanGuard(std::make_unique<Impl>(tel->startSpan(name))); +} + +SpanGuard +SpanGuard::consensusSpan(std::string_view name) +{ + auto* tel = Telemetry::getInstance(); + if (!tel || !tel->isEnabled() || !tel->shouldTraceConsensus()) + return {}; + return SpanGuard(std::make_unique<Impl>(tel->startSpan(name))); +} + +SpanGuard +SpanGuard::peerSpan(std::string_view name) +{ + auto* tel = Telemetry::getInstance(); + if (!tel || !tel->isEnabled() || !tel->shouldTracePeer()) + return {}; + return SpanGuard(std::make_unique<Impl>(tel->startSpan(name))); +} + +SpanGuard +SpanGuard::ledgerSpan(std::string_view name) +{ + auto* tel = 
Telemetry::getInstance(); + if (!tel || !tel->isEnabled() || !tel->shouldTraceLedger()) + return {}; + return SpanGuard(std::make_unique<Impl>(tel->startSpan(name))); +} + +// ===== Child / linked span creation ======================================== + +SpanGuard +SpanGuard::childSpan(std::string_view name) const +{ + if (!impl_) + return {}; + auto* tel = Telemetry::getInstance(); + if (!tel || !tel->isEnabled()) + return {}; + auto ctx = opentelemetry::context::RuntimeContext::GetCurrent(); + return SpanGuard(std::make_unique<Impl>(tel->startSpan(name, ctx))); +} + +SpanGuard +SpanGuard::childSpan(std::string_view name, SpanContext const& parentCtx) +{ + if (!parentCtx.isValid()) + return {}; + auto* tel = Telemetry::getInstance(); + if (!tel || !tel->isEnabled()) + return {}; + return SpanGuard(std::make_unique<Impl>(tel->startSpan(name, parentCtx.impl_->ctx))); +} + +SpanGuard +SpanGuard::linkedSpan(std::string_view name) const +{ + if (!impl_) + return {}; + auto* tel = Telemetry::getInstance(); + if (!tel || !tel->isEnabled()) + return {}; + + auto tracer = tel->getTracer("xrpld"); + auto spanCtx = impl_->span->GetContext(); + + otel_trace::StartSpanOptions opts; + return SpanGuard( + std::make_unique<Impl>(tracer->StartSpan( + std::string(name), {}, {{spanCtx, {{"xrpl.link.type", "follows_from"}}}}, opts))); +} + +SpanGuard +SpanGuard::linkedSpan(std::string_view name, SpanContext const& linkCtx) +{ + if (!linkCtx.isValid()) + return {}; + auto* tel = Telemetry::getInstance(); + if (!tel || !tel->isEnabled()) + return {}; + + auto tracer = tel->getTracer("xrpld"); + + // Extract the span from the captured context to get its SpanContext. 
+ auto parentSpan = otel_trace::GetSpan(linkCtx.impl_->ctx); + if (!parentSpan || !parentSpan->GetContext().IsValid()) + return {}; + + otel_trace::StartSpanOptions opts; + return SpanGuard( + std::make_unique<Impl>(tracer->StartSpan( + std::string(name), + {}, + {{parentSpan->GetContext(), {{"xrpl.link.type", "follows_from"}}}}, + opts))); +} + +// ===== Context capture ===================================================== + +SpanContext +SpanGuard::captureContext() const +{ + if (!impl_) + return {}; + auto ctx = opentelemetry::context::RuntimeContext::GetCurrent(); + return SpanContext(std::make_shared<SpanContext::Impl>(ctx)); +} + +// ===== Attribute setters =================================================== + +void +SpanGuard::setAttribute(std::string_view key, std::string_view value) +{ + if (impl_) + impl_->span->SetAttribute( + opentelemetry::nostd::string_view(key.data(), key.size()), + opentelemetry::nostd::string_view(value.data(), value.size())); +} + +void +SpanGuard::setAttribute(std::string_view key, char const* value) +{ + setAttribute(key, std::string_view(value != nullptr ? value : "")); // null is recorded as "" (string_view from a null pointer is UB) +} + +void +SpanGuard::setAttribute(std::string_view key, std::int64_t value) +{ + if (impl_) + impl_->span->SetAttribute(opentelemetry::nostd::string_view(key.data(), key.size()), value); +} + +void +SpanGuard::setAttribute(std::string_view key, double value) +{ + if (impl_) + impl_->span->SetAttribute(opentelemetry::nostd::string_view(key.data(), key.size()), value); +} + +void +SpanGuard::setAttribute(std::string_view key, bool value) +{ + if (impl_) + impl_->span->SetAttribute(opentelemetry::nostd::string_view(key.data(), key.size()), value); +} + +// ===== Status / events ===================================================== + +void +SpanGuard::setOk() +{ + if (impl_) + impl_->span->SetStatus(otel_trace::StatusCode::kOk); +} + +void +SpanGuard::setError(std::string_view description) +{ + if (impl_) + impl_->span->SetStatus(otel_trace::StatusCode::kError, std::string(description)); +} + +void 
+SpanGuard::addEvent(std::string_view name) +{ + if (impl_) + impl_->span->AddEvent(std::string(name)); +} + +void +SpanGuard::recordException(std::exception const& e) +{ + if (!impl_) + return; + impl_->span->AddEvent( + "exception", + {{"exception.type", "std::exception"}, {"exception.message", std::string(e.what())}}); + impl_->span->SetStatus(otel_trace::StatusCode::kError, e.what()); +} + +void +SpanGuard::discard() +{ + if (impl_) + { + tl_discardCurrentSpan = true; + impl_->span->End(); + impl_->span = nullptr; // prevent ~Impl from calling End() again + impl_.reset(); + } +} + +} // namespace telemetry +} // namespace xrpl + +#endif // XRPL_ENABLE_TELEMETRY diff --git a/src/libxrpl/telemetry/Telemetry.cpp b/src/libxrpl/telemetry/Telemetry.cpp index 071d4d2c01..6a7a42de8d 100644 --- a/src/libxrpl/telemetry/Telemetry.cpp +++ b/src/libxrpl/telemetry/Telemetry.cpp @@ -146,11 +146,13 @@ public: void start() override { + Telemetry::setInstance(this); } void stop() override { + Telemetry::setInstance(nullptr); } bool @@ -292,6 +294,10 @@ public: trace_api::Provider::SetTracerProvider( opentelemetry::nostd::shared_ptr(sdkProvider_)); + // Register as the global Telemetry instance so SpanGuard factory + // methods can access it without callers passing a reference. + Telemetry::setInstance(this); + JLOG(journal_.info()) << "Telemetry started successfully"; } @@ -299,10 +305,15 @@ public: stop() override { JLOG(journal_.info()) << "Telemetry stopping"; + + // Unregister global instance before tearing down the pipeline. + Telemetry::setInstance(nullptr); + if (sdkProvider_) { - // Force flush before shutdown - sdkProvider_->ForceFlush(); + // Force flush with timeout to avoid blocking indefinitely + // when the OTLP endpoint is unreachable. + sdkProvider_->ForceFlush(std::chrono::milliseconds(5000)); sdkProvider_.reset(); trace_api::Provider::SetTracerProvider( opentelemetry::nostd::shared_ptr(