diff --git a/docs/build/telemetry.md b/docs/build/telemetry.md index 1e6e715353..5aa2549327 100644 --- a/docs/build/telemetry.md +++ b/docs/build/telemetry.md @@ -36,7 +36,8 @@ such as Grafana Tempo. Telemetry is **off by default** at both compile time and runtime: - **Compile time**: The Conan option `telemetry` and CMake option `telemetry` must be set to `True`/`ON`. - When disabled, all tracing macros compile to `((void)0)` with zero overhead. + When disabled, all `SpanGuard` calls compile to inline no-ops (defined in `SpanGuard.h`) + with zero overhead — no OTel SDK dependency required. - **Runtime**: The `[telemetry]` config section must set `enabled=1`. When disabled at runtime, a no-op implementation is used. diff --git a/include/xrpl/protocol/detail/features.macro b/include/xrpl/protocol/detail/features.macro index bad43dd6ed..494b3fa6cd 100644 --- a/include/xrpl/protocol/detail/features.macro +++ b/include/xrpl/protocol/detail/features.macro @@ -15,7 +15,6 @@ // Add new amendments to the top of this list. // Keep it sorted in reverse chronological order. -XRPL_FIX (Cleanup3_2_0, Supported::no, VoteBehavior::DefaultNo) XRPL_FEATURE(MPTokensV2, Supported::no, VoteBehavior::DefaultNo) XRPL_FIX (Security3_1_3, Supported::no, VoteBehavior::DefaultNo) XRPL_FIX (PermissionedDomainInvariant, Supported::yes, VoteBehavior::DefaultNo) diff --git a/include/xrpl/telemetry/SpanNames.h b/include/xrpl/telemetry/SpanNames.h index 685e88db6a..8ea05fe2e0 100644 --- a/include/xrpl/telemetry/SpanNames.h +++ b/include/xrpl/telemetry/SpanNames.h @@ -20,7 +20,13 @@ * - Per-span attribute keys: bare field name (span name carries the domain). * - Collision qualifier: _ when bare name collides across * domains or with OTel reserved `status` (e.g. rpc_status, grpc_status). - * - Resource attribute keys: xrpl.. (process-identity). + * - Shared cross-span attributes: _ (underscore) form + * (e.g. tx_hash, peer_id, ledger_seq, consensus_round). + * - Resource attribute keys: xrpl.. 
(dotted) form is + * RESERVED for process-identity attributes set once at startup on the + * OTel resource (e.g. xrpl.network.id, xrpl.network.type). Do not use + * this form for span attributes — it parses awkwardly in TraceQL and + * blurs the resource/span scope distinction. * - Span prefixes: [.]. */ diff --git a/include/xrpl/telemetry/Telemetry.h b/include/xrpl/telemetry/Telemetry.h index 090ba602ed..0cb217bd29 100644 --- a/include/xrpl/telemetry/Telemetry.h +++ b/include/xrpl/telemetry/Telemetry.h @@ -3,13 +3,15 @@ /** Abstract interface for OpenTelemetry distributed tracing. Provides the Telemetry base class that all components use to create trace - spans. Two concrete implementations exist, selected at construction time + spans. Three concrete implementations exist, selected at construction time by make_Telemetry(): - TelemetryImpl (Telemetry.cpp): real OTel SDK integration, compiled only when XRPL_ENABLE_TELEMETRY is defined and enabled at runtime. - NullTelemetry (NullTelemetry.cpp): no-op stub used when telemetry is disabled at compile time or runtime. + - NullTelemetryOtel (Telemetry.cpp): no-op stub that still depends on + the OTel API (used during transition or for testing). Inheritance / dependency diagram: @@ -35,32 +37,44 @@ Usage examples: - 1. Check before tracing (typical guard pattern): + 1. Root span at a subsystem entry point (typical usage): @code - auto& telemetry = registry.getTelemetry(); - if (telemetry.isEnabled() && telemetry.shouldTraceRpc()) + #include + using namespace xrpl::telemetry; + + // In an RPC handler dispatch: + auto guard = SpanGuard::span( + TraceCategory::Rpc, rpc_span::prefix::command, commandName); + guard.setAttribute(rpc_span::attr::command, commandName); + // ... process request + // guard destructor automatically ends the span on scope exit + @endcode + + 2. 
Child span for a sub-operation (scoped child): + @code + auto parent = SpanGuard::span(TraceCategory::Transactions, "tx", "process"); { - auto span = telemetry.startSpan("rpc.command.server_info"); - // ... do work, span ends when shared_ptr refcount drops to 0 + auto child = parent.childSpan("tx.apply"); + child.setAttribute("tx_type", txType); + // child ends here } @endcode - 2. RAII tracing with SpanGuard (preferred): + 3. Unrelated span (cross-scope, same thread): @code - if (telemetry.isEnabled() && telemetry.shouldTraceRpc()) - { - SpanGuard guard(telemetry.startSpan("rpc.command.submit")); - guard.setAttribute("command", "submit"); - // ... guard ends span automatically on scope exit - } + // Transactions and RPC can be active simultaneously + auto txSpan = SpanGuard::span(TraceCategory::Transactions, "tx", "process"); + auto rpcSpan = SpanGuard::span(TraceCategory::Rpc, "rpc", "info"); + // both spans end on scope exit @endcode - 3. Cross-thread context propagation: + 4. Cross-thread context propagation: @code - // On thread A: capture context - auto ctx = guard.context(); - // On thread B: create child span with explicit parent - auto child = telemetry.startSpan("async.work", ctx); + // Thread A: capture the active context while span is in scope + auto ctx = parentGuard.captureContext(); + + // Thread B: create child span with explicit parent + auto child = SpanGuard::childSpan("async.work", ctx); @endcode @note Thread safety: The Telemetry interface is safe for concurrent reads diff --git a/src/libxrpl/telemetry/SpanGuard.cpp b/src/libxrpl/telemetry/SpanGuard.cpp index 3e16d12aa5..6dceaafa1d 100644 --- a/src/libxrpl/telemetry/SpanGuard.cpp +++ b/src/libxrpl/telemetry/SpanGuard.cpp @@ -36,6 +36,7 @@ #include #include +#include #include namespace xrpl { @@ -332,7 +333,7 @@ SpanGuard::recordException(std::exception const& e) return; impl_->span->AddEvent( "exception", - {{"exception.type", "std::exception"}, {"exception.message", std::string(e.what())}}); + 
{{"exception.type", typeid(e).name()}, {"exception.message", std::string(e.what())}}); impl_->span->SetStatus(otel_trace::StatusCode::kError, e.what()); } diff --git a/src/xrpld/app/main/GRPCServer.cpp b/src/xrpld/app/main/GRPCServer.cpp index eeb4797b19..d0019b79f9 100644 --- a/src/xrpld/app/main/GRPCServer.cpp +++ b/src/xrpld/app/main/GRPCServer.cpp @@ -169,8 +169,7 @@ void GRPCServerImpl::CallData::process(std::shared_ptr coro) { using namespace telemetry; - auto span = - SpanGuard::span(TraceCategory::Rpc, grpc_span::prefix::grpc, grpc_span::op::request); + auto span = SpanGuard::span(TraceCategory::Rpc, grpc_span::prefix::grpc, name_); span.setAttribute(grpc_span::attr::method, name_); try @@ -179,6 +178,7 @@ GRPCServerImpl::CallData::process(std::shared_ptr::process(std::shared_ptr` (see GRPCServer.cpp). inline constexpr auto grpc = makeStr("grpc"); } // namespace prefix -// ===== Span operation suffixes ============================================= - -namespace op { -inline constexpr auto request = makeStr("request"); -} // namespace op - // ===== Attribute keys ====================================================== namespace attr { @@ -51,6 +49,8 @@ inline constexpr auto grpcStatus = makeStr("grpc_status"); namespace val { using telemetry::attr_val::error; using telemetry::attr_val::success; +inline constexpr auto admin = makeStr("admin"); +inline constexpr auto user = makeStr("user"); inline constexpr auto resourceExhausted = makeStr("resource_exhausted"); inline constexpr auto failedPrecondition = makeStr("failed_precondition"); } // namespace val diff --git a/src/xrpld/rpc/detail/RPCHandler.cpp b/src/xrpld/rpc/detail/RPCHandler.cpp index c19eefec37..1eca97cd52 100644 --- a/src/xrpld/rpc/detail/RPCHandler.cpp +++ b/src/xrpld/rpc/detail/RPCHandler.cpp @@ -185,7 +185,12 @@ callMethod(JsonContext& context, Method method, std::string const& name, Object& JLOG(context.j.debug()) << "RPC call " << name << " completed in " << ((end - start).count() / 
1000000000.0) << "seconds"; perfLog.rpcFinish(name, curId); - span.setAttribute(rpc_span::attr::rpcStatus, rpc_span::val::success); + // Status::operator bool() returns true when there IS an error + // (code_ != OK), so the ternary correctly maps error->error, ok->success. + span.setAttribute( + rpc_span::attr::rpcStatus, ret ? rpc_span::val::error : rpc_span::val::success); + if (!ret) + span.setOk(); return ret; } catch (std::exception& e) @@ -224,8 +229,11 @@ doCommand(RPC::JsonContext& context, Json::Value& result) { cmdName = "unknown"; } - auto span = SpanGuard::span( - TraceCategory::Rpc, rpc_span::prefix::command, rpc_span::val::unknownCommand); + // Use the resolved command name as the span suffix so dashboards + // can break out per-command error rates (e.g. rpc.command.submit + // for a submit that hit rpcTOO_BUSY). Fall back to a single + // "unknown" name only when the request truly omits both fields. + auto span = SpanGuard::span(TraceCategory::Rpc, rpc_span::prefix::command, cmdName); span.setAttribute(rpc_span::attr::command, cmdName.c_str()); span.setError(get_error_info(error).token.c_str()); diff --git a/src/xrpld/rpc/detail/RpcSpanNames.h b/src/xrpld/rpc/detail/RpcSpanNames.h index 2be5aa195c..8e2caa633d 100644 --- a/src/xrpld/rpc/detail/RpcSpanNames.h +++ b/src/xrpld/rpc/detail/RpcSpanNames.h @@ -86,7 +86,7 @@ * gRPC path (see GrpcSpanNames.h for constants): * * +-------------------------------------------------------+ - * | grpc.request | + * | grpc. (e.g. 
grpc.GetLedger) | * | CallData::process(coro) | * | attrs: method, grpc_status | * +-------------------------------------------------------+ diff --git a/src/xrpld/rpc/detail/ServerHandler.cpp b/src/xrpld/rpc/detail/ServerHandler.cpp index a181ec56cd..4f1918076c 100644 --- a/src/xrpld/rpc/detail/ServerHandler.cpp +++ b/src/xrpld/rpc/detail/ServerHandler.cpp @@ -564,6 +564,7 @@ ServerHandler::processSession( jr[jss::api_version] = jv[jss::api_version]; jr[jss::type] = jss::response; + span.setOk(); return jr; } @@ -598,6 +599,7 @@ ServerHandler::processSession( { session->close(true); } + span.setOk(); } static Json::Value @@ -1036,6 +1038,7 @@ ServerHandler::processRequest( } } + span.setOk(); HTTPReply(httpStatus, response, output, rpcJ); }