Merge branch 'pratik/otel-phase1c-rpc-integration' into pratik/otel-phase2-rpc-tracing

This commit is contained in:
Pratik Mankawde
2026-05-27 18:27:11 +01:00
10 changed files with 75 additions and 36 deletions

View File

@@ -36,7 +36,8 @@ such as Grafana Tempo.
Telemetry is **off by default** at both compile time and runtime:
- **Compile time**: The Conan option `telemetry` and CMake option `telemetry` must be set to `True`/`ON`.
When disabled, all tracing macros compile to `((void)0)` with zero overhead.
When disabled, all `SpanGuard` calls compile to inline no-ops (defined in `SpanGuard.h`)
with zero overhead — no OTel SDK dependency required.
- **Runtime**: The `[telemetry]` config section must set `enabled=1`.
When disabled at runtime, a no-op implementation is used.

View File

@@ -15,7 +15,6 @@
// Add new amendments to the top of this list.
// Keep it sorted in reverse chronological order.
XRPL_FIX (Cleanup3_2_0, Supported::no, VoteBehavior::DefaultNo)
XRPL_FEATURE(MPTokensV2, Supported::no, VoteBehavior::DefaultNo)
XRPL_FIX (Security3_1_3, Supported::no, VoteBehavior::DefaultNo)
XRPL_FIX (PermissionedDomainInvariant, Supported::yes, VoteBehavior::DefaultNo)

View File

@@ -20,7 +20,13 @@
* - Per-span attribute keys: bare field name (span name carries the domain).
* - Collision qualifier: <domain>_<field> when bare name collides across
* domains or with OTel reserved `status` (e.g. rpc_status, grpc_status).
* - Resource attribute keys: xrpl.<subsystem>.<field> (process-identity).
* - Shared cross-span attributes: <domain>_<field> (underscore) form
* (e.g. tx_hash, peer_id, ledger_seq, consensus_round).
* - Resource attribute keys: xrpl.<subsystem>.<field> (dotted) form is
* RESERVED for process-identity attributes set once at startup on the
* OTel resource (e.g. xrpl.network.id, xrpl.network.type). Do not use
* this form for span attributes — it parses awkwardly in TraceQL and
* blurs the resource/span scope distinction.
* - Span prefixes: <subsystem>[.<component>].
*/

View File

@@ -3,13 +3,15 @@
/** Abstract interface for OpenTelemetry distributed tracing.
Provides the Telemetry base class that all components use to create trace
spans. Two concrete implementations exist, selected at construction time
spans. Three concrete implementations exist, selected at construction time
by make_Telemetry():
- TelemetryImpl (Telemetry.cpp): real OTel SDK integration, compiled
only when XRPL_ENABLE_TELEMETRY is defined and enabled at runtime.
- NullTelemetry (NullTelemetry.cpp): no-op stub used when telemetry is
disabled at compile time or runtime.
- NullTelemetryOtel (Telemetry.cpp): no-op stub that still depends on
the OTel API (used during transition or for testing).
Inheritance / dependency diagram:
@@ -35,32 +37,44 @@
Usage examples:
1. Check before tracing (typical guard pattern):
1. Root span at a subsystem entry point (typical usage):
@code
auto& telemetry = registry.getTelemetry();
if (telemetry.isEnabled() && telemetry.shouldTraceRpc())
#include <xrpld/rpc/detail/RpcSpanNames.h>
using namespace xrpl::telemetry;
// In an RPC handler dispatch:
auto guard = SpanGuard::span(
TraceCategory::Rpc, rpc_span::prefix::command, commandName);
guard.setAttribute(rpc_span::attr::command, commandName);
// ... process request
// guard destructor automatically ends the span on scope exit
@endcode
2. Child span for a sub-operation (scoped child):
@code
auto parent = SpanGuard::span(TraceCategory::Transactions, "tx", "process");
{
auto span = telemetry.startSpan("rpc.command.server_info");
// ... do work, span ends when shared_ptr refcount drops to 0
auto child = parent.childSpan("tx.apply");
child.setAttribute("tx_type", txType);
// child ends here
}
@endcode
2. RAII tracing with SpanGuard (preferred):
3. Unrelated span (cross-scope, same thread):
@code
if (telemetry.isEnabled() && telemetry.shouldTraceRpc())
{
SpanGuard guard(telemetry.startSpan("rpc.command.submit"));
guard.setAttribute("command", "submit");
// ... guard ends span automatically on scope exit
}
// Transactions and RPC can be active simultaneously
auto txSpan = SpanGuard::span(TraceCategory::Transactions, "tx", "process");
auto rpcSpan = SpanGuard::span(TraceCategory::Rpc, "rpc", "info");
// both spans end on scope exit
@endcode
3. Cross-thread context propagation:
4. Cross-thread context propagation:
@code
// On thread A: capture context
auto ctx = guard.context();
// On thread B: create child span with explicit parent
auto child = telemetry.startSpan("async.work", ctx);
// Thread A: capture the active context while span is in scope
auto ctx = parentGuard.captureContext();
// Thread B: create child span with explicit parent
auto child = SpanGuard::childSpan("async.work", ctx);
@endcode
@note Thread safety: The Telemetry interface is safe for concurrent reads

View File

@@ -36,6 +36,7 @@
#include <opentelemetry/trace/tracer.h>
#include <string>
#include <typeinfo>
#include <utility>
namespace xrpl {
@@ -332,7 +333,7 @@ SpanGuard::recordException(std::exception const& e)
return;
impl_->span->AddEvent(
"exception",
{{"exception.type", "std::exception"}, {"exception.message", std::string(e.what())}});
{{"exception.type", typeid(e).name()}, {"exception.message", std::string(e.what())}});
impl_->span->SetStatus(otel_trace::StatusCode::kError, e.what());
}

View File

@@ -169,8 +169,7 @@ void
GRPCServerImpl::CallData<Request, Response>::process(std::shared_ptr<JobQueue::Coro> coro)
{
using namespace telemetry;
auto span =
SpanGuard::span(TraceCategory::Rpc, grpc_span::prefix::grpc, grpc_span::op::request);
auto span = SpanGuard::span(TraceCategory::Rpc, grpc_span::prefix::grpc, name_);
span.setAttribute(grpc_span::attr::method, name_);
try
@@ -179,6 +178,7 @@ GRPCServerImpl::CallData<Request, Response>::process(std::shared_ptr<JobQueue::C
bool const isUnlimited = clientIsUnlimited();
if (!isUnlimited && usage.disconnect(app_.getJournal("gRPCServer")))
{
span.setAttribute(grpc_span::attr::grpcStatus, grpc_span::val::error);
span.setError(grpc_span::val::resourceExhausted);
grpc::Status const status{
grpc::StatusCode::RESOURCE_EXHAUSTED, "usage balance exceeds threshold"};
@@ -190,6 +190,10 @@ GRPCServerImpl::CallData<Request, Response>::process(std::shared_ptr<JobQueue::C
usage.charge(loadType);
auto role = getRole(isUnlimited);
span.setAttribute(
grpc_span::attr::grpcRole,
role == Role::ADMIN ? grpc_span::val::admin : grpc_span::val::user);
{
std::stringstream toLog;
toLog << "role = " << (int)role;
@@ -225,6 +229,7 @@ GRPCServerImpl::CallData<Request, Response>::process(std::shared_ptr<JobQueue::C
if (conditionMetRes != rpcSUCCESS)
{
RPC::ErrorInfo const errorInfo = RPC::get_error_info(conditionMetRes);
span.setAttribute(grpc_span::attr::grpcStatus, grpc_span::val::error);
span.setError(errorInfo.token.c_str());
grpc::Status const status{
grpc::StatusCode::FAILED_PRECONDITION, errorInfo.message.c_str()};
@@ -234,6 +239,7 @@ GRPCServerImpl::CallData<Request, Response>::process(std::shared_ptr<JobQueue::C
{
std::pair<Response, grpc::Status> result = handler_(context);
setIsUnlimited(result.first, isUnlimited);
span.setAttribute(grpc_span::attr::grpcStatus, grpc_span::val::success);
span.setOk();
responder_.Finish(result.first, result.second, this);
}
@@ -241,6 +247,7 @@ GRPCServerImpl::CallData<Request, Response>::process(std::shared_ptr<JobQueue::C
}
catch (std::exception const& ex)
{
span.setAttribute(grpc_span::attr::grpcStatus, grpc_span::val::error);
span.recordException(ex);
grpc::Status const status{grpc::StatusCode::INTERNAL, ex.what()};
responder_.FinishWithError(status, this);

View File

@@ -9,13 +9,16 @@
* Span hierarchy:
*
* +-------------------------------------------------------+
* | grpc.request |
* | grpc.<MethodName> (e.g. grpc.GetLedger) |
* | CallData::process(coro) |
* | attrs: method, grpc_role, grpc_status |
* +-------------------------------------------------------+
*
* Unlike the HTTP/WS RPC path, gRPC has a flat single-span structure
* per request since each CallData handles exactly one RPC method.
* The method name is embedded in the span name (rather than only as
* an attribute) so dashboards can break out per-method latency and
* error rates without needing TraceQL attribute filters.
*/
#include <xrpl/telemetry/SpanNames.h>
@@ -25,16 +28,11 @@ namespace xrpl::telemetry::grpc_span {
// ===== Span prefixes =======================================================
namespace prefix {
/// "grpc" — root prefix for gRPC transport spans.
/// "grpc" — root prefix for gRPC transport spans. The full span name is
/// formed at the call site as `grpc.<MethodName>` (see GRPCServer.cpp).
inline constexpr auto grpc = makeStr("grpc");
} // namespace prefix
// ===== Span operation suffixes =============================================
namespace op {
inline constexpr auto request = makeStr("request");
} // namespace op
// ===== Attribute keys ======================================================
namespace attr {
@@ -51,6 +49,8 @@ inline constexpr auto grpcStatus = makeStr("grpc_status");
namespace val {
// Values written to gRPC span attributes and error statuses.
// error / success are re-exported from telemetry::attr_val; call sites
// use them as the value of the grpcStatus attribute.
using telemetry::attr_val::error;
using telemetry::attr_val::success;
// Values for the grpcRole attribute: `admin` when the caller's role is
// Role::ADMIN, `user` for every other role.
inline constexpr auto admin = makeStr("admin");
inline constexpr auto user = makeStr("user");
// Error descriptions for span.setError(). resourceExhausted is used when the
// client's usage balance forces a disconnect.
// NOTE(review): failedPrecondition presumably pairs with the
// FAILED_PRECONDITION gRPC status — confirm it is still referenced.
inline constexpr auto resourceExhausted = makeStr("resource_exhausted");
inline constexpr auto failedPrecondition = makeStr("failed_precondition");
} // namespace val

View File

@@ -185,7 +185,12 @@ callMethod(JsonContext& context, Method method, std::string const& name, Object&
JLOG(context.j.debug()) << "RPC call " << name << " completed in "
<< ((end - start).count() / 1000000000.0) << "seconds";
perfLog.rpcFinish(name, curId);
span.setAttribute(rpc_span::attr::rpcStatus, rpc_span::val::success);
// Status::operator bool() returns true when there IS an error
// (code_ != OK), so the ternary correctly maps error->error, ok->success.
span.setAttribute(
rpc_span::attr::rpcStatus, ret ? rpc_span::val::error : rpc_span::val::success);
if (!ret)
span.setOk();
return ret;
}
catch (std::exception& e)
@@ -224,8 +229,11 @@ doCommand(RPC::JsonContext& context, Json::Value& result)
{
cmdName = "unknown";
}
auto span = SpanGuard::span(
TraceCategory::Rpc, rpc_span::prefix::command, rpc_span::val::unknownCommand);
// Use the resolved command name as the span suffix so dashboards
// can break out per-command error rates (e.g. rpc.command.submit
// for a submit that hit rpcTOO_BUSY). The span falls back to the single
// "unknown" name only when the request truly omits both fields.
auto span = SpanGuard::span(TraceCategory::Rpc, rpc_span::prefix::command, cmdName);
span.setAttribute(rpc_span::attr::command, cmdName.c_str());
span.setError(get_error_info(error).token.c_str());

View File

@@ -86,7 +86,7 @@
* gRPC path (see GrpcSpanNames.h for constants):
*
* +-------------------------------------------------------+
* | grpc.request |
* | grpc.<MethodName> (e.g. grpc.GetLedger) |
* | CallData::process(coro) |
* | attrs: method, grpc_role, grpc_status |
* +-------------------------------------------------------+

View File

@@ -564,6 +564,7 @@ ServerHandler::processSession(
jr[jss::api_version] = jv[jss::api_version];
jr[jss::type] = jss::response;
span.setOk();
return jr;
}
@@ -598,6 +599,7 @@ ServerHandler::processSession(
{
session->close(true);
}
span.setOk();
}
static Json::Value
@@ -1036,6 +1038,7 @@ ServerHandler::processRequest(
}
}
span.setOk();
HTTPReply(httpStatus, response, output, rpcJ);
}