mirror of
https://github.com/XRPLF/rippled.git
synced 2026-06-02 16:26:48 +00:00
Merge branch 'pratik/otel-phase1c-rpc-integration' into pratik/otel-phase2-rpc-tracing
This commit is contained in:
3
docs/build/telemetry.md
vendored
3
docs/build/telemetry.md
vendored
@@ -36,7 +36,8 @@ such as Grafana Tempo.
|
||||
Telemetry is **off by default** at both compile time and runtime:
|
||||
|
||||
- **Compile time**: The Conan option `telemetry` and CMake option `telemetry` must be set to `True` and `ON`, respectively.
|
||||
When disabled, all tracing macros compile to `((void)0)` with zero overhead.
|
||||
When disabled, all `SpanGuard` calls compile to inline no-ops (defined in `SpanGuard.h`)
|
||||
with zero overhead — no OTel SDK dependency required.
|
||||
- **Runtime**: The `[telemetry]` config section must set `enabled=1`.
|
||||
When disabled at runtime, a no-op implementation is used.
|
||||
|
||||
|
||||
@@ -15,7 +15,6 @@
|
||||
// Add new amendments to the top of this list.
|
||||
// Keep it sorted in reverse chronological order.
|
||||
|
||||
XRPL_FIX (Cleanup3_2_0, Supported::no, VoteBehavior::DefaultNo)
|
||||
XRPL_FEATURE(MPTokensV2, Supported::no, VoteBehavior::DefaultNo)
|
||||
XRPL_FIX (Security3_1_3, Supported::no, VoteBehavior::DefaultNo)
|
||||
XRPL_FIX (PermissionedDomainInvariant, Supported::yes, VoteBehavior::DefaultNo)
|
||||
|
||||
@@ -20,7 +20,13 @@
|
||||
* - Per-span attribute keys: bare field name (span name carries the domain).
|
||||
* - Collision qualifier: <domain>_<field> when bare name collides across
|
||||
* domains or with OTel reserved `status` (e.g. rpc_status, grpc_status).
|
||||
* - Resource attribute keys: xrpl.<subsystem>.<field> (process-identity).
|
||||
* - Shared cross-span attributes: <domain>_<field> (underscore) form
|
||||
* (e.g. tx_hash, peer_id, ledger_seq, consensus_round).
|
||||
* - Resource attribute keys: xrpl.<subsystem>.<field> (dotted) form is
|
||||
* RESERVED for process-identity attributes set once at startup on the
|
||||
* OTel resource (e.g. xrpl.network.id, xrpl.network.type). Do not use
|
||||
* this form for span attributes — it parses awkwardly in TraceQL and
|
||||
* blurs the resource/span scope distinction.
|
||||
* - Span prefixes: <subsystem>[.<component>].
|
||||
*/
|
||||
|
||||
|
||||
@@ -3,13 +3,15 @@
|
||||
/** Abstract interface for OpenTelemetry distributed tracing.
|
||||
|
||||
Provides the Telemetry base class that all components use to create trace
|
||||
spans. Two concrete implementations exist, selected at construction time
|
||||
spans. Three concrete implementations exist, selected at construction time
|
||||
by make_Telemetry():
|
||||
|
||||
- TelemetryImpl (Telemetry.cpp): real OTel SDK integration, compiled
|
||||
only when XRPL_ENABLE_TELEMETRY is defined and enabled at runtime.
|
||||
- NullTelemetry (NullTelemetry.cpp): no-op stub used when telemetry is
|
||||
disabled at compile time or runtime.
|
||||
- NullTelemetryOtel (Telemetry.cpp): no-op stub that still depends on
|
||||
the OTel API (used during transition or for testing).
|
||||
|
||||
Inheritance / dependency diagram:
|
||||
|
||||
@@ -35,32 +37,44 @@
|
||||
|
||||
Usage examples:
|
||||
|
||||
1. Check before tracing (typical guard pattern):
|
||||
1. Root span at a subsystem entry point (typical usage):
|
||||
@code
|
||||
auto& telemetry = registry.getTelemetry();
|
||||
if (telemetry.isEnabled() && telemetry.shouldTraceRpc())
|
||||
#include <xrpld/rpc/detail/RpcSpanNames.h>
|
||||
using namespace xrpl::telemetry;
|
||||
|
||||
// In an RPC handler dispatch:
|
||||
auto guard = SpanGuard::span(
|
||||
TraceCategory::Rpc, rpc_span::prefix::command, commandName);
|
||||
guard.setAttribute(rpc_span::attr::command, commandName);
|
||||
// ... process request
|
||||
// guard destructor automatically ends the span on scope exit
|
||||
@endcode
|
||||
|
||||
2. Child span for a sub-operation (scoped child):
|
||||
@code
|
||||
auto parent = SpanGuard::span(TraceCategory::Transactions, "tx", "process");
|
||||
{
|
||||
auto span = telemetry.startSpan("rpc.command.server_info");
|
||||
// ... do work, span ends when shared_ptr refcount drops to 0
|
||||
auto child = parent.childSpan("tx.apply");
|
||||
child.setAttribute("tx_type", txType);
|
||||
// child ends here
|
||||
}
|
||||
@endcode
|
||||
|
||||
2. RAII tracing with SpanGuard (preferred):
|
||||
3. Unrelated span (cross-scope, same thread):
|
||||
@code
|
||||
if (telemetry.isEnabled() && telemetry.shouldTraceRpc())
|
||||
{
|
||||
SpanGuard guard(telemetry.startSpan("rpc.command.submit"));
|
||||
guard.setAttribute("command", "submit");
|
||||
// ... guard ends span automatically on scope exit
|
||||
}
|
||||
// Transactions and RPC can be active simultaneously
|
||||
auto txSpan = SpanGuard::span(TraceCategory::Transactions, "tx", "process");
|
||||
auto rpcSpan = SpanGuard::span(TraceCategory::Rpc, "rpc", "info");
|
||||
// both spans end on scope exit
|
||||
@endcode
|
||||
|
||||
3. Cross-thread context propagation:
|
||||
4. Cross-thread context propagation:
|
||||
@code
|
||||
// On thread A: capture context
|
||||
auto ctx = guard.context();
|
||||
// On thread B: create child span with explicit parent
|
||||
auto child = telemetry.startSpan("async.work", ctx);
|
||||
// Thread A: capture the active context while span is in scope
|
||||
auto ctx = parentGuard.captureContext();
|
||||
|
||||
// Thread B: create child span with explicit parent
|
||||
auto child = SpanGuard::childSpan("async.work", ctx);
|
||||
@endcode
|
||||
|
||||
@note Thread safety: The Telemetry interface is safe for concurrent reads
|
||||
|
||||
@@ -36,6 +36,7 @@
|
||||
#include <opentelemetry/trace/tracer.h>
|
||||
|
||||
#include <string>
|
||||
#include <typeinfo>
|
||||
#include <utility>
|
||||
|
||||
namespace xrpl {
|
||||
@@ -332,7 +333,7 @@ SpanGuard::recordException(std::exception const& e)
|
||||
return;
|
||||
impl_->span->AddEvent(
|
||||
"exception",
|
||||
{{"exception.type", "std::exception"}, {"exception.message", std::string(e.what())}});
|
||||
{{"exception.type", typeid(e).name()}, {"exception.message", std::string(e.what())}});
|
||||
impl_->span->SetStatus(otel_trace::StatusCode::kError, e.what());
|
||||
}
|
||||
|
||||
|
||||
@@ -169,8 +169,7 @@ void
|
||||
GRPCServerImpl::CallData<Request, Response>::process(std::shared_ptr<JobQueue::Coro> coro)
|
||||
{
|
||||
using namespace telemetry;
|
||||
auto span =
|
||||
SpanGuard::span(TraceCategory::Rpc, grpc_span::prefix::grpc, grpc_span::op::request);
|
||||
auto span = SpanGuard::span(TraceCategory::Rpc, grpc_span::prefix::grpc, name_);
|
||||
span.setAttribute(grpc_span::attr::method, name_);
|
||||
|
||||
try
|
||||
@@ -179,6 +178,7 @@ GRPCServerImpl::CallData<Request, Response>::process(std::shared_ptr<JobQueue::C
|
||||
bool const isUnlimited = clientIsUnlimited();
|
||||
if (!isUnlimited && usage.disconnect(app_.getJournal("gRPCServer")))
|
||||
{
|
||||
span.setAttribute(grpc_span::attr::grpcStatus, grpc_span::val::error);
|
||||
span.setError(grpc_span::val::resourceExhausted);
|
||||
grpc::Status const status{
|
||||
grpc::StatusCode::RESOURCE_EXHAUSTED, "usage balance exceeds threshold"};
|
||||
@@ -190,6 +190,10 @@ GRPCServerImpl::CallData<Request, Response>::process(std::shared_ptr<JobQueue::C
|
||||
usage.charge(loadType);
|
||||
auto role = getRole(isUnlimited);
|
||||
|
||||
span.setAttribute(
|
||||
grpc_span::attr::grpcRole,
|
||||
role == Role::ADMIN ? grpc_span::val::admin : grpc_span::val::user);
|
||||
|
||||
{
|
||||
std::stringstream toLog;
|
||||
toLog << "role = " << (int)role;
|
||||
@@ -225,6 +229,7 @@ GRPCServerImpl::CallData<Request, Response>::process(std::shared_ptr<JobQueue::C
|
||||
if (conditionMetRes != rpcSUCCESS)
|
||||
{
|
||||
RPC::ErrorInfo const errorInfo = RPC::get_error_info(conditionMetRes);
|
||||
span.setAttribute(grpc_span::attr::grpcStatus, grpc_span::val::error);
|
||||
span.setError(errorInfo.token.c_str());
|
||||
grpc::Status const status{
|
||||
grpc::StatusCode::FAILED_PRECONDITION, errorInfo.message.c_str()};
|
||||
@@ -234,6 +239,7 @@ GRPCServerImpl::CallData<Request, Response>::process(std::shared_ptr<JobQueue::C
|
||||
{
|
||||
std::pair<Response, grpc::Status> result = handler_(context);
|
||||
setIsUnlimited(result.first, isUnlimited);
|
||||
span.setAttribute(grpc_span::attr::grpcStatus, grpc_span::val::success);
|
||||
span.setOk();
|
||||
responder_.Finish(result.first, result.second, this);
|
||||
}
|
||||
@@ -241,6 +247,7 @@ GRPCServerImpl::CallData<Request, Response>::process(std::shared_ptr<JobQueue::C
|
||||
}
|
||||
catch (std::exception const& ex)
|
||||
{
|
||||
span.setAttribute(grpc_span::attr::grpcStatus, grpc_span::val::error);
|
||||
span.recordException(ex);
|
||||
grpc::Status const status{grpc::StatusCode::INTERNAL, ex.what()};
|
||||
responder_.FinishWithError(status, this);
|
||||
|
||||
@@ -9,13 +9,16 @@
|
||||
* Span hierarchy:
|
||||
*
|
||||
* +-------------------------------------------------------+
|
||||
* | grpc.request |
|
||||
* | grpc.<MethodName> (e.g. grpc.GetLedger) |
|
||||
* | CallData::process(coro) |
|
||||
* | attrs: method, grpc_role, grpc_status |
|
||||
* +-------------------------------------------------------+
|
||||
*
|
||||
* Unlike the HTTP/WS RPC path, gRPC has a flat single-span structure
|
||||
* per request since each CallData handles exactly one RPC method.
|
||||
* The method name is embedded in the span name (rather than only as
|
||||
* an attribute) so dashboards can break out per-method latency and
|
||||
* error rates without needing TraceQL attribute filters.
|
||||
*/
|
||||
|
||||
#include <xrpl/telemetry/SpanNames.h>
|
||||
@@ -25,16 +28,11 @@ namespace xrpl::telemetry::grpc_span {
|
||||
// ===== Span prefixes =======================================================
|
||||
|
||||
namespace prefix {
|
||||
/// "grpc" — root prefix for gRPC transport spans.
|
||||
/// "grpc" — root prefix for gRPC transport spans. The full span name is
|
||||
/// formed at the call site as `grpc.<MethodName>` (see GRPCServer.cpp).
|
||||
inline constexpr auto grpc = makeStr("grpc");
|
||||
} // namespace prefix
|
||||
|
||||
// ===== Span operation suffixes =============================================
|
||||
|
||||
namespace op {
|
||||
inline constexpr auto request = makeStr("request");
|
||||
} // namespace op
|
||||
|
||||
// ===== Attribute keys ======================================================
|
||||
|
||||
namespace attr {
|
||||
@@ -51,6 +49,8 @@ inline constexpr auto grpcStatus = makeStr("grpc_status");
|
||||
namespace val {
|
||||
using telemetry::attr_val::error;
|
||||
using telemetry::attr_val::success;
|
||||
inline constexpr auto admin = makeStr("admin");
|
||||
inline constexpr auto user = makeStr("user");
|
||||
inline constexpr auto resourceExhausted = makeStr("resource_exhausted");
|
||||
inline constexpr auto failedPrecondition = makeStr("failed_precondition");
|
||||
} // namespace val
|
||||
|
||||
@@ -185,7 +185,12 @@ callMethod(JsonContext& context, Method method, std::string const& name, Object&
|
||||
JLOG(context.j.debug()) << "RPC call " << name << " completed in "
|
||||
<< ((end - start).count() / 1000000000.0) << "seconds";
|
||||
perfLog.rpcFinish(name, curId);
|
||||
span.setAttribute(rpc_span::attr::rpcStatus, rpc_span::val::success);
|
||||
// Status::operator bool() returns true when there IS an error
|
||||
// (code_ != OK), so the ternary correctly maps error->error, ok->success.
|
||||
span.setAttribute(
|
||||
rpc_span::attr::rpcStatus, ret ? rpc_span::val::error : rpc_span::val::success);
|
||||
if (!ret)
|
||||
span.setOk();
|
||||
return ret;
|
||||
}
|
||||
catch (std::exception& e)
|
||||
@@ -224,8 +229,11 @@ doCommand(RPC::JsonContext& context, Json::Value& result)
|
||||
{
|
||||
cmdName = "unknown";
|
||||
}
|
||||
auto span = SpanGuard::span(
|
||||
TraceCategory::Rpc, rpc_span::prefix::command, rpc_span::val::unknownCommand);
|
||||
// Use the resolved command name as the span suffix so dashboards
|
||||
// can break out per-command error rates (e.g. rpc.command.submit
|
||||
// for a submit that hit rpcTOO_BUSY). We fall back to a single
|
||||
// "unknown" name only when the request truly omits both fields.
|
||||
auto span = SpanGuard::span(TraceCategory::Rpc, rpc_span::prefix::command, cmdName);
|
||||
span.setAttribute(rpc_span::attr::command, cmdName.c_str());
|
||||
span.setError(get_error_info(error).token.c_str());
|
||||
|
||||
|
||||
@@ -86,7 +86,7 @@
|
||||
* gRPC path (see GrpcSpanNames.h for constants):
|
||||
*
|
||||
* +-------------------------------------------------------+
|
||||
* | grpc.request |
|
||||
* | grpc.<MethodName> (e.g. grpc.GetLedger) |
|
||||
* | CallData::process(coro) |
|
||||
* | attrs: method, grpc_role, grpc_status |
|
||||
* +-------------------------------------------------------+
|
||||
|
||||
@@ -564,6 +564,7 @@ ServerHandler::processSession(
|
||||
jr[jss::api_version] = jv[jss::api_version];
|
||||
|
||||
jr[jss::type] = jss::response;
|
||||
span.setOk();
|
||||
return jr;
|
||||
}
|
||||
|
||||
@@ -598,6 +599,7 @@ ServerHandler::processSession(
|
||||
{
|
||||
session->close(true);
|
||||
}
|
||||
span.setOk();
|
||||
}
|
||||
|
||||
static Json::Value
|
||||
@@ -1036,6 +1038,7 @@ ServerHandler::processRequest(
|
||||
}
|
||||
}
|
||||
|
||||
span.setOk();
|
||||
HTTPReply(httpStatus, response, output, rpcJ);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user