Merge branch 'pratik/otel-phase1c-rpc-integration' into pratik/otel-phase2-rpc-tracing

This commit is contained in:
Pratik Mankawde
2026-05-27 18:27:11 +01:00
10 changed files with 75 additions and 36 deletions

View File

@@ -36,6 +36,7 @@
#include <opentelemetry/trace/tracer.h>
#include <string>
#include <typeinfo>
#include <utility>
namespace xrpl {
@@ -332,7 +333,7 @@ SpanGuard::recordException(std::exception const& e)
return;
impl_->span->AddEvent(
"exception",
{{"exception.type", "std::exception"}, {"exception.message", std::string(e.what())}});
{{"exception.type", typeid(e).name()}, {"exception.message", std::string(e.what())}});
impl_->span->SetStatus(otel_trace::StatusCode::kError, e.what());
}

View File

@@ -169,8 +169,7 @@ void
GRPCServerImpl::CallData<Request, Response>::process(std::shared_ptr<JobQueue::Coro> coro)
{
using namespace telemetry;
auto span =
SpanGuard::span(TraceCategory::Rpc, grpc_span::prefix::grpc, grpc_span::op::request);
auto span = SpanGuard::span(TraceCategory::Rpc, grpc_span::prefix::grpc, name_);
span.setAttribute(grpc_span::attr::method, name_);
try
@@ -179,6 +178,7 @@ GRPCServerImpl::CallData<Request, Response>::process(std::shared_ptr<JobQueue::C
bool const isUnlimited = clientIsUnlimited();
if (!isUnlimited && usage.disconnect(app_.getJournal("gRPCServer")))
{
span.setAttribute(grpc_span::attr::grpcStatus, grpc_span::val::error);
span.setError(grpc_span::val::resourceExhausted);
grpc::Status const status{
grpc::StatusCode::RESOURCE_EXHAUSTED, "usage balance exceeds threshold"};
@@ -190,6 +190,10 @@ GRPCServerImpl::CallData<Request, Response>::process(std::shared_ptr<JobQueue::C
usage.charge(loadType);
auto role = getRole(isUnlimited);
span.setAttribute(
grpc_span::attr::grpcRole,
role == Role::ADMIN ? grpc_span::val::admin : grpc_span::val::user);
{
std::stringstream toLog;
toLog << "role = " << (int)role;
@@ -225,6 +229,7 @@ GRPCServerImpl::CallData<Request, Response>::process(std::shared_ptr<JobQueue::C
if (conditionMetRes != rpcSUCCESS)
{
RPC::ErrorInfo const errorInfo = RPC::get_error_info(conditionMetRes);
span.setAttribute(grpc_span::attr::grpcStatus, grpc_span::val::error);
span.setError(errorInfo.token.c_str());
grpc::Status const status{
grpc::StatusCode::FAILED_PRECONDITION, errorInfo.message.c_str()};
@@ -234,6 +239,7 @@ GRPCServerImpl::CallData<Request, Response>::process(std::shared_ptr<JobQueue::C
{
std::pair<Response, grpc::Status> result = handler_(context);
setIsUnlimited(result.first, isUnlimited);
span.setAttribute(grpc_span::attr::grpcStatus, grpc_span::val::success);
span.setOk();
responder_.Finish(result.first, result.second, this);
}
@@ -241,6 +247,7 @@ GRPCServerImpl::CallData<Request, Response>::process(std::shared_ptr<JobQueue::C
}
catch (std::exception const& ex)
{
span.setAttribute(grpc_span::attr::grpcStatus, grpc_span::val::error);
span.recordException(ex);
grpc::Status const status{grpc::StatusCode::INTERNAL, ex.what()};
responder_.FinishWithError(status, this);

View File

@@ -9,13 +9,16 @@
* Span hierarchy:
*
* +-------------------------------------------------------+
* | grpc.request |
* | grpc.<MethodName> (e.g. grpc.GetLedger) |
* | CallData::process(coro) |
* | attrs: method, grpc_role, grpc_status |
* +-------------------------------------------------------+
*
* Unlike the HTTP/WS RPC path, gRPC has a flat single-span structure
* per request since each CallData handles exactly one RPC method.
* The method name is embedded in the span name (rather than only as
* an attribute) so dashboards can break out per-method latency and
* error rates without needing TraceQL attribute filters.
*/
#include <xrpl/telemetry/SpanNames.h>
@@ -25,16 +28,11 @@ namespace xrpl::telemetry::grpc_span {
// ===== Span prefixes =======================================================
namespace prefix {
/// "grpc" — root prefix for gRPC transport spans.
/// "grpc" — root prefix for gRPC transport spans. The full span name is
/// formed at the call site as `grpc.<MethodName>` (see GRPCServer.cpp).
inline constexpr auto grpc = makeStr("grpc");
} // namespace prefix
// ===== Span operation suffixes =============================================
namespace op {
inline constexpr auto request = makeStr("request");
} // namespace op
// ===== Attribute keys ======================================================
namespace attr {
@@ -51,6 +49,8 @@ inline constexpr auto grpcStatus = makeStr("grpc_status");
namespace val {
/// Attribute values recorded on gRPC spans.
///
/// `error` / `success` are re-exported from telemetry::attr_val and are
/// the values written to the grpc_status attribute by the gRPC request
/// handler.
using telemetry::attr_val::error;
using telemetry::attr_val::success;
/// Values for the attr::grpcRole attribute: "admin" when the resolved
/// role is Role::ADMIN, "user" for every other role.
inline constexpr auto admin = makeStr("admin");
inline constexpr auto user = makeStr("user");
/// Error description set on the span when the caller's resource usage
/// balance exceeds the threshold and the request is rejected.
inline constexpr auto resourceExhausted = makeStr("resource_exhausted");
/// NOTE(review): presumably intended for failed-precondition rejections;
/// no use of this constant is visible here — confirm against callers.
inline constexpr auto failedPrecondition = makeStr("failed_precondition");
} // namespace val

View File

@@ -185,7 +185,12 @@ callMethod(JsonContext& context, Method method, std::string const& name, Object&
JLOG(context.j.debug()) << "RPC call " << name << " completed in "
<< ((end - start).count() / 1000000000.0) << "seconds";
perfLog.rpcFinish(name, curId);
span.setAttribute(rpc_span::attr::rpcStatus, rpc_span::val::success);
// Status::operator bool() returns true when there IS an error
// (code_ != OK), so the ternary correctly maps error->error, ok->success.
span.setAttribute(
rpc_span::attr::rpcStatus, ret ? rpc_span::val::error : rpc_span::val::success);
if (!ret)
span.setOk();
return ret;
}
catch (std::exception& e)
@@ -224,8 +229,11 @@ doCommand(RPC::JsonContext& context, Json::Value& result)
{
cmdName = "unknown";
}
auto span = SpanGuard::span(
TraceCategory::Rpc, rpc_span::prefix::command, rpc_span::val::unknownCommand);
// Use the resolved command name as the span suffix so dashboards
// can break out per-command error rates (e.g. rpc.command.submit
// for a submit that hit rpcTOO_BUSY). The span name falls back to
// "unknown" only when the request truly omits both fields.
auto span = SpanGuard::span(TraceCategory::Rpc, rpc_span::prefix::command, cmdName);
span.setAttribute(rpc_span::attr::command, cmdName.c_str());
span.setError(get_error_info(error).token.c_str());

View File

@@ -86,7 +86,7 @@
* gRPC path (see GrpcSpanNames.h for constants):
*
* +-------------------------------------------------------+
* | grpc.request |
* | grpc.<MethodName> (e.g. grpc.GetLedger) |
* | CallData::process(coro) |
* | attrs: method, grpc_role, grpc_status |
* +-------------------------------------------------------+

View File

@@ -564,6 +564,7 @@ ServerHandler::processSession(
jr[jss::api_version] = jv[jss::api_version];
jr[jss::type] = jss::response;
span.setOk();
return jr;
}
@@ -598,6 +599,7 @@ ServerHandler::processSession(
{
session->close(true);
}
span.setOk();
}
static Json::Value
@@ -1036,6 +1038,7 @@ ServerHandler::processRequest(
}
}
span.setOk();
HTTPReply(httpStatus, response, output, rpcJ);
}