mirror of
https://github.com/XRPLF/rippled.git
synced 2026-06-02 16:26:48 +00:00
Merge branch 'pratik/otel-phase1c-rpc-integration' into pratik/otel-phase2-rpc-tracing
This commit is contained in:
@@ -36,6 +36,7 @@
|
||||
#include <opentelemetry/trace/tracer.h>
|
||||
|
||||
#include <string>
|
||||
#include <typeinfo>
|
||||
#include <utility>
|
||||
|
||||
namespace xrpl {
|
||||
@@ -332,7 +333,7 @@ SpanGuard::recordException(std::exception const& e)
|
||||
return;
|
||||
impl_->span->AddEvent(
|
||||
"exception",
|
||||
{{"exception.type", "std::exception"}, {"exception.message", std::string(e.what())}});
|
||||
{{"exception.type", typeid(e).name()}, {"exception.message", std::string(e.what())}});
|
||||
impl_->span->SetStatus(otel_trace::StatusCode::kError, e.what());
|
||||
}
|
||||
|
||||
|
||||
@@ -169,8 +169,7 @@ void
|
||||
GRPCServerImpl::CallData<Request, Response>::process(std::shared_ptr<JobQueue::Coro> coro)
|
||||
{
|
||||
using namespace telemetry;
|
||||
auto span =
|
||||
SpanGuard::span(TraceCategory::Rpc, grpc_span::prefix::grpc, grpc_span::op::request);
|
||||
auto span = SpanGuard::span(TraceCategory::Rpc, grpc_span::prefix::grpc, name_);
|
||||
span.setAttribute(grpc_span::attr::method, name_);
|
||||
|
||||
try
|
||||
@@ -179,6 +178,7 @@ GRPCServerImpl::CallData<Request, Response>::process(std::shared_ptr<JobQueue::C
|
||||
bool const isUnlimited = clientIsUnlimited();
|
||||
if (!isUnlimited && usage.disconnect(app_.getJournal("gRPCServer")))
|
||||
{
|
||||
span.setAttribute(grpc_span::attr::grpcStatus, grpc_span::val::error);
|
||||
span.setError(grpc_span::val::resourceExhausted);
|
||||
grpc::Status const status{
|
||||
grpc::StatusCode::RESOURCE_EXHAUSTED, "usage balance exceeds threshold"};
|
||||
@@ -190,6 +190,10 @@ GRPCServerImpl::CallData<Request, Response>::process(std::shared_ptr<JobQueue::C
|
||||
usage.charge(loadType);
|
||||
auto role = getRole(isUnlimited);
|
||||
|
||||
span.setAttribute(
|
||||
grpc_span::attr::grpcRole,
|
||||
role == Role::ADMIN ? grpc_span::val::admin : grpc_span::val::user);
|
||||
|
||||
{
|
||||
std::stringstream toLog;
|
||||
toLog << "role = " << (int)role;
|
||||
@@ -225,6 +229,7 @@ GRPCServerImpl::CallData<Request, Response>::process(std::shared_ptr<JobQueue::C
|
||||
if (conditionMetRes != rpcSUCCESS)
|
||||
{
|
||||
RPC::ErrorInfo const errorInfo = RPC::get_error_info(conditionMetRes);
|
||||
span.setAttribute(grpc_span::attr::grpcStatus, grpc_span::val::error);
|
||||
span.setError(errorInfo.token.c_str());
|
||||
grpc::Status const status{
|
||||
grpc::StatusCode::FAILED_PRECONDITION, errorInfo.message.c_str()};
|
||||
@@ -234,6 +239,7 @@ GRPCServerImpl::CallData<Request, Response>::process(std::shared_ptr<JobQueue::C
|
||||
{
|
||||
std::pair<Response, grpc::Status> result = handler_(context);
|
||||
setIsUnlimited(result.first, isUnlimited);
|
||||
span.setAttribute(grpc_span::attr::grpcStatus, grpc_span::val::success);
|
||||
span.setOk();
|
||||
responder_.Finish(result.first, result.second, this);
|
||||
}
|
||||
@@ -241,6 +247,7 @@ GRPCServerImpl::CallData<Request, Response>::process(std::shared_ptr<JobQueue::C
|
||||
}
|
||||
catch (std::exception const& ex)
|
||||
{
|
||||
span.setAttribute(grpc_span::attr::grpcStatus, grpc_span::val::error);
|
||||
span.recordException(ex);
|
||||
grpc::Status const status{grpc::StatusCode::INTERNAL, ex.what()};
|
||||
responder_.FinishWithError(status, this);
|
||||
|
||||
@@ -9,13 +9,16 @@
|
||||
* Span hierarchy:
|
||||
*
|
||||
* +-------------------------------------------------------+
|
||||
* | grpc.request |
|
||||
* | grpc.<MethodName> (e.g. grpc.GetLedger) |
|
||||
* | CallData::process(coro) |
|
||||
* | attrs: method, grpc_role, grpc_status |
|
||||
* +-------------------------------------------------------+
|
||||
*
|
||||
* Unlike the HTTP/WS RPC path, gRPC has a flat single-span structure
|
||||
* per request since each CallData handles exactly one RPC method.
|
||||
* The method name is embedded in the span name (rather than only as
|
||||
* an attribute) so dashboards can break out per-method latency and
|
||||
* error rates without needing TraceQL attribute filters.
|
||||
*/
|
||||
|
||||
#include <xrpl/telemetry/SpanNames.h>
|
||||
@@ -25,16 +28,11 @@ namespace xrpl::telemetry::grpc_span {
|
||||
// ===== Span prefixes =======================================================
|
||||
|
||||
namespace prefix {
|
||||
/// "grpc" — root prefix for gRPC transport spans.
|
||||
/// "grpc" — root prefix for gRPC transport spans. The full span name is
|
||||
/// formed at the call site as `grpc.<MethodName>` (see GRPCServer.cpp).
|
||||
inline constexpr auto grpc = makeStr("grpc");
|
||||
} // namespace prefix
|
||||
|
||||
// ===== Span operation suffixes =============================================
|
||||
|
||||
namespace op {
|
||||
inline constexpr auto request = makeStr("request");
|
||||
} // namespace op
|
||||
|
||||
// ===== Attribute keys ======================================================
|
||||
|
||||
namespace attr {
|
||||
@@ -51,6 +49,8 @@ inline constexpr auto grpcStatus = makeStr("grpc_status");
|
||||
namespace val {
|
||||
using telemetry::attr_val::error;
|
||||
using telemetry::attr_val::success;
|
||||
inline constexpr auto admin = makeStr("admin");
|
||||
inline constexpr auto user = makeStr("user");
|
||||
inline constexpr auto resourceExhausted = makeStr("resource_exhausted");
|
||||
inline constexpr auto failedPrecondition = makeStr("failed_precondition");
|
||||
} // namespace val
|
||||
|
||||
@@ -185,7 +185,12 @@ callMethod(JsonContext& context, Method method, std::string const& name, Object&
|
||||
JLOG(context.j.debug()) << "RPC call " << name << " completed in "
|
||||
<< ((end - start).count() / 1000000000.0) << "seconds";
|
||||
perfLog.rpcFinish(name, curId);
|
||||
span.setAttribute(rpc_span::attr::rpcStatus, rpc_span::val::success);
|
||||
// Status::operator bool() returns true when there IS an error
|
||||
// (code_ != OK), so the ternary correctly maps error->error, ok->success.
|
||||
span.setAttribute(
|
||||
rpc_span::attr::rpcStatus, ret ? rpc_span::val::error : rpc_span::val::success);
|
||||
if (!ret)
|
||||
span.setOk();
|
||||
return ret;
|
||||
}
|
||||
catch (std::exception& e)
|
||||
@@ -224,8 +229,11 @@ doCommand(RPC::JsonContext& context, Json::Value& result)
|
||||
{
|
||||
cmdName = "unknown";
|
||||
}
|
||||
auto span = SpanGuard::span(
|
||||
TraceCategory::Rpc, rpc_span::prefix::command, rpc_span::val::unknownCommand);
|
||||
// Use the resolved command name as the span suffix so dashboards
|
||||
// can break out per-command error rates (e.g. rpc.command.submit
|
||||
// for a submit that hit rpcTOO_BUSY). We fall back to a single
|
||||
// "unknown" name only when the request truly omits both fields.
|
||||
auto span = SpanGuard::span(TraceCategory::Rpc, rpc_span::prefix::command, cmdName);
|
||||
span.setAttribute(rpc_span::attr::command, cmdName.c_str());
|
||||
span.setError(get_error_info(error).token.c_str());
|
||||
|
||||
|
||||
@@ -86,7 +86,7 @@
|
||||
* gRPC path (see GrpcSpanNames.h for constants):
|
||||
*
|
||||
* +-------------------------------------------------------+
|
||||
* | grpc.request |
|
||||
* | grpc.<MethodName> (e.g. grpc.GetLedger) |
|
||||
* | CallData::process(coro) |
|
||||
* | attrs: method, grpc_role, grpc_status |
|
||||
* +-------------------------------------------------------+
|
||||
|
||||
@@ -564,6 +564,7 @@ ServerHandler::processSession(
|
||||
jr[jss::api_version] = jv[jss::api_version];
|
||||
|
||||
jr[jss::type] = jss::response;
|
||||
span.setOk();
|
||||
return jr;
|
||||
}
|
||||
|
||||
@@ -598,6 +599,7 @@ ServerHandler::processSession(
|
||||
{
|
||||
session->close(true);
|
||||
}
|
||||
span.setOk();
|
||||
}
|
||||
|
||||
static Json::Value
|
||||
@@ -1036,6 +1038,7 @@ ServerHandler::processRequest(
|
||||
}
|
||||
}
|
||||
|
||||
span.setOk();
|
||||
HTTPReply(httpStatus, response, output, rpcJ);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user