From a104140a517c6c43617a4ab3bc30a0e6d3e24f92 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Wed, 27 May 2026 16:46:35 +0100 Subject: [PATCH 1/6] addressing code review comments Signed-off-by: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> --- docs/build/telemetry.md | 3 +- include/xrpl/protocol/detail/features.macro | 1 - include/xrpl/telemetry/Telemetry.h | 33 +++++++++++---------- src/libxrpl/telemetry/SpanGuard.cpp | 3 +- 4 files changed, 21 insertions(+), 19 deletions(-) diff --git a/docs/build/telemetry.md b/docs/build/telemetry.md index e8d7fb2dd5..eff1a6848b 100644 --- a/docs/build/telemetry.md +++ b/docs/build/telemetry.md @@ -36,7 +36,8 @@ such as Grafana Tempo. Telemetry is **off by default** at both compile time and runtime: - **Compile time**: The Conan option `telemetry` and CMake option `telemetry` must be set to `True`/`ON`. - When disabled, all tracing macros compile to `((void)0)` with zero overhead. + When disabled, all `SpanGuard` calls compile to inline no-ops (defined in `SpanGuard.h`) + with zero overhead — no OTel SDK dependency required. - **Runtime**: The `[telemetry]` config section must set `enabled=1`. When disabled at runtime, a no-op implementation is used. diff --git a/include/xrpl/protocol/detail/features.macro b/include/xrpl/protocol/detail/features.macro index bad43dd6ed..494b3fa6cd 100644 --- a/include/xrpl/protocol/detail/features.macro +++ b/include/xrpl/protocol/detail/features.macro @@ -15,7 +15,6 @@ // Add new amendments to the top of this list. // Keep it sorted in reverse chronological order. -XRPL_FIX (Cleanup3_2_0, Supported::no, VoteBehavior::DefaultNo) XRPL_FEATURE(MPTokensV2, Supported::no, VoteBehavior::DefaultNo) XRPL_FIX (Security3_1_3, Supported::no, VoteBehavior::DefaultNo) XRPL_FIX (PermissionedDomainInvariant, Supported::yes, VoteBehavior::DefaultNo) diff --git a/include/xrpl/telemetry/Telemetry.h b/include/xrpl/telemetry/Telemetry.h index 1d69e01a43..ae9303a1db 100644 --- a/include/xrpl/telemetry/Telemetry.h +++ b/include/xrpl/telemetry/Telemetry.h @@ -35,32 +35,33 @@ Usage examples: - 1. Check before tracing (typical guard pattern): + 1. Root span at a subsystem entry point (typical usage): @code - auto& telemetry = registry.getTelemetry(); - if (telemetry.isEnabled() && telemetry.shouldTraceRpc()) - { - auto span = telemetry.startSpan("rpc.command.server_info"); - // ... do work, span ends when shared_ptr refcount drops to 0 - } + // In an RPC handler dispatch: + auto guard = SpanGuard::span(TraceCategory::Rpc, "rpc", commandName); + guard.setAttribute("xrpl.rpc.command", commandName); + // ... process request + // guard destructor automatically ends the span on scope exit @endcode - 2. RAII tracing with SpanGuard (preferred): + 2. Child span for a sub-operation (cross-scope): @code - if (telemetry.isEnabled() && telemetry.shouldTraceRpc()) + auto parent = SpanGuard::span(TraceCategory::Transactions, "tx", "process"); { - SpanGuard guard(telemetry.startSpan("rpc.command.submit")); - guard.setAttribute("xrpl.rpc.command", "submit"); - // ... guard ends span automatically on scope exit + auto child = parent.childSpan("tx.apply"); + child.setAttribute("xrpl.tx.type", txType); + // child ends here } + // parent continues, then ends here @endcode 3. Cross-thread context propagation: @code - // On thread A: capture context - auto ctx = guard.context(); - // On thread B: create child span with explicit parent - auto child = telemetry.startSpan("async.work", ctx); + // Thread A: capture the active context while span is in scope + auto ctx = parentGuard.captureContext(); + + // Thread B: create child span with explicit parent + auto child = SpanGuard::childSpan("async.work", ctx); @endcode @note Thread safety: The Telemetry interface is safe for concurrent reads diff --git a/src/libxrpl/telemetry/SpanGuard.cpp b/src/libxrpl/telemetry/SpanGuard.cpp index 11eda56ed6..90bb139b97 100644 --- a/src/libxrpl/telemetry/SpanGuard.cpp +++ b/src/libxrpl/telemetry/SpanGuard.cpp @@ -35,6 +35,7 @@ #include #include +#include #include namespace xrpl { @@ -327,7 +328,7 @@ SpanGuard::recordException(std::exception const& e) return; impl_->span->AddEvent( "exception", - {{"exception.type", "std::exception"}, {"exception.message", std::string(e.what())}}); + {{"exception.type", typeid(e).name()}, {"exception.message", std::string(e.what())}}); impl_->span->SetStatus(otel_trace::StatusCode::kError, e.what()); } From 6aa8570d6ce52cb6d15cbe346fc6a7e1a0d585e1 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Wed, 27 May 2026 17:36:06 +0100 Subject: [PATCH 2/6] addressed code review comments. Signed-off-by: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> --- include/xrpl/telemetry/SpanNames.h | 1 + include/xrpl/telemetry/Telemetry.h | 35 ++++++++++++++------------ src/xrpld/app/main/GRPCServer.cpp | 8 ++++++ src/xrpld/app/main/GrpcSpanNames.h | 2 ++ src/xrpld/rpc/detail/RPCHandler.cpp | 7 +++++- src/xrpld/rpc/detail/ServerHandler.cpp | 3 +++ 6 files changed, 39 insertions(+), 17 deletions(-) diff --git a/include/xrpl/telemetry/SpanNames.h b/include/xrpl/telemetry/SpanNames.h index 3c8b6144f9..9fdc37cd64 100644 --- a/include/xrpl/telemetry/SpanNames.h +++ b/include/xrpl/telemetry/SpanNames.h @@ -20,6 +20,7 @@ * - Per-span attribute keys: bare field name (span name carries the domain). * - Collision qualifier: _ when bare name collides across * domains or with OTel reserved `status` (e.g. rpc_status, grpc_status). + * - Shared cross-span attributes: _ (underscore) form. * - Resource attribute keys: xrpl.. (process-identity). * - Span prefixes: [.]. */ diff --git a/include/xrpl/telemetry/Telemetry.h b/include/xrpl/telemetry/Telemetry.h index f648289569..0cb217bd29 100644 --- a/include/xrpl/telemetry/Telemetry.h +++ b/include/xrpl/telemetry/Telemetry.h @@ -3,13 +3,15 @@ /** Abstract interface for OpenTelemetry distributed tracing. Provides the Telemetry base class that all components use to create trace - spans. Two concrete implementations exist, selected at construction time + spans. Three concrete implementations exist, selected at construction time by make_Telemetry(): - TelemetryImpl (Telemetry.cpp): real OTel SDK integration, compiled only when XRPL_ENABLE_TELEMETRY is defined and enabled at runtime. - NullTelemetry (NullTelemetry.cpp): no-op stub used when telemetry is disabled at compile time or runtime. + - NullTelemetryOtel (Telemetry.cpp): no-op stub that still depends on + the OTel API (used during transition or for testing). Inheritance / dependency diagram: @@ -37,32 +39,33 @@ 1. Root span at a subsystem entry point (typical usage): @code + #include + using namespace xrpl::telemetry; + // In an RPC handler dispatch: - auto guard = SpanGuard::span(TraceCategory::Rpc, "rpc", commandName); - guard.setAttribute("xrpl.rpc.command", commandName); + auto guard = SpanGuard::span( + TraceCategory::Rpc, rpc_span::prefix::command, commandName); + guard.setAttribute(rpc_span::attr::command, commandName); // ... process request // guard destructor automatically ends the span on scope exit @endcode - 2. Child span for a sub-operation (cross-scope): - @code - auto parent = SpanGuard::span(TraceCategory::Transactions, "tx", "process"); - { - SpanGuard guard(telemetry.startSpan("rpc.command.submit")); - guard.setAttribute("command", "submit"); - // ... guard ends span automatically on scope exit - } - @endcode - - 3. Child span for a sub-operation (cross-scope): + 2. Child span for a sub-operation (scoped child): @code auto parent = SpanGuard::span(TraceCategory::Transactions, "tx", "process"); { auto child = parent.childSpan("tx.apply"); - child.setAttribute("xrpl.tx.type", txType); + child.setAttribute("tx_type", txType); // child ends here } - // parent continues, then ends here + @endcode + + 3. Unrelated span (cross-scope, same thread): + @code + // Transactions and RPC can be active simultaneously + auto txSpan = SpanGuard::span(TraceCategory::Transactions, "tx", "process"); + auto rpcSpan = SpanGuard::span(TraceCategory::Rpc, "rpc", "info"); + // both spans end on scope exit @endcode 4. Cross-thread context propagation: diff --git a/src/xrpld/app/main/GRPCServer.cpp b/src/xrpld/app/main/GRPCServer.cpp index eeb4797b19..92760d641f 100644 --- a/src/xrpld/app/main/GRPCServer.cpp +++ b/src/xrpld/app/main/GRPCServer.cpp @@ -179,6 +179,7 @@ GRPCServerImpl::CallData::process(std::shared_ptr::process(std::shared_ptrerror, ok->success. + span.setAttribute( + rpc_span::attr::rpcStatus, ret ? rpc_span::val::error : rpc_span::val::success); + if (!ret) + span.setOk(); return ret; } catch (std::exception& e) diff --git a/src/xrpld/rpc/detail/ServerHandler.cpp b/src/xrpld/rpc/detail/ServerHandler.cpp index a181ec56cd..4f1918076c 100644 --- a/src/xrpld/rpc/detail/ServerHandler.cpp +++ b/src/xrpld/rpc/detail/ServerHandler.cpp @@ -564,6 +564,7 @@ ServerHandler::processSession( jr[jss::api_version] = jv[jss::api_version]; jr[jss::type] = jss::response; + span.setOk(); return jr; } @@ -598,6 +599,7 @@ ServerHandler::processSession( { session->close(true); } + span.setOk(); } static Json::Value @@ -1036,6 +1038,7 @@ ServerHandler::processRequest( } } + span.setOk(); HTTPReply(httpStatus, response, output, rpcJ); } From f6f0cb1a5fd978cf074971a5456af08d243d0eff Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Wed, 27 May 2026 17:55:09 +0100 Subject: [PATCH 3/6] updated class comment Signed-off-by: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> --- include/xrpl/telemetry/SpanNames.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/include/xrpl/telemetry/SpanNames.h b/include/xrpl/telemetry/SpanNames.h index 9fdc37cd64..fe12e3bdc5 100644 --- a/include/xrpl/telemetry/SpanNames.h +++ b/include/xrpl/telemetry/SpanNames.h @@ -20,8 +20,13 @@ * - Per-span attribute keys: bare field name (span name carries the domain). * - Collision qualifier: _ when bare name collides across * domains or with OTel reserved `status` (e.g. rpc_status, grpc_status). - * - Shared cross-span attributes: _ (underscore) form. - * - Resource attribute keys: xrpl.. (process-identity). + * - Shared cross-span attributes: _ (underscore) form + * (e.g. tx_hash, peer_id, ledger_seq, consensus_round). + * - Resource attribute keys: xrpl.. (dotted) form is + * RESERVED for process-identity attributes set once at startup on the + * OTel resource (e.g. xrpl.network.id, xrpl.network.type). Do not use + * this form for span attributes — it parses awkwardly in TraceQL and + * blurs the resource/span scope distinction. * - Span prefixes: [.]. */ From bfdcd3da87c5b629dd720ed3b1ebb256dad69e0e Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Wed, 27 May 2026 18:26:29 +0100 Subject: [PATCH 4/6] fix(telemetry): use resolved command/method name as span suffix Per PR #6438 review thread r3250432621: known-command errors (rpcTOO_BUSY, rpcNO_PERMISSION, etc.) were collapsing into a single rpc.command.unknown span name, hiding per-command error rates in dashboards. Same anti-pattern existed for gRPC, where every method was bucketed under grpc.request with the method relegated to an attribute. - RPCHandler.cpp: doCommand error path uses cmdName as the span suffix; the rpc_span::val::unknownCommand fallback only applies when the request truly omits both command and method fields. - GRPCServer.cpp: gRPC span name is now grpc. (e.g. grpc.GetLedger). Method also retained as an attribute. - GrpcSpanNames.h: drop the unused op::request constant; update the span-hierarchy comment. - RpcSpanNames.h: update the gRPC span diagram to match. Dashboards on downstream phases will benefit from per-command breakdowns without needing TraceQL attribute filters. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/xrpld/app/main/GRPCServer.cpp | 3 +-- src/xrpld/app/main/GrpcSpanNames.h | 14 ++++++-------- src/xrpld/rpc/detail/RPCHandler.cpp | 7 +++++-- src/xrpld/rpc/detail/RpcSpanNames.h | 4 ++-- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/xrpld/app/main/GRPCServer.cpp b/src/xrpld/app/main/GRPCServer.cpp index 92760d641f..d0019b79f9 100644 --- a/src/xrpld/app/main/GRPCServer.cpp +++ b/src/xrpld/app/main/GRPCServer.cpp @@ -169,8 +169,7 @@ void GRPCServerImpl::CallData::process(std::shared_ptr coro) { using namespace telemetry; - auto span = - SpanGuard::span(TraceCategory::Rpc, grpc_span::prefix::grpc, grpc_span::op::request); + auto span = SpanGuard::span(TraceCategory::Rpc, grpc_span::prefix::grpc, name_); span.setAttribute(grpc_span::attr::method, name_); try diff --git a/src/xrpld/app/main/GrpcSpanNames.h b/src/xrpld/app/main/GrpcSpanNames.h index c2f40ee53d..2532b3812d 100644 --- a/src/xrpld/app/main/GrpcSpanNames.h +++ b/src/xrpld/app/main/GrpcSpanNames.h @@ -9,13 +9,16 @@ * Span hierarchy: * * +-------------------------------------------------------+ - * | grpc.request | + * | grpc. (e.g. grpc.GetLedger) | * | CallData::process(coro) | * | attrs: method, grpc_role, grpc_status | * +-------------------------------------------------------+ * * Unlike the HTTP/WS RPC path, gRPC has a flat single-span structure * per request since each CallData handles exactly one RPC method. + * The method name is embedded in the span name (rather than only as + * an attribute) so dashboards can break out per-method latency and + * error rates without needing TraceQL attribute filters. */ #include @@ -25,16 +28,11 @@ namespace xrpl::telemetry::grpc_span { // ===== Span prefixes ======================================================= namespace prefix { -/// "grpc" — root prefix for gRPC transport spans. +/// "grpc" — root prefix for gRPC transport spans. The full span name is +/// formed at the call site as `grpc.` (see GRPCServer.cpp). inline constexpr auto grpc = makeStr("grpc"); } // namespace prefix -// ===== Span operation suffixes ============================================= - -namespace op { -inline constexpr auto request = makeStr("request"); -} // namespace op - // ===== Attribute keys ====================================================== namespace attr { diff --git a/src/xrpld/rpc/detail/RPCHandler.cpp b/src/xrpld/rpc/detail/RPCHandler.cpp index e8b6e6d26c..1eca97cd52 100644 --- a/src/xrpld/rpc/detail/RPCHandler.cpp +++ b/src/xrpld/rpc/detail/RPCHandler.cpp @@ -229,8 +229,11 @@ doCommand(RPC::JsonContext& context, Json::Value& result) { cmdName = "unknown"; } - auto span = SpanGuard::span( - TraceCategory::Rpc, rpc_span::prefix::command, rpc_span::val::unknownCommand); + // Use the resolved command name as the span suffix so dashboards + // can break out per-command error rates (e.g. rpc.command.submit + // for a submit that hit rpcTOO_BUSY). Falling back to a single + // "unknown" name only when the request truly omits both fields. + auto span = SpanGuard::span(TraceCategory::Rpc, rpc_span::prefix::command, cmdName); span.setAttribute(rpc_span::attr::command, cmdName.c_str()); span.setError(get_error_info(error).token.c_str()); diff --git a/src/xrpld/rpc/detail/RpcSpanNames.h b/src/xrpld/rpc/detail/RpcSpanNames.h index c99c153f11..62eba3de71 100644 --- a/src/xrpld/rpc/detail/RpcSpanNames.h +++ b/src/xrpld/rpc/detail/RpcSpanNames.h @@ -86,9 +86,9 @@ * gRPC path (see GrpcSpanNames.h for constants): * * +-------------------------------------------------------+ - * | grpc.request | + * | grpc. (e.g. grpc.GetLedger) | * | CallData::process(coro) | - * | attrs: method, grpc_status | + * | attrs: method, grpc_status | * +-------------------------------------------------------+ * * Covered paths: From 9498b2865fff9dce482f72b11b5837ad8cfde13c Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Thu, 28 May 2026 11:27:29 +0100 Subject: [PATCH 5/6] fix(telemetry): address PR #6424 review comments - Drop xrpl.node.amendment_blocked / xrpl.node.server_state from telemetry surface (constants in SpanNames.h, two filters in tempo.yaml). Operators read the same data via server_info / server_state RPC; OTel SDK 1.18.0 cannot refresh resource attrs at runtime so resource-level emission was not viable either. - Namespace all pathfind span attributes under pathfind_* (underscore form per Phase 1c rule 5). Renames in PathFindSpanNames.h and call sites in PathRequest.cpp, PathRequestManager.cpp, plus the rule-5 retention xrpl.pathfind.ledger_index -> pathfind_ledger_index. - Wire pathfind_source_account / pathfind_dest_account on pathfind.request in doPathFind / doRipplePathFind handlers (only when present + string). - Collapse per-asset pathfind.discover / pathfind.rank spans into one pathfind.discover hoisted around the per-source-asset loop in PathRequest::findPaths. Span count goes from 2N to 1 per RPC call; per-asset breakdown traded for bounded storage and cardinality. Trade-off documented inline. - Fix pathfind_num_paths semantics: now sums getBestPaths().size() across the loop (paths actually returned) instead of the maxPaths input cap. - PathRequestManager::updateAll: move span creation after the locked requests_ snapshot, early-return when no active subscriptions exist (avoids empty span on every ledger close), set pathfind_num_requests = requests.size(). - Update Phase2_taskList.md and 02-design-decisions.md to match. Co-Authored-By: Claude Opus 4.7 (1M context) --- OpenTelemetryPlan/02-design-decisions.md | 28 +++--- OpenTelemetryPlan/Phase2_taskList.md | 57 +++--------- .../provisioning/datasources/tempo.yaml | 11 --- include/xrpl/telemetry/SpanNames.h | 9 -- src/xrpld/rpc/detail/PathFindSpanNames.h | 91 ++++++++++--------- src/xrpld/rpc/detail/PathRequest.cpp | 18 ++++ src/xrpld/rpc/detail/PathRequestManager.cpp | 17 +++- src/xrpld/rpc/detail/Pathfinder.cpp | 12 --- src/xrpld/rpc/handlers/orderbook/PathFind.cpp | 7 +- .../rpc/handlers/orderbook/RipplePathFind.cpp | 7 +- 10 files changed, 116 insertions(+), 141 deletions(-) diff --git a/OpenTelemetryPlan/02-design-decisions.md b/OpenTelemetryPlan/02-design-decisions.md index 6523a53d09..7ced4aaf2b 100644 --- a/OpenTelemetryPlan/02-design-decisions.md +++ b/OpenTelemetryPlan/02-design-decisions.md @@ -346,20 +346,20 @@ resource::SemanticConventions::SERVICE_INSTANCE_ID = The following table summarizes what data is collected by category: -| Category | Attributes Collected | Purpose | -| --------------- | ---------------------------------------------------------------------- | ---------------------------- | -| **Transaction** | `tx.hash`, `tx.type`, `tx.result`, `tx.fee`, `ledger_index` | Trace transaction lifecycle | -| **Consensus** | `round`, `phase`, `mode`, `proposers` (public keys), `duration_ms` | Analyze consensus timing | -| **RPC** | `command`, `version`, `status`, `duration_ms` | Monitor RPC performance | -| **Peer** | `peer.id` (public key), `latency_ms`, `message.type`, `message.size` | Network topology analysis | -| **Ledger** | `ledger.hash`, `ledger.index`, `close_time`, `tx_count` | Ledger progression tracking | -| **Job** | `job.type`, `queue_ms`, `worker` | JobQueue performance | -| **PathFinding** | `pathfind.source_currency`, `dest_currency`, `path_count`, `cache_hit` | Payment path analysis | -| **TxQ** | `txq.queue_depth`, `fee_level`, `eviction_reason` | Queue depth and fee tracking | -| **Fee** | `fee.load_factor`, `escalation_level` | Fee escalation monitoring | -| **Validator** | `validator.list_size`, `list_age_sec` | UNL health monitoring | -| **Amendment** | `amendment.name`, `status` | Protocol upgrade tracking | -| **SHAMap** | `shamap.type`, `missing_nodes`, `duration_ms` | State tree sync performance | +| Category | Attributes Collected | Purpose | +| --------------- | ---------------------------------------------------------------------------------------------------------------- | ---------------------------- | +| **Transaction** | `tx.hash`, `tx.type`, `tx.result`, `tx.fee`, `ledger_index` | Trace transaction lifecycle | +| **Consensus** | `round`, `phase`, `mode`, `proposers` (public keys), `duration_ms` | Analyze consensus timing | +| **RPC** | `command`, `version`, `status`, `duration_ms` | Monitor RPC performance | +| **Peer** | `peer.id` (public key), `latency_ms`, `message.type`, `message.size` | Network topology analysis | +| **Ledger** | `ledger.hash`, `ledger.index`, `close_time`, `tx_count` | Ledger progression tracking | +| **Job** | `job.type`, `queue_ms`, `worker` | JobQueue performance | +| **PathFinding** | `pathfind_fast`, `pathfind_search_level`, `pathfind_num_paths`, `pathfind_ledger_index`, `pathfind_num_requests` | Payment path analysis | +| **TxQ** | `txq.queue_depth`, `fee_level`, `eviction_reason` | Queue depth and fee tracking | +| **Fee** | `fee.load_factor`, `escalation_level` | Fee escalation monitoring | +| **Validator** | `validator.list_size`, `list_age_sec` | UNL health monitoring | +| **Amendment** | `amendment.name`, `status` | Protocol upgrade tracking | +| **SHAMap** | `shamap.type`, `missing_nodes`, `duration_ms` | State tree sync performance | ### 2.4.4 Privacy & Sensitive Data Policy diff --git a/OpenTelemetryPlan/Phase2_taskList.md b/OpenTelemetryPlan/Phase2_taskList.md index 6979f78869..6b64210e6b 100644 --- a/OpenTelemetryPlan/Phase2_taskList.md +++ b/OpenTelemetryPlan/Phase2_taskList.md @@ -1,8 +1,8 @@ # Phase 2: RPC Tracing Completion Task List -> **Goal**: Complete RPC tracing coverage with unit tests, Grafana search filters, node health attributes, and config hardening. Build on the Phase 1c SpanGuard factory foundation to achieve production-quality RPC observability. +> **Goal**: Complete RPC tracing coverage with unit tests, Grafana search filters, PathFind instrumentation, and config hardening. Build on the Phase 1c SpanGuard factory foundation to achieve production-quality RPC observability. > -> **Scope**: Unit tests for core telemetry, Grafana Tempo search filters, node health span attributes, config validation (`std::clamp`). +> **Scope**: Unit tests for core telemetry, Grafana Tempo search filters, PathFind RPC tracing, config validation (`std::clamp`). > > **Branch**: `pratik/otel-phase2-rpc-tracing` (from `pratik/otel-phase1c-rpc-integration`) @@ -121,42 +121,9 @@ These can be added later if dashboard queries specifically need them. The node h ## Task 2.8: RPC Span Attribute Enrichment — Node Health Context -> **Source**: [External Dashboard Parity](../docs/superpowers/specs/2026-03-30-external-dashboard-parity-design.md) — adds node-level health context inspired by the community [xrpl-validator-dashboard](https://github.com/realgrapedrop/xrpl-validator-dashboard). -> -> **Downstream**: Phase 7 (MetricsRegistry uses these attributes for alerting context), Phase 10 (validation checks for these attributes). +**Status**: DROPPED. -**Objective**: Add node-level health state to every `rpc.command.*` span so operators can correlate RPC behavior with node state in Tempo. - -**What to do**: - -- Edit `src/xrpld/rpc/detail/RPCHandler.cpp`: - - In the `rpc.command.*` span creation block (after existing `setAttribute` calls for `command`, `version`, etc.): - - Node health attrs (`xrpl.node.amendment_blocked`, `xrpl.node.server_state`) are now resource-level attrs, not per-span. They are set at Tracer init. - -**New span attributes**: - -| Attribute | Type | Source | Example | -| ----------------------------- | ------ | ------------------------------------------- | -------- | -| `xrpl.node.amendment_blocked` | bool | `context.app.getOPs().isAmendmentBlocked()` | `true` | -| `xrpl.node.server_state` | string | `context.app.getOPs().strOperatingMode()` | `"full"` | - -**Rationale**: When a node is amendment-blocked or in a degraded state, every RPC response is suspect. Tagging spans with this state enables Tempo TraceQL queries like: - -``` -{name=~"rpc.command.*"} | xrpl.node.amendment_blocked = true -``` - -This surfaces all RPCs served during a blocked period — critical for post-incident analysis. - -**Key modified files**: - -- `src/xrpld/rpc/detail/RPCHandler.cpp` - -**Exit Criteria**: - -- [ ] `rpc.command.server_info` spans carry `xrpl.node.amendment_blocked` and `xrpl.node.server_state` attributes -- [ ] No measurable latency impact (attribute values are cached atomics, not computed per-call) -- [ ] Attributes appear in Tempo trace detail view +Node health (`amendment_blocked`, `server_state`) is not part of the telemetry surface. Operators consume the same data via the existing `server_info` / `server_state` RPC commands, so duplicating it on traces adds storage and cardinality cost without new value. The OTel C++ SDK 1.18.0 also does not support runtime updates to the resource, ruling out resource-level emission of these dynamic-by-nature flags. --- @@ -169,10 +136,11 @@ This surfaces all RPCs served during a blocked period — critical for post-inci **Spans added**: - `pathfind.request` — wraps `doPathFind()` and `doRipplePathFind()` RPC handlers -- `pathfind.compute` — wraps `PathRequest::doUpdate()` (fast/normal attr) -- `pathfind.update_all` — wraps `PathRequestManager::updateAll()` on ledger close (ledger_index attr) -- `pathfind.discover` — wraps `Pathfinder::findPaths()` graph exploration (search_level attr) -- `pathfind.rank` — wraps `Pathfinder::computePathRanks()` liquidity validation (num_paths attr) +- `pathfind.compute` — wraps `PathRequest::doUpdate()` (`pathfind_fast` attr) +- `pathfind.update_all` — wraps `PathRequestManager::updateAll()` on ledger close (`pathfind_ledger_index`, `pathfind_num_requests` attrs; emitted only when active subscriptions exist) +- `pathfind.discover` — wraps the entire per-source-asset loop in `PathRequest::findPaths()` (`pathfind_search_level`, `pathfind_num_paths` attrs). One span per RPC call instead of N (one per source asset). Trade-off: per-asset breakdown is lost; storage and cardinality bounded. + +**Attribute namespacing**: All pathfind attributes use the `pathfind_*` underscore form per the Phase 1c naming-spec rule 5. **New file**: `src/xrpld/rpc/detail/PathFindSpanNames.h` @@ -197,9 +165,10 @@ This surfaces all RPCs served during a blocked period — critical for post-inci | 2.5 | Enhanced RPC span attributes (HTTP-level) | Deferred | Low value; span duration covers timing natively | | 2.6 | Build verification and performance baseline | Complete | Verified in CI on Phase 1c | | 2.7 | Grafana Tempo search filters | Complete | rpc-command, rpc-status, rpc-role filters | -| 2.8 | RPC span attribute enrichment (node health) | Complete | amendment_blocked + server_state | -| 2.9 | PathFind RPC instrumentation (5 spans) | Complete | request, compute, update_all, discover, rank | +| 2.8 | RPC span attribute enrichment (node health) | Dropped | Available via `server_info`/`server_state` RPC | +| 2.9 | PathFind RPC instrumentation | Complete | request, compute, update_all, discover | -**Delivered in this branch**: Tasks 2.4, 2.7, 2.8, 2.9. +**Delivered in this branch**: Tasks 2.4, 2.7, 2.9. **Deferred with rationale**: Tasks 2.1 (→Phase 3), 2.5 (low priority). +**Dropped**: Task 2.8 (node health not duplicated on traces). **Superseded**: Task 2.2 (Phase 1c SpanGuard factory covers this). diff --git a/docker/telemetry/grafana/provisioning/datasources/tempo.yaml b/docker/telemetry/grafana/provisioning/datasources/tempo.yaml index 7f4bd3684f..1af993961a 100644 --- a/docker/telemetry/grafana/provisioning/datasources/tempo.yaml +++ b/docker/telemetry/grafana/provisioning/datasources/tempo.yaml @@ -106,14 +106,3 @@ datasources: operator: "=" scope: span type: dynamic - # Phase 2: Node health filters (Task 2.8) — resource attributes - - id: node-amendment-blocked - tag: xrpl.node.amendment_blocked - operator: "=" - scope: resource - type: static - - id: node-server-state - tag: xrpl.node.server_state - operator: "=" - scope: resource - type: dynamic diff --git a/include/xrpl/telemetry/SpanNames.h b/include/xrpl/telemetry/SpanNames.h index 8ea05fe2e0..fe12e3bdc5 100644 --- a/include/xrpl/telemetry/SpanNames.h +++ b/include/xrpl/telemetry/SpanNames.h @@ -108,15 +108,6 @@ namespace attr { inline constexpr auto networkId = join(join(seg::xrpl, seg::network), makeStr("id")); inline constexpr auto networkType = join(join(seg::xrpl, seg::network), makeStr("type")); inline constexpr auto linkType = makeStr("link_type"); - -/// Node health attributes — RESOURCE-ONLY (process identity, not per-span). -/// Set at Tracer init via resource::Resource::Create and refreshed on state -/// transitions. Do NOT use with span.setAttribute(). -inline constexpr auto xrplNode = join(seg::xrpl, makeStr("node")); -/// "xrpl.node.amendment_blocked" — resource attribute key. -inline constexpr auto nodeAmendmentBlocked = join(xrplNode, makeStr("amendment_blocked")); -/// "xrpl.node.server_state" — resource attribute key. -inline constexpr auto nodeServerState = join(xrplNode, makeStr("server_state")); } // namespace attr // ===== Shared attribute values ============================================= diff --git a/src/xrpld/rpc/detail/PathFindSpanNames.h b/src/xrpld/rpc/detail/PathFindSpanNames.h index af61340b6f..17a1c12858 100644 --- a/src/xrpld/rpc/detail/PathFindSpanNames.h +++ b/src/xrpld/rpc/detail/PathFindSpanNames.h @@ -9,34 +9,36 @@ * * RPC entry (one-shot or subscription): * - * +-------------------------------------------------------+ - * | pathfind.request | - * | doPathFind() / doRipplePathFind() | - * | attrs: source_account, dest_account | - * | | - * | +--------------------------------------------------+ | - * | | pathfind.compute | | - * | | PathRequest::doUpdate() | | - * | | attrs: fast, search_level | | - * | | | | - * | | +---------------------+ +--------------------+ | | - * | | | pathfind.discover | | pathfind.rank | | | - * | | | Pathfinder::find() | | computePathRanks() | | | - * | | +---------------------+ +--------------------+ | | - * | +--------------------------------------------------+ | - * +-------------------------------------------------------+ + * +----------------------------------------------------------------+ + * | pathfind.request | + * | doPathFind() / doRipplePathFind() | + * | attrs: pathfind_source_account, pathfind_dest_account | + * | (set when present in request params) | + * | | + * | +-----------------------------------------------------------+ | + * | | pathfind.compute | | + * | | PathRequest::doUpdate() | | + * | | attrs: pathfind_fast | | + * | | | | + * | | +-----------------------------------------------------+ | | + * | | | pathfind.discover (one per RPC call, hoisted above | | + * | | | the per-source-asset loop in PathRequest::findPaths)| | + * | | | attrs: pathfind_search_level, pathfind_num_paths | | + * | | +-----------------------------------------------------+ | | + * | +-----------------------------------------------------------+ | + * +----------------------------------------------------------------+ * * Async recomputation (ledger close): * - * +-------------------------------------------------------+ - * | pathfind.update_all | - * | PathRequestManager::updateAll() | - * | attrs: ledger_index, num_requests | - * | | - * | +--------------------------------------------------+ | - * | | pathfind.compute (per active request) | | - * | +--------------------------------------------------+ | - * +-------------------------------------------------------+ + * +----------------------------------------------------------------+ + * | pathfind.update_all | + * | PathRequestManager::updateAll() | + * | attrs: pathfind_ledger_index, pathfind_num_requests | + * | | + * | +-----------------------------------------------------------+ | + * | | pathfind.compute (per active request) | | + * | +-----------------------------------------------------------+ | + * +----------------------------------------------------------------+ */ #include @@ -57,30 +59,31 @@ inline constexpr auto request = makeStr("request"); inline constexpr auto compute = makeStr("compute"); inline constexpr auto updateAll = makeStr("update_all"); inline constexpr auto discover = makeStr("discover"); -inline constexpr auto rank = makeStr("rank"); } // namespace op // ===== Attribute keys ====================================================== +// +// All pathfind attributes are namespaced under `pathfind_*` (underscore form, +// per Phase 1c naming spec rule 5). Avoids collisions with bare keys like +// `fast` or `num_paths` that other subsystems may introduce. namespace attr { -/// "source_account" — originating account for path search. -inline constexpr auto sourceAccount = makeStr("source_account"); -/// "dest_account" — destination account. -inline constexpr auto destAccount = makeStr("dest_account"); -/// "fast" — whether fast pathfinding mode enabled. -inline constexpr auto fast = makeStr("fast"); -/// "search_level" — depth of graph exploration. -inline constexpr auto searchLevel = makeStr("search_level"); -/// "num_complete_paths" — complete paths found. -inline constexpr auto numCompletePaths = makeStr("num_complete_paths"); -/// "num_paths" — total paths returned. -inline constexpr auto numPaths = makeStr("num_paths"); -/// "num_requests" — active path requests. -inline constexpr auto numRequests = makeStr("num_requests"); -/// "xrpl.pathfind.ledger_index" — kept qualified (rule 5): pathfind target -/// ledger is distinct from xrpl.ledger.seq. -inline constexpr auto ledgerIndex = - join(join(seg::xrpl, makeStr("pathfind")), makeStr("ledger_index")); +/// "pathfind_source_account" — originating account for path search. +inline constexpr auto sourceAccount = makeStr("pathfind_source_account"); +/// "pathfind_dest_account" — destination account. +inline constexpr auto destAccount = makeStr("pathfind_dest_account"); +/// "pathfind_fast" — whether fast pathfinding mode enabled. +inline constexpr auto fast = makeStr("pathfind_fast"); +/// "pathfind_search_level" — depth of graph exploration. +inline constexpr auto searchLevel = makeStr("pathfind_search_level"); +/// "pathfind_num_paths" — total paths produced across the per-source-asset +/// loop in PathRequest::findPaths (sum of getBestPaths().size() per asset). +inline constexpr auto numPaths = makeStr("pathfind_num_paths"); +/// "pathfind_num_requests" — snapshot size of requests_ at update_all start +/// (may include weak_ptrs that subsequently expire during processing). +inline constexpr auto numRequests = makeStr("pathfind_num_requests"); +/// "pathfind_ledger_index" — pathfind target ledger index. +inline constexpr auto ledgerIndex = makeStr("pathfind_ledger_index"); } // namespace attr } // namespace xrpl::telemetry::pathfind_span diff --git a/src/xrpld/rpc/detail/PathRequest.cpp b/src/xrpld/rpc/detail/PathRequest.cpp index 8148879362..7b54872cbc 100644 --- a/src/xrpld/rpc/detail/PathRequest.cpp +++ b/src/xrpld/rpc/detail/PathRequest.cpp @@ -40,6 +40,7 @@ #include #include +#include #include #include #include @@ -579,6 +580,20 @@ PathRequest::findPaths( auto const dst_amount = convertAmount(saDstAmount, convert_all_); hash_map> pathasset_map; + + // One `pathfind.discover` span wraps the entire per-source-asset loop so + // that a single RPC call produces one discover span instead of N (one per + // candidate source asset). Trade-off: per-asset discovery/ranking timing + // is no longer split into individual spans — span count and Tempo storage + // are bounded per RPC at the cost of per-asset visibility. If per-asset + // breakdown is needed in the future, add child spans inside the loop body + // (`Pathfinder::findPaths`/`computePathRanks`) parented off this span. + using namespace telemetry; + auto span = SpanGuard::span( + TraceCategory::Rpc, pathfind_span::prefix::pathfind, pathfind_span::op::discover); + span.setAttribute(pathfind_span::attr::searchLevel, static_cast(level)); + + std::int64_t totalPaths = 0; for (auto const& asset : sourceAssets) { if (continueCallback && !continueCallback()) @@ -598,6 +613,7 @@ PathRequest::findPaths( auto ps = pathfinder->getBestPaths( max_paths_, fullLiquidityPath, mContext[asset], asset.getIssuer(), continueCallback); mContext[asset] = ps; + totalPaths += static_cast(ps.size()); auto const& sourceAccount = [&] { if (!isXRP(asset.getIssuer())) @@ -697,6 +713,8 @@ PathRequest::findPaths( } } + span.setAttribute(pathfind_span::attr::numPaths, totalPaths); + /* The resource fee is based on the number of source currencies used. The minimum cost is 50 and the maximum is 400. The cost increases after four source currencies, 50 - (4 * 4) = 34. diff --git a/src/xrpld/rpc/detail/PathRequestManager.cpp b/src/xrpld/rpc/detail/PathRequestManager.cpp index 16e5def7c3..9577e500de 100644 --- a/src/xrpld/rpc/detail/PathRequestManager.cpp +++ b/src/xrpld/rpc/detail/PathRequestManager.cpp @@ -61,11 +61,6 @@ PathRequestManager::getAssetCache(std::shared_ptr const& ledger, void PathRequestManager::updateAll(std::shared_ptr const& inLedger) { - using namespace telemetry; - auto span = SpanGuard::span( - TraceCategory::Rpc, pathfind_span::prefix::pathfind, pathfind_span::op::updateAll); - span.setAttribute(pathfind_span::attr::ledgerIndex, static_cast(inLedger->seq())); - auto event = app_.getJobQueue().makeLoadEvent(jtPATH_FIND, "PathRequest::updateAll"); std::vector requests; @@ -78,6 +73,18 @@ PathRequestManager::updateAll(std::shared_ptr const& inLedger) cache = getAssetCache(inLedger, true); } + // updateAll runs on every ledger close; skip span emission entirely when + // there are no active path subscriptions to avoid a steady stream of empty + // spans at mainnet close cadence. + if (requests.empty()) + return; + + using namespace telemetry; + auto span = SpanGuard::span( + TraceCategory::Rpc, pathfind_span::prefix::pathfind, pathfind_span::op::updateAll); + span.setAttribute(pathfind_span::attr::ledgerIndex, static_cast(inLedger->seq())); + span.setAttribute(pathfind_span::attr::numRequests, static_cast(requests.size())); + bool newRequests = app_.getLedgerMaster().isNewPathRequest(); bool mustBreak = false; diff --git a/src/xrpld/rpc/detail/Pathfinder.cpp b/src/xrpld/rpc/detail/Pathfinder.cpp index 1f1034739e..a31d4522a7 100644 --- a/src/xrpld/rpc/detail/Pathfinder.cpp +++ b/src/xrpld/rpc/detail/Pathfinder.cpp @@ -2,7 +2,6 @@ #include #include -#include #include #include #include @@ -30,7 +29,6 @@ #include #include #include -#include #include #include @@ -229,11 +227,6 @@ Pathfinder::Pathfinder( bool Pathfinder::findPaths(int searchLevel, std::function const& continueCallback) { - using namespace telemetry; - auto span = SpanGuard::span( - TraceCategory::Rpc, pathfind_span::prefix::pathfind, pathfind_span::op::discover); - span.setAttribute(pathfind_span::attr::searchLevel, static_cast(searchLevel)); - JLOG(j_.trace()) << "findPaths start"; if (mDstAmount == beast::zero) { @@ -444,11 +437,6 @@ Pathfinder::getPathLiquidity( void Pathfinder::computePathRanks(int maxPaths, std::function const& continueCallback) { - using namespace telemetry; - auto span = SpanGuard::span( - TraceCategory::Rpc, pathfind_span::prefix::pathfind, pathfind_span::op::rank); - span.setAttribute(pathfind_span::attr::numPaths, static_cast(maxPaths)); - mRemainingAmount = convertAmount(mDstAmount, convert_all_); // Must subtract liquidity in default path from remaining amount. diff --git a/src/xrpld/rpc/handlers/orderbook/PathFind.cpp b/src/xrpld/rpc/handlers/orderbook/PathFind.cpp index 9aa84ac225..b40c74eedc 100644 --- a/src/xrpld/rpc/handlers/orderbook/PathFind.cpp +++ b/src/xrpld/rpc/handlers/orderbook/PathFind.cpp @@ -18,8 +18,13 @@ Json::Value doPathFind(RPC::JsonContext& context) { using namespace telemetry; - [[maybe_unused]] auto span = SpanGuard::span( + auto span = SpanGuard::span( TraceCategory::Rpc, pathfind_span::prefix::pathfind, pathfind_span::op::request); + if (auto const& src = context.params[jss::source_account]; src.isString()) + span.setAttribute(pathfind_span::attr::sourceAccount, src.asString()); + if (auto const& dst = context.params[jss::destination_account]; dst.isString()) + span.setAttribute(pathfind_span::attr::destAccount, dst.asString()); + if (context.app.config().PATH_SEARCH_MAX == 0) return rpcError(rpcNOT_SUPPORTED); diff --git a/src/xrpld/rpc/handlers/orderbook/RipplePathFind.cpp b/src/xrpld/rpc/handlers/orderbook/RipplePathFind.cpp index ff4ed3eb39..0bc751fcb1 100644 --- a/src/xrpld/rpc/handlers/orderbook/RipplePathFind.cpp +++ b/src/xrpld/rpc/handlers/orderbook/RipplePathFind.cpp @@ -26,8 +26,13 @@ Json::Value doRipplePathFind(RPC::JsonContext& context) { using namespace telemetry; - [[maybe_unused]] auto span = SpanGuard::span( + auto span = SpanGuard::span( TraceCategory::Rpc, pathfind_span::prefix::pathfind, pathfind_span::op::request); + if (auto const& src = context.params[jss::source_account]; src.isString()) + span.setAttribute(pathfind_span::attr::sourceAccount, src.asString()); + if (auto const& dst = context.params[jss::destination_account]; dst.isString()) + span.setAttribute(pathfind_span::attr::destAccount, dst.asString()); + if (context.app.config().PATH_SEARCH_MAX == 0) return rpcError(rpcNOT_SUPPORTED); From 4c4c6f5de230c3047ce236f2de7f060b05b457d5 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Thu, 28 May 2026 11:35:41 +0100 Subject: [PATCH 6/6] build fixes Signed-off-by: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> --- src/xrpld/app/main/GRPCServer.cpp | 3 ++- src/xrpld/rpc/detail/RPCHandler.cpp | 4 +++- src/xrpld/rpc/detail/ServerHandler.cpp | 4 ++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/xrpld/app/main/GRPCServer.cpp b/src/xrpld/app/main/GRPCServer.cpp index d0019b79f9..fb92a55dde 100644 --- a/src/xrpld/app/main/GRPCServer.cpp +++ b/src/xrpld/app/main/GRPCServer.cpp @@ -192,7 +192,8 @@ GRPCServerImpl::CallData::process(std::shared_ptrerror, ok->success. span.setAttribute( - rpc_span::attr::rpcStatus, ret ? rpc_span::val::error : rpc_span::val::success); + rpc_span::attr::rpcStatus, + ret ? std::string_view(rpc_span::val::error) + : std::string_view(rpc_span::val::success)); if (!ret) span.setOk(); return ret; diff --git a/src/xrpld/rpc/detail/ServerHandler.cpp b/src/xrpld/rpc/detail/ServerHandler.cpp index 4f1918076c..dca713b1a7 100644 --- a/src/xrpld/rpc/detail/ServerHandler.cpp +++ b/src/xrpld/rpc/detail/ServerHandler.cpp @@ -355,7 +355,7 @@ ServerHandler::onWSMessage( Json::Value jvResult(Json::objectValue); jvResult[jss::type] = jss::error; jvResult[jss::error] = "jsonInvalid"; - jvResult[jss::value] = buffers_to_string(buffers); + jvResult[jss::value] = ::xrpl::buffers_to_string(buffers); boost::beast::multi_buffer sb; Json::stream(jvResult, [&sb](auto const p, auto const n) { sb.commit(boost::asio::buffer_copy(sb.prepare(n), boost::asio::buffer(p, n))); @@ -579,7 +579,7 @@ ServerHandler::processSession( processRequest( session->port(), - buffers_to_string(session->request().body().data()), + ::xrpl::buffers_to_string(session->request().body().data()), session->remoteAddress().at_port(0), makeOutput(*session), coro,