From a104140a517c6c43617a4ab3bc30a0e6d3e24f92 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Wed, 27 May 2026 16:46:35 +0100 Subject: [PATCH 1/9] addressing code review comments Signed-off-by: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> --- docs/build/telemetry.md | 3 +- include/xrpl/protocol/detail/features.macro | 1 - include/xrpl/telemetry/Telemetry.h | 33 +++++++++++---------- src/libxrpl/telemetry/SpanGuard.cpp | 3 +- 4 files changed, 21 insertions(+), 19 deletions(-) diff --git a/docs/build/telemetry.md b/docs/build/telemetry.md index e8d7fb2dd5..eff1a6848b 100644 --- a/docs/build/telemetry.md +++ b/docs/build/telemetry.md @@ -36,7 +36,8 @@ such as Grafana Tempo. Telemetry is **off by default** at both compile time and runtime: - **Compile time**: The Conan option `telemetry` and CMake option `telemetry` must be set to `True`/`ON`. - When disabled, all tracing macros compile to `((void)0)` with zero overhead. + When disabled, all `SpanGuard` calls compile to inline no-ops (defined in `SpanGuard.h`) + with zero overhead — no OTel SDK dependency required. - **Runtime**: The `[telemetry]` config section must set `enabled=1`. When disabled at runtime, a no-op implementation is used. diff --git a/include/xrpl/protocol/detail/features.macro b/include/xrpl/protocol/detail/features.macro index bad43dd6ed..494b3fa6cd 100644 --- a/include/xrpl/protocol/detail/features.macro +++ b/include/xrpl/protocol/detail/features.macro @@ -15,7 +15,6 @@ // Add new amendments to the top of this list. // Keep it sorted in reverse chronological order. -XRPL_FIX (Cleanup3_2_0, Supported::no, VoteBehavior::DefaultNo) XRPL_FEATURE(MPTokensV2, Supported::no, VoteBehavior::DefaultNo) XRPL_FIX (Security3_1_3, Supported::no, VoteBehavior::DefaultNo) XRPL_FIX (PermissionedDomainInvariant, Supported::yes, VoteBehavior::DefaultNo) diff --git a/include/xrpl/telemetry/Telemetry.h b/include/xrpl/telemetry/Telemetry.h index 1d69e01a43..ae9303a1db 100644 --- a/include/xrpl/telemetry/Telemetry.h +++ b/include/xrpl/telemetry/Telemetry.h @@ -35,32 +35,33 @@ Usage examples: - 1. Check before tracing (typical guard pattern): + 1. Root span at a subsystem entry point (typical usage): @code - auto& telemetry = registry.getTelemetry(); - if (telemetry.isEnabled() && telemetry.shouldTraceRpc()) - { - auto span = telemetry.startSpan("rpc.command.server_info"); - // ... do work, span ends when shared_ptr refcount drops to 0 - } + // In an RPC handler dispatch: + auto guard = SpanGuard::span(TraceCategory::Rpc, "rpc", commandName); + guard.setAttribute("xrpl.rpc.command", commandName); + // ... process request + // guard destructor automatically ends the span on scope exit @endcode - 2. RAII tracing with SpanGuard (preferred): + 2. Child span for a sub-operation (cross-scope): @code - if (telemetry.isEnabled() && telemetry.shouldTraceRpc()) + auto parent = SpanGuard::span(TraceCategory::Transactions, "tx", "process"); { - SpanGuard guard(telemetry.startSpan("rpc.command.submit")); - guard.setAttribute("xrpl.rpc.command", "submit"); - // ... guard ends span automatically on scope exit + auto child = parent.childSpan("tx.apply"); + child.setAttribute("xrpl.tx.type", txType); + // child ends here } + // parent continues, then ends here @endcode 3. Cross-thread context propagation: @code - // On thread A: capture context - auto ctx = guard.context(); - // On thread B: create child span with explicit parent - auto child = telemetry.startSpan("async.work", ctx); + // Thread A: capture the active context while span is in scope + auto ctx = parentGuard.captureContext(); + + // Thread B: create child span with explicit parent + auto child = SpanGuard::childSpan("async.work", ctx); @endcode @note Thread safety: The Telemetry interface is safe for concurrent reads diff --git a/src/libxrpl/telemetry/SpanGuard.cpp b/src/libxrpl/telemetry/SpanGuard.cpp index 11eda56ed6..90bb139b97 100644 --- a/src/libxrpl/telemetry/SpanGuard.cpp +++ b/src/libxrpl/telemetry/SpanGuard.cpp @@ -35,6 +35,7 @@ #include #include +#include #include namespace xrpl { @@ -327,7 +328,7 @@ SpanGuard::recordException(std::exception const& e) return; impl_->span->AddEvent( "exception", - {{"exception.type", "std::exception"}, {"exception.message", std::string(e.what())}}); + {{"exception.type", typeid(e).name()}, {"exception.message", std::string(e.what())}}); impl_->span->SetStatus(otel_trace::StatusCode::kError, e.what()); } From 6aa8570d6ce52cb6d15cbe346fc6a7e1a0d585e1 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Wed, 27 May 2026 17:36:06 +0100 Subject: [PATCH 2/9] addressed code review comments. Signed-off-by: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> --- include/xrpl/telemetry/SpanNames.h | 1 + include/xrpl/telemetry/Telemetry.h | 35 ++++++++++++++------------ src/xrpld/app/main/GRPCServer.cpp | 8 ++++++ src/xrpld/app/main/GrpcSpanNames.h | 2 ++ src/xrpld/rpc/detail/RPCHandler.cpp | 7 +++++- src/xrpld/rpc/detail/ServerHandler.cpp | 3 +++ 6 files changed, 39 insertions(+), 17 deletions(-) diff --git a/include/xrpl/telemetry/SpanNames.h b/include/xrpl/telemetry/SpanNames.h index 3c8b6144f9..9fdc37cd64 100644 --- a/include/xrpl/telemetry/SpanNames.h +++ b/include/xrpl/telemetry/SpanNames.h @@ -20,6 +20,7 @@ * - Per-span attribute keys: bare field name (span name carries the domain). * - Collision qualifier: _ when bare name collides across * domains or with OTel reserved `status` (e.g. rpc_status, grpc_status). + * - Shared cross-span attributes: _ (underscore) form. * - Resource attribute keys: xrpl.. (process-identity). * - Span prefixes: [.]. */ diff --git a/include/xrpl/telemetry/Telemetry.h b/include/xrpl/telemetry/Telemetry.h index f648289569..0cb217bd29 100644 --- a/include/xrpl/telemetry/Telemetry.h +++ b/include/xrpl/telemetry/Telemetry.h @@ -3,13 +3,15 @@ /** Abstract interface for OpenTelemetry distributed tracing. Provides the Telemetry base class that all components use to create trace - spans. Two concrete implementations exist, selected at construction time + spans. Three concrete implementations exist, selected at construction time by make_Telemetry(): - TelemetryImpl (Telemetry.cpp): real OTel SDK integration, compiled only when XRPL_ENABLE_TELEMETRY is defined and enabled at runtime. - NullTelemetry (NullTelemetry.cpp): no-op stub used when telemetry is disabled at compile time or runtime. + - NullTelemetryOtel (Telemetry.cpp): no-op stub that still depends on + the OTel API (used during transition or for testing). Inheritance / dependency diagram: @@ -37,32 +39,33 @@ 1. Root span at a subsystem entry point (typical usage): @code + #include + using namespace xrpl::telemetry; + // In an RPC handler dispatch: - auto guard = SpanGuard::span(TraceCategory::Rpc, "rpc", commandName); - guard.setAttribute("xrpl.rpc.command", commandName); + auto guard = SpanGuard::span( + TraceCategory::Rpc, rpc_span::prefix::command, commandName); + guard.setAttribute(rpc_span::attr::command, commandName); // ... process request // guard destructor automatically ends the span on scope exit @endcode - 2. Child span for a sub-operation (cross-scope): - @code - auto parent = SpanGuard::span(TraceCategory::Transactions, "tx", "process"); - { - SpanGuard guard(telemetry.startSpan("rpc.command.submit")); - guard.setAttribute("command", "submit"); - // ... guard ends span automatically on scope exit - } - @endcode - - 3. Child span for a sub-operation (cross-scope): + 2. Child span for a sub-operation (scoped child): @code auto parent = SpanGuard::span(TraceCategory::Transactions, "tx", "process"); { auto child = parent.childSpan("tx.apply"); - child.setAttribute("xrpl.tx.type", txType); + child.setAttribute("tx_type", txType); // child ends here } - // parent continues, then ends here + @endcode + + 3. Unrelated span (cross-scope, same thread): + @code + // Transactions and RPC can be active simultaneously + auto txSpan = SpanGuard::span(TraceCategory::Transactions, "tx", "process"); + auto rpcSpan = SpanGuard::span(TraceCategory::Rpc, "rpc", "info"); + // both spans end on scope exit @endcode 4. Cross-thread context propagation: diff --git a/src/xrpld/app/main/GRPCServer.cpp b/src/xrpld/app/main/GRPCServer.cpp index eeb4797b19..92760d641f 100644 --- a/src/xrpld/app/main/GRPCServer.cpp +++ b/src/xrpld/app/main/GRPCServer.cpp @@ -179,6 +179,7 @@ GRPCServerImpl::CallData::process(std::shared_ptr::process(std::shared_ptrerror, ok->success. + span.setAttribute( + rpc_span::attr::rpcStatus, ret ? rpc_span::val::error : rpc_span::val::success); + if (!ret) + span.setOk(); return ret; } catch (std::exception& e) diff --git a/src/xrpld/rpc/detail/ServerHandler.cpp b/src/xrpld/rpc/detail/ServerHandler.cpp index a181ec56cd..4f1918076c 100644 --- a/src/xrpld/rpc/detail/ServerHandler.cpp +++ b/src/xrpld/rpc/detail/ServerHandler.cpp @@ -564,6 +564,7 @@ ServerHandler::processSession( jr[jss::api_version] = jv[jss::api_version]; jr[jss::type] = jss::response; + span.setOk(); return jr; } @@ -598,6 +599,7 @@ ServerHandler::processSession( { session->close(true); } + span.setOk(); } static Json::Value @@ -1036,6 +1038,7 @@ ServerHandler::processRequest( } } + span.setOk(); HTTPReply(httpStatus, response, output, rpcJ); } From f6f0cb1a5fd978cf074971a5456af08d243d0eff Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Wed, 27 May 2026 17:55:09 +0100 Subject: [PATCH 3/9] updated class comment Signed-off-by: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> --- include/xrpl/telemetry/SpanNames.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/include/xrpl/telemetry/SpanNames.h b/include/xrpl/telemetry/SpanNames.h index 9fdc37cd64..fe12e3bdc5 100644 --- a/include/xrpl/telemetry/SpanNames.h +++ b/include/xrpl/telemetry/SpanNames.h @@ -20,8 +20,13 @@ * - Per-span attribute keys: bare field name (span name carries the domain). * - Collision qualifier: _ when bare name collides across * domains or with OTel reserved `status` (e.g. rpc_status, grpc_status). - * - Shared cross-span attributes: _ (underscore) form. - * - Resource attribute keys: xrpl.. (process-identity). + * - Shared cross-span attributes: _ (underscore) form + * (e.g. tx_hash, peer_id, ledger_seq, consensus_round). + * - Resource attribute keys: xrpl.. (dotted) form is + * RESERVED for process-identity attributes set once at startup on the + * OTel resource (e.g. xrpl.network.id, xrpl.network.type). Do not use + * this form for span attributes — it parses awkwardly in TraceQL and + * blurs the resource/span scope distinction. * - Span prefixes: [.]. */ From bfdcd3da87c5b629dd720ed3b1ebb256dad69e0e Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Wed, 27 May 2026 18:26:29 +0100 Subject: [PATCH 4/9] fix(telemetry): use resolved command/method name as span suffix Per PR #6438 review thread r3250432621: known-command errors (rpcTOO_BUSY, rpcNO_PERMISSION, etc.) were collapsing into a single rpc.command.unknown span name, hiding per-command error rates in dashboards. Same anti-pattern existed for gRPC, where every method was bucketed under grpc.request with the method relegated to an attribute. - RPCHandler.cpp: doCommand error path uses cmdName as the span suffix; the rpc_span::val::unknownCommand fallback only applies when the request truly omits both command and method fields. - GRPCServer.cpp: gRPC span name is now grpc. (e.g. grpc.GetLedger). Method also retained as an attribute. - GrpcSpanNames.h: drop the unused op::request constant; update the span-hierarchy comment. - RpcSpanNames.h: update the gRPC span diagram to match. Dashboards on downstream phases will benefit from per-command breakdowns without needing TraceQL attribute filters. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/xrpld/app/main/GRPCServer.cpp | 3 +-- src/xrpld/app/main/GrpcSpanNames.h | 14 ++++++-------- src/xrpld/rpc/detail/RPCHandler.cpp | 7 +++++-- src/xrpld/rpc/detail/RpcSpanNames.h | 4 ++-- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/xrpld/app/main/GRPCServer.cpp b/src/xrpld/app/main/GRPCServer.cpp index 92760d641f..d0019b79f9 100644 --- a/src/xrpld/app/main/GRPCServer.cpp +++ b/src/xrpld/app/main/GRPCServer.cpp @@ -169,8 +169,7 @@ void GRPCServerImpl::CallData::process(std::shared_ptr coro) { using namespace telemetry; - auto span = - SpanGuard::span(TraceCategory::Rpc, grpc_span::prefix::grpc, grpc_span::op::request); + auto span = SpanGuard::span(TraceCategory::Rpc, grpc_span::prefix::grpc, name_); span.setAttribute(grpc_span::attr::method, name_); try diff --git a/src/xrpld/app/main/GrpcSpanNames.h b/src/xrpld/app/main/GrpcSpanNames.h index c2f40ee53d..2532b3812d 100644 --- a/src/xrpld/app/main/GrpcSpanNames.h +++ b/src/xrpld/app/main/GrpcSpanNames.h @@ -9,13 +9,16 @@ * Span hierarchy: * * +-------------------------------------------------------+ - * | grpc.request | + * | grpc. (e.g. grpc.GetLedger) | * | CallData::process(coro) | * | attrs: method, grpc_role, grpc_status | * +-------------------------------------------------------+ * * Unlike the HTTP/WS RPC path, gRPC has a flat single-span structure * per request since each CallData handles exactly one RPC method. + * The method name is embedded in the span name (rather than only as + * an attribute) so dashboards can break out per-method latency and + * error rates without needing TraceQL attribute filters. */ #include @@ -25,16 +28,11 @@ namespace xrpl::telemetry::grpc_span { // ===== Span prefixes ======================================================= namespace prefix { -/// "grpc" — root prefix for gRPC transport spans. +/// "grpc" — root prefix for gRPC transport spans. The full span name is +/// formed at the call site as `grpc.` (see GRPCServer.cpp). inline constexpr auto grpc = makeStr("grpc"); } // namespace prefix -// ===== Span operation suffixes ============================================= - -namespace op { -inline constexpr auto request = makeStr("request"); -} // namespace op - // ===== Attribute keys ====================================================== namespace attr { diff --git a/src/xrpld/rpc/detail/RPCHandler.cpp b/src/xrpld/rpc/detail/RPCHandler.cpp index e8b6e6d26c..1eca97cd52 100644 --- a/src/xrpld/rpc/detail/RPCHandler.cpp +++ b/src/xrpld/rpc/detail/RPCHandler.cpp @@ -229,8 +229,11 @@ doCommand(RPC::JsonContext& context, Json::Value& result) { cmdName = "unknown"; } - auto span = SpanGuard::span( - TraceCategory::Rpc, rpc_span::prefix::command, rpc_span::val::unknownCommand); + // Use the resolved command name as the span suffix so dashboards + // can break out per-command error rates (e.g. rpc.command.submit + // for a submit that hit rpcTOO_BUSY). Falling back to a single + // "unknown" name only when the request truly omits both fields. + auto span = SpanGuard::span(TraceCategory::Rpc, rpc_span::prefix::command, cmdName); span.setAttribute(rpc_span::attr::command, cmdName.c_str()); span.setError(get_error_info(error).token.c_str()); diff --git a/src/xrpld/rpc/detail/RpcSpanNames.h b/src/xrpld/rpc/detail/RpcSpanNames.h index c99c153f11..62eba3de71 100644 --- a/src/xrpld/rpc/detail/RpcSpanNames.h +++ b/src/xrpld/rpc/detail/RpcSpanNames.h @@ -86,9 +86,9 @@ * gRPC path (see GrpcSpanNames.h for constants): * * +-------------------------------------------------------+ - * | grpc.request | + * | grpc. (e.g. grpc.GetLedger) | * | CallData::process(coro) | - * | attrs: method, grpc_status | + * | attrs: method, grpc_status | * +-------------------------------------------------------+ * * Covered paths: From 9498b2865fff9dce482f72b11b5837ad8cfde13c Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Thu, 28 May 2026 11:27:29 +0100 Subject: [PATCH 5/9] fix(telemetry): address PR #6424 review comments - Drop xrpl.node.amendment_blocked / xrpl.node.server_state from telemetry surface (constants in SpanNames.h, two filters in tempo.yaml). Operators read the same data via server_info / server_state RPC; OTel SDK 1.18.0 cannot refresh resource attrs at runtime so resource-level emission was not viable either. - Namespace all pathfind span attributes under pathfind_* (underscore form per Phase 1c rule 5). Renames in PathFindSpanNames.h and call sites in PathRequest.cpp, PathRequestManager.cpp, plus the rule-5 retention xrpl.pathfind.ledger_index -> pathfind_ledger_index. - Wire pathfind_source_account / pathfind_dest_account on pathfind.request in doPathFind / doRipplePathFind handlers (only when present + string). - Collapse per-asset pathfind.discover / pathfind.rank spans into one pathfind.discover hoisted around the per-source-asset loop in PathRequest::findPaths. Span count goes from 2N to 1 per RPC call; per-asset breakdown traded for bounded storage and cardinality. Trade-off documented inline. - Fix pathfind_num_paths semantics: now sums getBestPaths().size() across the loop (paths actually returned) instead of the maxPaths input cap. - PathRequestManager::updateAll: move span creation after the locked requests_ snapshot, early-return when no active subscriptions exist (avoids empty span on every ledger close), set pathfind_num_requests = requests.size(). - Update Phase2_taskList.md and 02-design-decisions.md to match. Co-Authored-By: Claude Opus 4.7 (1M context) --- OpenTelemetryPlan/02-design-decisions.md | 28 +++--- OpenTelemetryPlan/Phase2_taskList.md | 57 +++--------- .../provisioning/datasources/tempo.yaml | 11 --- include/xrpl/telemetry/SpanNames.h | 9 -- src/xrpld/rpc/detail/PathFindSpanNames.h | 91 ++++++++++--------- src/xrpld/rpc/detail/PathRequest.cpp | 18 ++++ src/xrpld/rpc/detail/PathRequestManager.cpp | 17 +++- src/xrpld/rpc/detail/Pathfinder.cpp | 12 --- src/xrpld/rpc/handlers/orderbook/PathFind.cpp | 7 +- .../rpc/handlers/orderbook/RipplePathFind.cpp | 7 +- 10 files changed, 116 insertions(+), 141 deletions(-) diff --git a/OpenTelemetryPlan/02-design-decisions.md b/OpenTelemetryPlan/02-design-decisions.md index 6523a53d09..7ced4aaf2b 100644 --- a/OpenTelemetryPlan/02-design-decisions.md +++ b/OpenTelemetryPlan/02-design-decisions.md @@ -346,20 +346,20 @@ resource::SemanticConventions::SERVICE_INSTANCE_ID = The following table summarizes what data is collected by category: -| Category | Attributes Collected | Purpose | -| --------------- | ---------------------------------------------------------------------- | ---------------------------- | -| **Transaction** | `tx.hash`, `tx.type`, `tx.result`, `tx.fee`, `ledger_index` | Trace transaction lifecycle | -| **Consensus** | `round`, `phase`, `mode`, `proposers` (public keys), `duration_ms` | Analyze consensus timing | -| **RPC** | `command`, `version`, `status`, `duration_ms` | Monitor RPC performance | -| **Peer** | `peer.id` (public key), `latency_ms`, `message.type`, `message.size` | Network topology analysis | -| **Ledger** | `ledger.hash`, `ledger.index`, `close_time`, `tx_count` | Ledger progression tracking | -| **Job** | `job.type`, `queue_ms`, `worker` | JobQueue performance | -| **PathFinding** | `pathfind.source_currency`, `dest_currency`, `path_count`, `cache_hit` | Payment path analysis | -| **TxQ** | `txq.queue_depth`, `fee_level`, `eviction_reason` | Queue depth and fee tracking | -| **Fee** | `fee.load_factor`, `escalation_level` | Fee escalation monitoring | -| **Validator** | `validator.list_size`, `list_age_sec` | UNL health monitoring | -| **Amendment** | `amendment.name`, `status` | Protocol upgrade tracking | -| **SHAMap** | `shamap.type`, `missing_nodes`, `duration_ms` | State tree sync performance | +| Category | Attributes Collected | Purpose | +| --------------- | ---------------------------------------------------------------------------------------------------------------- | ---------------------------- | +| **Transaction** | `tx.hash`, `tx.type`, `tx.result`, `tx.fee`, `ledger_index` | Trace transaction lifecycle | +| **Consensus** | `round`, `phase`, `mode`, `proposers` (public keys), `duration_ms` | Analyze consensus timing | +| **RPC** | `command`, `version`, `status`, `duration_ms` | Monitor RPC performance | +| **Peer** | `peer.id` (public key), `latency_ms`, `message.type`, `message.size` | Network topology analysis | +| **Ledger** | `ledger.hash`, `ledger.index`, `close_time`, `tx_count` | Ledger progression tracking | +| **Job** | `job.type`, `queue_ms`, `worker` | JobQueue performance | +| **PathFinding** | `pathfind_fast`, `pathfind_search_level`, `pathfind_num_paths`, `pathfind_ledger_index`, `pathfind_num_requests` | Payment path analysis | +| **TxQ** | `txq.queue_depth`, `fee_level`, `eviction_reason` | Queue depth and fee tracking | +| **Fee** | `fee.load_factor`, `escalation_level` | Fee escalation monitoring | +| **Validator** | `validator.list_size`, `list_age_sec` | UNL health monitoring | +| **Amendment** | `amendment.name`, `status` | Protocol upgrade tracking | +| **SHAMap** | `shamap.type`, `missing_nodes`, `duration_ms` | State tree sync performance | ### 2.4.4 Privacy & Sensitive Data Policy diff --git a/OpenTelemetryPlan/Phase2_taskList.md b/OpenTelemetryPlan/Phase2_taskList.md index 6979f78869..6b64210e6b 100644 --- a/OpenTelemetryPlan/Phase2_taskList.md +++ b/OpenTelemetryPlan/Phase2_taskList.md @@ -1,8 +1,8 @@ # Phase 2: RPC Tracing Completion Task List -> **Goal**: Complete RPC tracing coverage with unit tests, Grafana search filters, node health attributes, and config hardening. Build on the Phase 1c SpanGuard factory foundation to achieve production-quality RPC observability. +> **Goal**: Complete RPC tracing coverage with unit tests, Grafana search filters, PathFind instrumentation, and config hardening. Build on the Phase 1c SpanGuard factory foundation to achieve production-quality RPC observability. > -> **Scope**: Unit tests for core telemetry, Grafana Tempo search filters, node health span attributes, config validation (`std::clamp`). +> **Scope**: Unit tests for core telemetry, Grafana Tempo search filters, PathFind RPC tracing, config validation (`std::clamp`). > > **Branch**: `pratik/otel-phase2-rpc-tracing` (from `pratik/otel-phase1c-rpc-integration`) @@ -121,42 +121,9 @@ These can be added later if dashboard queries specifically need them. The node h ## Task 2.8: RPC Span Attribute Enrichment — Node Health Context -> **Source**: [External Dashboard Parity](../docs/superpowers/specs/2026-03-30-external-dashboard-parity-design.md) — adds node-level health context inspired by the community [xrpl-validator-dashboard](https://github.com/realgrapedrop/xrpl-validator-dashboard). -> -> **Downstream**: Phase 7 (MetricsRegistry uses these attributes for alerting context), Phase 10 (validation checks for these attributes). +**Status**: DROPPED. -**Objective**: Add node-level health state to every `rpc.command.*` span so operators can correlate RPC behavior with node state in Tempo. - -**What to do**: - -- Edit `src/xrpld/rpc/detail/RPCHandler.cpp`: - - In the `rpc.command.*` span creation block (after existing `setAttribute` calls for `command`, `version`, etc.): - - Node health attrs (`xrpl.node.amendment_blocked`, `xrpl.node.server_state`) are now resource-level attrs, not per-span. They are set at Tracer init. - -**New span attributes**: - -| Attribute | Type | Source | Example | -| ----------------------------- | ------ | ------------------------------------------- | -------- | -| `xrpl.node.amendment_blocked` | bool | `context.app.getOPs().isAmendmentBlocked()` | `true` | -| `xrpl.node.server_state` | string | `context.app.getOPs().strOperatingMode()` | `"full"` | - -**Rationale**: When a node is amendment-blocked or in a degraded state, every RPC response is suspect. Tagging spans with this state enables Tempo TraceQL queries like: - -``` -{name=~"rpc.command.*"} | xrpl.node.amendment_blocked = true -``` - -This surfaces all RPCs served during a blocked period — critical for post-incident analysis. - -**Key modified files**: - -- `src/xrpld/rpc/detail/RPCHandler.cpp` - -**Exit Criteria**: - -- [ ] `rpc.command.server_info` spans carry `xrpl.node.amendment_blocked` and `xrpl.node.server_state` attributes -- [ ] No measurable latency impact (attribute values are cached atomics, not computed per-call) -- [ ] Attributes appear in Tempo trace detail view +Node health (`amendment_blocked`, `server_state`) is not part of the telemetry surface. Operators consume the same data via the existing `server_info` / `server_state` RPC commands, so duplicating it on traces adds storage and cardinality cost without new value. The OTel C++ SDK 1.18.0 also does not support runtime updates to the resource, ruling out resource-level emission of these dynamic-by-nature flags. --- @@ -169,10 +136,11 @@ This surfaces all RPCs served during a blocked period — critical for post-inci **Spans added**: - `pathfind.request` — wraps `doPathFind()` and `doRipplePathFind()` RPC handlers -- `pathfind.compute` — wraps `PathRequest::doUpdate()` (fast/normal attr) -- `pathfind.update_all` — wraps `PathRequestManager::updateAll()` on ledger close (ledger_index attr) -- `pathfind.discover` — wraps `Pathfinder::findPaths()` graph exploration (search_level attr) -- `pathfind.rank` — wraps `Pathfinder::computePathRanks()` liquidity validation (num_paths attr) +- `pathfind.compute` — wraps `PathRequest::doUpdate()` (`pathfind_fast` attr) +- `pathfind.update_all` — wraps `PathRequestManager::updateAll()` on ledger close (`pathfind_ledger_index`, `pathfind_num_requests` attrs; emitted only when active subscriptions exist) +- `pathfind.discover` — wraps the entire per-source-asset loop in `PathRequest::findPaths()` (`pathfind_search_level`, `pathfind_num_paths` attrs). One span per RPC call instead of N (one per source asset). Trade-off: per-asset breakdown is lost; storage and cardinality bounded. + +**Attribute namespacing**: All pathfind attributes use the `pathfind_*` underscore form per the Phase 1c naming-spec rule 5. **New file**: `src/xrpld/rpc/detail/PathFindSpanNames.h` @@ -197,9 +165,10 @@ This surfaces all RPCs served during a blocked period — critical for post-inci | 2.5 | Enhanced RPC span attributes (HTTP-level) | Deferred | Low value; span duration covers timing natively | | 2.6 | Build verification and performance baseline | Complete | Verified in CI on Phase 1c | | 2.7 | Grafana Tempo search filters | Complete | rpc-command, rpc-status, rpc-role filters | -| 2.8 | RPC span attribute enrichment (node health) | Complete | amendment_blocked + server_state | -| 2.9 | PathFind RPC instrumentation (5 spans) | Complete | request, compute, update_all, discover, rank | +| 2.8 | RPC span attribute enrichment (node health) | Dropped | Available via `server_info`/`server_state` RPC | +| 2.9 | PathFind RPC instrumentation | Complete | request, compute, update_all, discover | -**Delivered in this branch**: Tasks 2.4, 2.7, 2.8, 2.9. +**Delivered in this branch**: Tasks 2.4, 2.7, 2.9. **Deferred with rationale**: Tasks 2.1 (→Phase 3), 2.5 (low priority). +**Dropped**: Task 2.8 (node health not duplicated on traces). **Superseded**: Task 2.2 (Phase 1c SpanGuard factory covers this). diff --git a/docker/telemetry/grafana/provisioning/datasources/tempo.yaml b/docker/telemetry/grafana/provisioning/datasources/tempo.yaml index 7f4bd3684f..1af993961a 100644 --- a/docker/telemetry/grafana/provisioning/datasources/tempo.yaml +++ b/docker/telemetry/grafana/provisioning/datasources/tempo.yaml @@ -106,14 +106,3 @@ datasources: operator: "=" scope: span type: dynamic - # Phase 2: Node health filters (Task 2.8) — resource attributes - - id: node-amendment-blocked - tag: xrpl.node.amendment_blocked - operator: "=" - scope: resource - type: static - - id: node-server-state - tag: xrpl.node.server_state - operator: "=" - scope: resource - type: dynamic diff --git a/include/xrpl/telemetry/SpanNames.h b/include/xrpl/telemetry/SpanNames.h index 8ea05fe2e0..fe12e3bdc5 100644 --- a/include/xrpl/telemetry/SpanNames.h +++ b/include/xrpl/telemetry/SpanNames.h @@ -108,15 +108,6 @@ namespace attr { inline constexpr auto networkId = join(join(seg::xrpl, seg::network), makeStr("id")); inline constexpr auto networkType = join(join(seg::xrpl, seg::network), makeStr("type")); inline constexpr auto linkType = makeStr("link_type"); - -/// Node health attributes — RESOURCE-ONLY (process identity, not per-span). -/// Set at Tracer init via resource::Resource::Create and refreshed on state -/// transitions. Do NOT use with span.setAttribute(). -inline constexpr auto xrplNode = join(seg::xrpl, makeStr("node")); -/// "xrpl.node.amendment_blocked" — resource attribute key. -inline constexpr auto nodeAmendmentBlocked = join(xrplNode, makeStr("amendment_blocked")); -/// "xrpl.node.server_state" — resource attribute key. -inline constexpr auto nodeServerState = join(xrplNode, makeStr("server_state")); } // namespace attr // ===== Shared attribute values ============================================= diff --git a/src/xrpld/rpc/detail/PathFindSpanNames.h b/src/xrpld/rpc/detail/PathFindSpanNames.h index af61340b6f..17a1c12858 100644 --- a/src/xrpld/rpc/detail/PathFindSpanNames.h +++ b/src/xrpld/rpc/detail/PathFindSpanNames.h @@ -9,34 +9,36 @@ * * RPC entry (one-shot or subscription): * - * +-------------------------------------------------------+ - * | pathfind.request | - * | doPathFind() / doRipplePathFind() | - * | attrs: source_account, dest_account | - * | | - * | +--------------------------------------------------+ | - * | | pathfind.compute | | - * | | PathRequest::doUpdate() | | - * | | attrs: fast, search_level | | - * | | | | - * | | +---------------------+ +--------------------+ | | - * | | | pathfind.discover | | pathfind.rank | | | - * | | | Pathfinder::find() | | computePathRanks() | | | - * | | +---------------------+ +--------------------+ | | - * | +--------------------------------------------------+ | - * +-------------------------------------------------------+ + * +----------------------------------------------------------------+ + * | pathfind.request | + * | doPathFind() / doRipplePathFind() | + * | attrs: pathfind_source_account, pathfind_dest_account | + * | (set when present in request params) | + * | | + * | +-----------------------------------------------------------+ | + * | | pathfind.compute | | + * | | PathRequest::doUpdate() | | + * | | attrs: pathfind_fast | | + * | | | | + * | | +-----------------------------------------------------+ | | + * | | | pathfind.discover (one per RPC call, hoisted above | | + * | | | the per-source-asset loop in PathRequest::findPaths)| | + * | | | attrs: pathfind_search_level, pathfind_num_paths | | + * | | +-----------------------------------------------------+ | | + * | +-----------------------------------------------------------+ | + * +----------------------------------------------------------------+ * * Async recomputation (ledger close): * - * +-------------------------------------------------------+ - * | pathfind.update_all | - * | PathRequestManager::updateAll() | - * | attrs: ledger_index, num_requests | - * | | - * | +--------------------------------------------------+ | - * | | pathfind.compute (per active request) | | - * | +--------------------------------------------------+ | - * +-------------------------------------------------------+ + * +----------------------------------------------------------------+ + * | pathfind.update_all | + * | PathRequestManager::updateAll() | + * | attrs: pathfind_ledger_index, pathfind_num_requests | + * | | + * | +-----------------------------------------------------------+ | + * | | pathfind.compute (per active request) | | + * | +-----------------------------------------------------------+ | + * +----------------------------------------------------------------+ */ #include @@ -57,30 +59,31 @@ inline constexpr auto request = makeStr("request"); inline constexpr auto compute = makeStr("compute"); inline constexpr auto updateAll = makeStr("update_all"); inline constexpr auto discover = makeStr("discover"); -inline constexpr auto rank = makeStr("rank"); } // namespace op // ===== Attribute keys ====================================================== +// +// All pathfind attributes are namespaced under `pathfind_*` (underscore form, +// per Phase 1c naming spec rule 5). Avoids collisions with bare keys like +// `fast` or `num_paths` that other subsystems may introduce. namespace attr { -/// "source_account" — originating account for path search. -inline constexpr auto sourceAccount = makeStr("source_account"); -/// "dest_account" — destination account. -inline constexpr auto destAccount = makeStr("dest_account"); -/// "fast" — whether fast pathfinding mode enabled. -inline constexpr auto fast = makeStr("fast"); -/// "search_level" — depth of graph exploration. -inline constexpr auto searchLevel = makeStr("search_level"); -/// "num_complete_paths" — complete paths found. -inline constexpr auto numCompletePaths = makeStr("num_complete_paths"); -/// "num_paths" — total paths returned. -inline constexpr auto numPaths = makeStr("num_paths"); -/// "num_requests" — active path requests. -inline constexpr auto numRequests = makeStr("num_requests"); -/// "xrpl.pathfind.ledger_index" — kept qualified (rule 5): pathfind target -/// ledger is distinct from xrpl.ledger.seq. -inline constexpr auto ledgerIndex = - join(join(seg::xrpl, makeStr("pathfind")), makeStr("ledger_index")); +/// "pathfind_source_account" — originating account for path search. +inline constexpr auto sourceAccount = makeStr("pathfind_source_account"); +/// "pathfind_dest_account" — destination account. +inline constexpr auto destAccount = makeStr("pathfind_dest_account"); +/// "pathfind_fast" — whether fast pathfinding mode enabled. +inline constexpr auto fast = makeStr("pathfind_fast"); +/// "pathfind_search_level" — depth of graph exploration. +inline constexpr auto searchLevel = makeStr("pathfind_search_level"); +/// "pathfind_num_paths" — total paths produced across the per-source-asset +/// loop in PathRequest::findPaths (sum of getBestPaths().size() per asset). +inline constexpr auto numPaths = makeStr("pathfind_num_paths"); +/// "pathfind_num_requests" — snapshot size of requests_ at update_all start +/// (may include weak_ptrs that subsequently expire during processing). +inline constexpr auto numRequests = makeStr("pathfind_num_requests"); +/// "pathfind_ledger_index" — pathfind target ledger index. +inline constexpr auto ledgerIndex = makeStr("pathfind_ledger_index"); } // namespace attr } // namespace xrpl::telemetry::pathfind_span diff --git a/src/xrpld/rpc/detail/PathRequest.cpp b/src/xrpld/rpc/detail/PathRequest.cpp index 8148879362..7b54872cbc 100644 --- a/src/xrpld/rpc/detail/PathRequest.cpp +++ b/src/xrpld/rpc/detail/PathRequest.cpp @@ -40,6 +40,7 @@ #include #include +#include #include #include #include @@ -579,6 +580,20 @@ PathRequest::findPaths( auto const dst_amount = convertAmount(saDstAmount, convert_all_); hash_map> pathasset_map; + + // One `pathfind.discover` span wraps the entire per-source-asset loop so + // that a single RPC call produces one discover span instead of N (one per + // candidate source asset). Trade-off: per-asset discovery/ranking timing + // is no longer split into individual spans — span count and Tempo storage + // are bounded per RPC at the cost of per-asset visibility. If per-asset + // breakdown is needed in the future, add child spans inside the loop body + // (`Pathfinder::findPaths`/`computePathRanks`) parented off this span. + using namespace telemetry; + auto span = SpanGuard::span( + TraceCategory::Rpc, pathfind_span::prefix::pathfind, pathfind_span::op::discover); + span.setAttribute(pathfind_span::attr::searchLevel, static_cast(level)); + + std::int64_t totalPaths = 0; for (auto const& asset : sourceAssets) { if (continueCallback && !continueCallback()) @@ -598,6 +613,7 @@ PathRequest::findPaths( auto ps = pathfinder->getBestPaths( max_paths_, fullLiquidityPath, mContext[asset], asset.getIssuer(), continueCallback); mContext[asset] = ps; + totalPaths += static_cast(ps.size()); auto const& sourceAccount = [&] { if (!isXRP(asset.getIssuer())) @@ -697,6 +713,8 @@ PathRequest::findPaths( } } + span.setAttribute(pathfind_span::attr::numPaths, totalPaths); + /* The resource fee is based on the number of source currencies used. The minimum cost is 50 and the maximum is 400. The cost increases after four source currencies, 50 - (4 * 4) = 34. diff --git a/src/xrpld/rpc/detail/PathRequestManager.cpp b/src/xrpld/rpc/detail/PathRequestManager.cpp index 16e5def7c3..9577e500de 100644 --- a/src/xrpld/rpc/detail/PathRequestManager.cpp +++ b/src/xrpld/rpc/detail/PathRequestManager.cpp @@ -61,11 +61,6 @@ PathRequestManager::getAssetCache(std::shared_ptr const& ledger, void PathRequestManager::updateAll(std::shared_ptr const& inLedger) { - using namespace telemetry; - auto span = SpanGuard::span( - TraceCategory::Rpc, pathfind_span::prefix::pathfind, pathfind_span::op::updateAll); - span.setAttribute(pathfind_span::attr::ledgerIndex, static_cast(inLedger->seq())); - auto event = app_.getJobQueue().makeLoadEvent(jtPATH_FIND, "PathRequest::updateAll"); std::vector requests; @@ -78,6 +73,18 @@ PathRequestManager::updateAll(std::shared_ptr const& inLedger) cache = getAssetCache(inLedger, true); } + // updateAll runs on every ledger close; skip span emission entirely when + // there are no active path subscriptions to avoid a steady stream of empty + // spans at mainnet close cadence. + if (requests.empty()) + return; + + using namespace telemetry; + auto span = SpanGuard::span( + TraceCategory::Rpc, pathfind_span::prefix::pathfind, pathfind_span::op::updateAll); + span.setAttribute(pathfind_span::attr::ledgerIndex, static_cast(inLedger->seq())); + span.setAttribute(pathfind_span::attr::numRequests, static_cast(requests.size())); + bool newRequests = app_.getLedgerMaster().isNewPathRequest(); bool mustBreak = false; diff --git a/src/xrpld/rpc/detail/Pathfinder.cpp b/src/xrpld/rpc/detail/Pathfinder.cpp index 1f1034739e..a31d4522a7 100644 --- a/src/xrpld/rpc/detail/Pathfinder.cpp +++ b/src/xrpld/rpc/detail/Pathfinder.cpp @@ -2,7 +2,6 @@ #include #include -#include #include #include #include @@ -30,7 +29,6 @@ #include #include #include -#include #include #include @@ -229,11 +227,6 @@ Pathfinder::Pathfinder( bool Pathfinder::findPaths(int searchLevel, std::function const& continueCallback) { - using namespace telemetry; - auto span = SpanGuard::span( - TraceCategory::Rpc, pathfind_span::prefix::pathfind, pathfind_span::op::discover); - span.setAttribute(pathfind_span::attr::searchLevel, static_cast(searchLevel)); - JLOG(j_.trace()) << "findPaths start"; if (mDstAmount == beast::zero) { @@ -444,11 +437,6 @@ Pathfinder::getPathLiquidity( void Pathfinder::computePathRanks(int maxPaths, std::function const& continueCallback) { - using namespace telemetry; - auto span = SpanGuard::span( - TraceCategory::Rpc, pathfind_span::prefix::pathfind, pathfind_span::op::rank); - span.setAttribute(pathfind_span::attr::numPaths, static_cast(maxPaths)); - mRemainingAmount = convertAmount(mDstAmount, convert_all_); // Must subtract liquidity in default path from remaining amount. diff --git a/src/xrpld/rpc/handlers/orderbook/PathFind.cpp b/src/xrpld/rpc/handlers/orderbook/PathFind.cpp index 9aa84ac225..b40c74eedc 100644 --- a/src/xrpld/rpc/handlers/orderbook/PathFind.cpp +++ b/src/xrpld/rpc/handlers/orderbook/PathFind.cpp @@ -18,8 +18,13 @@ Json::Value doPathFind(RPC::JsonContext& context) { using namespace telemetry; - [[maybe_unused]] auto span = SpanGuard::span( + auto span = SpanGuard::span( TraceCategory::Rpc, pathfind_span::prefix::pathfind, pathfind_span::op::request); + if (auto const& src = context.params[jss::source_account]; src.isString()) + span.setAttribute(pathfind_span::attr::sourceAccount, src.asString()); + if (auto const& dst = context.params[jss::destination_account]; dst.isString()) + span.setAttribute(pathfind_span::attr::destAccount, dst.asString()); + if (context.app.config().PATH_SEARCH_MAX == 0) return rpcError(rpcNOT_SUPPORTED); diff --git a/src/xrpld/rpc/handlers/orderbook/RipplePathFind.cpp b/src/xrpld/rpc/handlers/orderbook/RipplePathFind.cpp index ff4ed3eb39..0bc751fcb1 100644 --- a/src/xrpld/rpc/handlers/orderbook/RipplePathFind.cpp +++ b/src/xrpld/rpc/handlers/orderbook/RipplePathFind.cpp @@ -26,8 +26,13 @@ Json::Value doRipplePathFind(RPC::JsonContext& context) { using namespace telemetry; - [[maybe_unused]] auto span = SpanGuard::span( + auto span = SpanGuard::span( TraceCategory::Rpc, pathfind_span::prefix::pathfind, pathfind_span::op::request); + if (auto const& src = context.params[jss::source_account]; src.isString()) + span.setAttribute(pathfind_span::attr::sourceAccount, src.asString()); + if (auto const& dst = context.params[jss::destination_account]; dst.isString()) + span.setAttribute(pathfind_span::attr::destAccount, dst.asString()); + if (context.app.config().PATH_SEARCH_MAX == 0) return rpcError(rpcNOT_SUPPORTED); From 4c4c6f5de230c3047ce236f2de7f060b05b457d5 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Thu, 28 May 2026 11:35:41 +0100 Subject: [PATCH 6/9] build fixes Signed-off-by: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> --- src/xrpld/app/main/GRPCServer.cpp | 3 ++- src/xrpld/rpc/detail/RPCHandler.cpp | 4 +++- src/xrpld/rpc/detail/ServerHandler.cpp | 4 ++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/xrpld/app/main/GRPCServer.cpp b/src/xrpld/app/main/GRPCServer.cpp index d0019b79f9..fb92a55dde 100644 --- a/src/xrpld/app/main/GRPCServer.cpp +++ b/src/xrpld/app/main/GRPCServer.cpp @@ -192,7 +192,8 @@ GRPCServerImpl::CallData::process(std::shared_ptrerror, ok->success. span.setAttribute( - rpc_span::attr::rpcStatus, ret ? rpc_span::val::error : rpc_span::val::success); + rpc_span::attr::rpcStatus, + ret ? std::string_view(rpc_span::val::error) + : std::string_view(rpc_span::val::success)); if (!ret) span.setOk(); return ret; diff --git a/src/xrpld/rpc/detail/ServerHandler.cpp b/src/xrpld/rpc/detail/ServerHandler.cpp index 4f1918076c..dca713b1a7 100644 --- a/src/xrpld/rpc/detail/ServerHandler.cpp +++ b/src/xrpld/rpc/detail/ServerHandler.cpp @@ -355,7 +355,7 @@ ServerHandler::onWSMessage( Json::Value jvResult(Json::objectValue); jvResult[jss::type] = jss::error; jvResult[jss::error] = "jsonInvalid"; - jvResult[jss::value] = buffers_to_string(buffers); + jvResult[jss::value] = ::xrpl::buffers_to_string(buffers); boost::beast::multi_buffer sb; Json::stream(jvResult, [&sb](auto const p, auto const n) { sb.commit(boost::asio::buffer_copy(sb.prepare(n), boost::asio::buffer(p, n))); @@ -579,7 +579,7 @@ ServerHandler::processSession( processRequest( session->port(), - buffers_to_string(session->request().body().data()), + ::xrpl::buffers_to_string(session->request().body().data()), session->remoteAddress().at_port(0), makeOutput(*session), coro, From 43258e8dc08c11ff1b8755b3edc986abbc9b9086 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Thu, 28 May 2026 12:33:16 +0100 Subject: [PATCH 7/9] docs(telemetry): add secure-OTel pipeline analysis and link into plan Document the threat model and chosen hardening approach for the OTel pipeline: mTLS to the collector as primary defense (across-network deployment), NetworkPolicy as defense-in-depth, and source-side validation plus per-peer rate limiting for protocol::TraceContext on peer messages. Skips Basic Auth (wrong shape for multi-operator fleet) and HTTP-gateway header stripping (rippled is P2P). Wires the new doc into the master plan ToC, mermaid diagram, and body section, plus cross-refs from the privacy section in 02-design-decisions.md and the collector config in 05-configuration-reference.md so readers reach it from natural in-context entry points. Adds a backlink at the top of secure-OTel.md to the master plan. Adds 'exfiltration' and 'htpasswd' to cspell dictionary. Co-Authored-By: Claude Opus 4.7 (1M context) --- OpenTelemetryPlan/02-design-decisions.md | 2 + .../05-configuration-reference.md | 2 + OpenTelemetryPlan/08-appendix.md | 27 +- OpenTelemetryPlan/OpenTelemetryPlan.md | 12 + OpenTelemetryPlan/secure-OTel.md | 243 ++++++++++++++++++ cspell.config.yaml | 2 + 6 files changed, 275 insertions(+), 13 deletions(-) create mode 100644 OpenTelemetryPlan/secure-OTel.md diff --git a/OpenTelemetryPlan/02-design-decisions.md b/OpenTelemetryPlan/02-design-decisions.md index 3b5d99b4fe..fe2da83acc 100644 --- a/OpenTelemetryPlan/02-design-decisions.md +++ b/OpenTelemetryPlan/02-design-decisions.md @@ -433,6 +433,8 @@ redact_peer_address=1 # Remove peer IP addresses > **Key Principle**: Telemetry collects **operational metadata** (timing, counts, hashes) — never **sensitive content** (keys, balances, amounts, raw payloads). +> **See also**: [Securing the OTel Pipeline](./secure-OTel.md) covers transport-level protection for telemetry leaving the node — mTLS to the collector and validation of incoming peer trace context. Privacy controls in this section keep sensitive data out of spans; the security doc keeps the spans themselves out of untrusted hands. + --- ## 2.5 Context Propagation Design diff --git a/OpenTelemetryPlan/05-configuration-reference.md b/OpenTelemetryPlan/05-configuration-reference.md index 0c7ec5d6f4..9ba0a73ad1 100644 --- a/OpenTelemetryPlan/05-configuration-reference.md +++ b/OpenTelemetryPlan/05-configuration-reference.md @@ -405,6 +405,8 @@ endif() > **OTLP** = OpenTelemetry Protocol | **APM** = Application Performance Monitoring +> **Production hardening**: The configurations in this section are starting points. For production deployments where rippled ships telemetry across a network to a centrally-hosted collector, see [Securing the OTel Pipeline](./secure-OTel.md) for the required mTLS receiver config, NetworkPolicy, and peer trace-context validation. + ### 5.5.1 Development Configuration ```yaml diff --git a/OpenTelemetryPlan/08-appendix.md b/OpenTelemetryPlan/08-appendix.md index 49d0534dd0..41e3cbb47d 100644 --- a/OpenTelemetryPlan/08-appendix.md +++ b/OpenTelemetryPlan/08-appendix.md @@ -170,19 +170,20 @@ flowchart TB ### Plan Documents -| Document | Description | -| ---------------------------------------------------------------- | -------------------------------------------- | -| [OpenTelemetryPlan.md](./OpenTelemetryPlan.md) | Master overview and executive summary | -| [00-tracing-fundamentals.md](./00-tracing-fundamentals.md) | Distributed tracing concepts and OTel primer | -| [01-architecture-analysis.md](./01-architecture-analysis.md) | xrpld architecture and trace points | -| [02-design-decisions.md](./02-design-decisions.md) | SDK selection, exporters, span conventions | -| [03-implementation-strategy.md](./03-implementation-strategy.md) | Directory structure, performance analysis | -| [04-code-samples.md](./04-code-samples.md) | C++ code examples for all components | -| [05-configuration-reference.md](./05-configuration-reference.md) | xrpld config, CMake, Collector configs | -| [06-implementation-phases.md](./06-implementation-phases.md) | Timeline, tasks, risks, success metrics | -| [07-observability-backends.md](./07-observability-backends.md) | Backend selection and architecture | -| [08-appendix.md](./08-appendix.md) | Glossary, references, version history | -| [presentation.md](./presentation.md) | Slide deck for OTel plan overview | +| Document | Description | +| ---------------------------------------------------------------- | -------------------------------------------------- | +| [OpenTelemetryPlan.md](./OpenTelemetryPlan.md) | Master overview and executive summary | +| [00-tracing-fundamentals.md](./00-tracing-fundamentals.md) | Distributed tracing concepts and OTel primer | +| [01-architecture-analysis.md](./01-architecture-analysis.md) | xrpld architecture and trace points | +| [02-design-decisions.md](./02-design-decisions.md) | SDK selection, exporters, span conventions | +| [03-implementation-strategy.md](./03-implementation-strategy.md) | Directory structure, performance analysis | +| [04-code-samples.md](./04-code-samples.md) | C++ code examples for all components | +| [05-configuration-reference.md](./05-configuration-reference.md) | xrpld config, CMake, Collector configs | +| [06-implementation-phases.md](./06-implementation-phases.md) | Timeline, tasks, risks, success metrics | +| [07-observability-backends.md](./07-observability-backends.md) | Backend selection and architecture | +| [08-appendix.md](./08-appendix.md) | Glossary, references, version history | +| [secure-OTel.md](./secure-OTel.md) | Threat model and hardening (mTLS, peer validation) | +| [presentation.md](./presentation.md) | Slide deck for OTel plan overview | ### Task Lists diff --git a/OpenTelemetryPlan/OpenTelemetryPlan.md b/OpenTelemetryPlan/OpenTelemetryPlan.md index 1161b99015..154fca941f 100644 --- a/OpenTelemetryPlan/OpenTelemetryPlan.md +++ b/OpenTelemetryPlan/OpenTelemetryPlan.md @@ -54,6 +54,7 @@ flowchart TB phases["06-implementation-phases.md"] backends["07-observability-backends.md"] appendix["08-appendix.md"] + secure["secure-OTel.md"] poc["POC_taskList.md"] end @@ -70,6 +71,7 @@ flowchart TB config --> phases phases --> backends backends --> appendix + backends --> secure phases --> poc style overview fill:#1b5e20,stroke:#0d3d14,color:#fff,stroke-width:2px @@ -86,6 +88,7 @@ flowchart TB style phases fill:#4a148c,stroke:#2e0d57,color:#fff style backends fill:#4a148c,stroke:#2e0d57,color:#fff style appendix fill:#4a148c,stroke:#2e0d57,color:#fff + style secure fill:#4a148c,stroke:#2e0d57,color:#fff style poc fill:#4a148c,stroke:#2e0d57,color:#fff ``` @@ -106,6 +109,7 @@ flowchart TB | **6** | [Implementation Phases](./06-implementation-phases.md) | 5-phase timeline, tasks, risks, success metrics | | **7** | [Observability Backends](./07-observability-backends.md) | Backend selection guide and production architecture | | **8** | [Appendix](./08-appendix.md) | Glossary, references, version history | +| **Sec** | [Securing the OTel Pipeline](./secure-OTel.md) | Threat model and hardening (mTLS, peer trace-context validation) | | **POC** | [POC Task List](./POC_taskList.md) | Proof of concept tasks for RPC tracing end-to-end demo | --- @@ -220,6 +224,14 @@ The appendix contains a glossary of OpenTelemetry and xrpld-specific terms, refe --- +## Securing the OTel Pipeline + +Threat model and hardening guidance for production deployments where rippled nodes ship telemetry to a centrally-hosted collector across an untrusted network. Covers the two attack surfaces (collector ingress and peer trace-context spoofing) and the chosen defenses: mTLS as primary collector auth, NetworkPolicy as defense-in-depth, and source-side validation plus per-peer rate limiting for the `protocol::TraceContext` field on peer messages. + +➡️ **[View Securing the OTel Pipeline](./secure-OTel.md)** + +--- + ## POC Task List A step-by-step task list for building a minimal end-to-end proof of concept that demonstrates distributed tracing in xrpld. The POC scope is limited to RPC tracing — showing request traces flowing from xrpld through an OpenTelemetry Collector into Tempo, viewable in Grafana. diff --git a/OpenTelemetryPlan/secure-OTel.md b/OpenTelemetryPlan/secure-OTel.md new file mode 100644 index 0000000000..81bf30b182 --- /dev/null +++ b/OpenTelemetryPlan/secure-OTel.md @@ -0,0 +1,243 @@ +# Securing OpenTelemetry Against Trace Context Spoofing + +> **Part of**: [OpenTelemetry Implementation Plan](./OpenTelemetryPlan.md) — see also [Design Decisions § Privacy](./02-design-decisions.md#244-privacy--sensitive-data-policy) (what we don't collect) and [Configuration Reference § 5.5](./05-configuration-reference.md#55-opentelemetry-collector-configuration) (collector base config). + +Trace context spoofing (or poisoning) occurs when untrusted actors inject tampered or stale trace IDs into your system. If these requests are processed, the spans are appended to historical trace buckets, stretching trace durations, ruining p99 latency metrics, and breaking Grafana dashboards. + +This guide outlines two categories of defense: mitigating tampered contexts and locking down the OpenTelemetry (OTel) Collector to trusted clients only. + +--- + +## Part 1: Mitigating Tampered Trace Contexts + +### 1. Perimeter Defense: Strip Headers at the API Gateway + +The most effective way to prevent spoofing from external sources is to treat your API Gateway (Envoy, NGINX, AWS ALB) as a hard boundary. Strip incoming W3C tracing headers (`traceparent`, `tracestate`) from public traffic so the gateway is forced to generate a fresh, legitimate `trace_id`. + +**NGINX Example (Stripping Headers):** + +Code output + +File generated successfully. + +```nginx +server { + listen 80; + + location { + # Clear out untrusted incoming trace headers + proxy_set_header traceparent ""; + proxy_set_header tracestate ""; + + proxy_pass http://backend_service; + } +} +``` + +### **2. Timestamp-Anchored Trace IDs and OTTL Filtering** + +If you use a custom trace ID generator that embeds a timestamp in the first few bytes (like AWS X-Ray or UUIDv7), you can use the OTel Collector's OpenTelemetry Transform Language (OTTL) to detect anomalies. +**Collector Configuration (Conceptual OTTL Filter):** + +```yaml +processors: + filter/stale_traces: + error_mode: ignore + traces: + span: + # Example: Drop spans where the start time is significantly different + # from an expected parameter or embedded timestamp logic. + # Note: Standard W3C trace IDs do not contain timestamps by default. + - 'Keep out-of-bounds spans: time.sub(start_time, now()) > duration("1h")' +``` + +## **Part 2: Restricting Access to the OTel Collector** + +Locking down the Collector ensures that only authenticated, trusted clients can submit telemetry data. + +### **Approach A: Network Layer Security (Kubernetes Network Policies)** + +Ensure your Collector is not exposed to the public internet. If running in Kubernetes, use a NetworkPolicy to restrict ingress traffic to specific namespaces. +**Kubernetes NetworkPolicy Example:** + +```yaml +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: allow-internal-otel + namespace: observability +spec: + podSelector: + matchLabels: + app: opentelemetry-collector + policyTypes: + - Ingress + ingress: + - from: + - namespaceSelector: + matchLabels: + environment: production + ports: + - protocol: TCP + port: 4317 # gRPC + - protocol: TCP + port: 4318 # HTTP +``` + +### **Approach B: Transport Layer Security (Mutual TLS / mTLS)** + +Require clients to present a valid cryptographic certificate to connect to the Collector. +**Collector Configuration (mTLS):** + +```yaml +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + tls: + client_ca_file: /certs/client_ca.pem # CA that signs trusted client certs + cert_file: /certs/collector.pem + key_file: /certs/collector.key + auth_type: require_and_verify_client_cert # Rejects unauthorized clients +``` + +### **Approach C: Application Layer Authentication (Basic Auth Extension)** + +Use the Collector's extension system to require an API key or Basic Auth credentials. +**Collector Configuration (Basic Auth):** + +```yaml +extensions: + basicauth/collector: + htpasswd: + inline: | + # username:trusted-client, password:SecurePassword123 + trusted-client:$apr1$4v8p76o6$DMTX5Wv6uOmrFAZp2X1N1. + +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + auth: + authenticator: basicauth/collector + +processors: + batch: + +exporters: + otlp: + endpoint: my-backend-storage:4317 + +service: + extensions: [basicauth/collector] + pipelines: + traces: + receivers: [otlp] + processors: [batch] + exporters: [otlp] +``` + +**Client Setup (Environment Variables):** +Developers must pass the authentication header using the standard OTel SDK environment variables: + +```bash +# Base64 encoded "trusted-client:SecurePassword123" +export OTEL_EXPORTER_OTLP_HEADERS="Authorization=Basic dHJ1c3RlZC1jbGllbnQ6U2VjdXJlUGFzc3dvcmQxMjM=" +``` + +--- + +Available routes to build on top of: https://github.com/XRPLF/rippled/pull/6425#discussion_r3234751995 + +--- + +# Analysis: Applying the Guide to rippled + +The guide above is written for HTTP-fronted web services. rippled is a P2P node daemon, so the threat model and the applicable defenses differ. This section captures how each approach maps to rippled and the chosen direction. + +## Threat Model + +rippled has **two distinct attack surfaces**, not one. The original guide conflates them under "trace context spoofing"; for rippled they need separate defenses. + +| Surface | Attacker | Vector | Defense | +| ------------------------------------------- | -------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------- | +| **Collector ingress** (rippled → collector) | Anyone who can reach `4317`/`4318` on the collector host | Forged OTLP traffic, telemetry exfiltration, DoS on collector | mTLS + network policy | +| **Peer trace context** (peer → rippled) | Malicious peer in the XRPL overlay | Crafted `protocol::TraceContext` field inside peer protobuf messages (TMTransaction, consensus, etc.) — used to forge `trace_id`/`span_id`, pollute p99, attach spans to historical traces | Validate + rate-limit at the receive boundary | + +**Deployment context:** Across-network. rippled nodes (potentially run by external operators or in different DCs) ship telemetry to a centrally-hosted collector across an untrusted network. The collector is NOT on the same host or private VPC as every node. + +``` + ┌── peer (untrusted) ── TMTransaction{trace_context} ──▶ rippled + │ │ + │ [validate + rate-limit] + │ │ + │ ▼ + │ SpanGuard (clean) + │ │ + │ │ OTLP/gRPC + │ │ + mTLS + │ ▼ + └───────────────────────────────────────── [require_and_verify_client_cert] + OTel Collector + (in private subnet, NetPol) +``` + +## Part 1 Applicability — Peer Trace-Context Validation + +The guide's NGINX header stripping and OTTL stale-span filtering target HTTP gateways and post-hoc cleanup. Neither fits rippled directly: + +- **NGINX header stripping** — N/A. There is no HTTP gateway between peers and rippled; trace context arrives inside protobuf peer messages (`protocol::TraceContext`), not as W3C `traceparent` headers. See [src/xrpld/telemetry/PropagationHelpers.h](../src/xrpld/telemetry/PropagationHelpers.h). +- **OTTL stale-span filtering** — Weak fit. Post-hoc cleanup at the collector loses peer identity (you can't tell _which_ peer poisoned the trace). Validation at the receive site is stronger. + +**rippled-specific Part 1 mitigations:** + +1. **Validate extracted context at the boundary** in [src/xrpld/telemetry/ConsensusReceiveTracing.h](../src/xrpld/telemetry/ConsensusReceiveTracing.h) and any other peer-message receive site. Reject if `trace_id` is all-zero, wrong length, or fails W3C format checks. Treat invalid context as "no propagated context" — start a fresh span — rather than dropping the message. +2. **Per-peer sample rate limiting** so a hostile peer cannot flood the collector with spans bearing a fabricated `trace_id`. Use probabilistic sampling on the receive path keyed by peer identity. + +## Part 2 — Comparison of Collector Hardening Approaches + +Evaluated for the across-network deployment shape: + +| Approach | Across-network fit | Cost | Verdict | +| ------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------- | ---------------------------------- | +| **A. NetworkPolicy / firewall** | Necessary baseline (don't expose `4317`/`4318` to the internet), but insufficient on its own when traffic genuinely crosses networks — you cannot NetworkPolicy the public internet. | Cheap. | **Defense-in-depth, not primary.** | +| **B. mTLS** | Strongest fit. Every rippled node holds a client cert; collector verifies with `require_and_verify_client_cert`. Encrypts in transit (raw OTLP over the internet leaks transaction patterns and validator identity). Compromised node = revoke one cert, no shared secret to rotate everywhere. | Cert issuance + rotation pipeline. | **Primary.** | +| **C. Basic Auth** | Worst shape for this topology. Single shared password across all rippled nodes — one leaked node config compromises the whole fleet. Doesn't encrypt; you'd need TLS underneath anyway, at which point you're 80% of the way to mTLS. | Cheap to set up, expensive to operate (rotation across N operators). | **Skip.** | + +## Decision + +**Primary defense:** mTLS (Approach B) on the collector's OTLP receivers, with `auth_type: require_and_verify_client_cert`. + +**Defense-in-depth:** NetworkPolicy / firewall rules (Approach A) so `4317`/`4318` are never reachable from outside the expected operator subnets even if mTLS were misconfigured. + +**Skipped:** Basic Auth (Approach C) — wrong shape for an across-network, multi-operator topology. + +**Plus rippled-specific Part 1 work:** trace-context validation and per-peer rate limiting at peer-message receive sites. + +## Decisions Made + +| Decision | Choice | Rationale | +| -------------------- | -------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Cert source for mTLS | **Reuse XRPL node identity key** | One identity per node, no separate PKI to operate. Fits XRPL's existing trust model; requires small CA tooling step to derive/sign the OTel client cert from the node key. | +| Part 1 scope | **Include in this spec** | Collector hardening and peer trace-context validation share one threat model. Coherent design doc; can still be split into multiple PRs at implementation. | +| Dev impact | **Production-only** | Local `docker/telemetry/docker-compose.yml` keeps `insecure: true` and no auth for fast iteration. Only production deployment manifests gain mTLS. Accepted risk: minor dev/prod drift, mitigated by integration tests against a TLS-enabled collector in CI. | + +## Out of Scope + +- NGINX/Envoy header stripping (no HTTP gateway in front of rippled-to-collector traffic). +- OTTL stale-span filtering at the collector (weaker than source validation; loses peer identity). +- Local development docker-compose hardening. +- Telemetry backend (Tempo) hardening — separate concern, downstream of the collector. + +## Next Step + +Write this up as a design doc with full sections covering: + +1. Threat model & architecture (this section, expanded) +2. Collector hardening — mTLS config, NetworkPolicy +3. Cert pipeline — deriving OTel client cert from XRPL node key +4. Peer trace-context validation — receive-site checks in `ConsensusReceiveTracing.h` +5. Per-peer span rate limiting +6. Testing & rollout diff --git a/cspell.config.yaml b/cspell.config.yaml index e7fade4431..1670d2d65b 100644 --- a/cspell.config.yaml +++ b/cspell.config.yaml @@ -103,6 +103,7 @@ words: - enabled - endmacro - exceptioned + - exfiltration - Falco - fcontext - finalizers @@ -118,6 +119,7 @@ words: - gpgkey - hotwallet - hicpp + - htpasswd - hwaddress - hwrap - ifndef From c6c019ed8bb0b85f5ff5d88fdea58b9f861d91b0 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Thu, 28 May 2026 15:55:25 +0100 Subject: [PATCH 8/9] addressed code review comments Signed-off-by: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> --- OpenTelemetryPlan/Phase3_taskList.md | 6 +- .../provisioning/datasources/tempo.yaml | 2 +- include/xrpl/proto/xrpl.proto | 13 +++- include/xrpl/telemetry/SpanGuard.h | 22 +++--- include/xrpl/telemetry/SpanNames.h | 16 +++-- .../xrpl/telemetry/TraceContextPropagator.h | 8 +++ src/libxrpl/telemetry/SpanGuard.cpp | 22 +++--- src/xrpld/app/consensus/ConsensusSpanNames.h | 67 +++++++++++++++++++ src/xrpld/app/consensus/RCLConsensus.cpp | 9 +++ src/xrpld/app/misc/NetworkOPs.cpp | 15 +++-- src/xrpld/app/misc/TxSpanNames.h | 11 +++ src/xrpld/overlay/detail/PeerImp.cpp | 31 ++++++--- 12 files changed, 177 insertions(+), 45 deletions(-) create mode 100644 src/xrpld/app/consensus/ConsensusSpanNames.h diff --git a/OpenTelemetryPlan/Phase3_taskList.md b/OpenTelemetryPlan/Phase3_taskList.md index 9c5127120f..c2f607e1d8 100644 --- a/OpenTelemetryPlan/Phase3_taskList.md +++ b/OpenTelemetryPlan/Phase3_taskList.md @@ -89,7 +89,7 @@ - In `onMessage(TMTransaction)` / `handleTransaction()`: - Extract parent trace context from incoming `TMTransaction::trace_context` field (if present) - Create `tx.receive` span as child of extracted context (or new root if none) - - Set attributes: `xrpl.tx.hash`, `xrpl.peer.id`, `tx_status` + - Set attributes: `tx_hash`, `peer_id`, `tx_status` - On HashRouter suppression (duplicate): set `suppressed=true`, add `tx.duplicate` event - Wrap validation call with child span `tx.validate` - Wrap relay with `tx.relay` span @@ -121,7 +121,7 @@ - Edit `src/xrpld/app/misc/NetworkOPs.cpp`: - In `processTransaction()`: - Create `tx.process` span - - Set attributes: `xrpl.tx.hash`, `tx_type`, `local` (whether from RPC or peer) + - Set attributes: `tx_hash`, `tx_type`, `local` (whether from RPC or peer) - Record whether sync or async path is taken - In `doTransactionAsync()`: @@ -256,7 +256,7 @@ **What to do**: - Edit `src/xrpld/overlay/detail/PeerImp.cpp`: - - In the `tx.receive` span block (after existing `xrpl.peer.id` setAttribute call): + - In the `tx.receive` span block (after existing `peer_id` setAttribute call): - Add `peer_version` (string) — from `this->getVersion()` - Only set if `getVersion()` returns a non-empty string (avoid empty-string attributes) diff --git a/docker/telemetry/grafana/provisioning/datasources/tempo.yaml b/docker/telemetry/grafana/provisioning/datasources/tempo.yaml index a09513b49d..0c5df2126f 100644 --- a/docker/telemetry/grafana/provisioning/datasources/tempo.yaml +++ b/docker/telemetry/grafana/provisioning/datasources/tempo.yaml @@ -109,7 +109,7 @@ datasources: type: dynamic # Phase 3: Transaction tracing filters - id: tx-hash - tag: xrpl.tx.hash + tag: tx_hash operator: "=" scope: span type: static diff --git a/include/xrpl/proto/xrpl.proto b/include/xrpl/proto/xrpl.proto index 56f4dafc80..1a578aa133 100644 --- a/include/xrpl/proto/xrpl.proto +++ b/include/xrpl/proto/xrpl.proto @@ -87,11 +87,22 @@ message TMPublicKey { // Trace context for OpenTelemetry distributed tracing across nodes. // Uses W3C Trace Context format internally. +// +// Field numbering note: this message is embedded as field 1001 on +// TMTransaction, TMProposeSet, and TMValidation. Field numbers >= 1000 +// are reserved for optional, observability-only additions that must not +// collide with protocol-semantic fields (which historically use 1-99). +// Older peers that do not understand field 1001 will simply ignore it +// per protobuf wire-format rules, preserving backwards compatibility. +// +// trace_state is reserved for future use (secure tracing pipeline, +// OpenTelemetryPlan/secure-OTel.md). It is currently neither populated +// on inject nor read on extract; consumers must not rely on it. message TraceContext { optional bytes trace_id = 1; // 16-byte trace identifier optional bytes span_id = 2; // 8-byte parent span identifier optional uint32 trace_flags = 3; // bit 0 = sampled - optional string trace_state = 4; // W3C tracestate header value + optional string trace_state = 4; // RESERVED — see TraceContext header note } enum TransactionStatus { diff --git a/include/xrpl/telemetry/SpanGuard.h b/include/xrpl/telemetry/SpanGuard.h index dc59bebbe3..d944e12406 100644 --- a/include/xrpl/telemetry/SpanGuard.h +++ b/include/xrpl/telemetry/SpanGuard.h @@ -275,10 +275,10 @@ public: */ static SpanGuard hashSpan( - TraceCategory cat, - std::string_view name, - std::uint8_t const* hashData, - std::size_t hashSize); + TraceCategory const cat, + std::string_view const name, + std::uint8_t const* const hashData, + std::size_t const hashSize); /** Create a hash-derived span with a remote parent. trace_id = hashData[0:16], parent span_id from protobuf context @@ -294,13 +294,13 @@ public: */ static SpanGuard hashSpan( - TraceCategory cat, - std::string_view name, - std::uint8_t const* hashData, - std::size_t hashSize, - std::uint8_t const* parentSpanId, - std::size_t parentSpanSize, - std::uint8_t traceFlags); + TraceCategory const cat, + std::string_view const name, + std::uint8_t const* const hashData, + std::size_t const hashSize, + std::uint8_t const* const parentSpanId, + std::size_t const parentSpanSize, + std::uint8_t const traceFlags); // --- Context capture ----------------------------------------------- diff --git a/include/xrpl/telemetry/SpanNames.h b/include/xrpl/telemetry/SpanNames.h index fc7f98c86a..092bdc5c6d 100644 --- a/include/xrpl/telemetry/SpanNames.h +++ b/include/xrpl/telemetry/SpanNames.h @@ -109,11 +109,17 @@ inline constexpr auto networkId = join(join(seg::xrpl, seg::network), makeStr("i inline constexpr auto networkType = join(join(seg::xrpl, seg::network), makeStr("type")); inline constexpr auto linkType = makeStr("link_type"); -/// Canonical shared attrs (rule 5 — kept xrpl..* form). -/// Defined once here, aliased by domain-specific headers. -inline constexpr auto txHash = join(join(seg::xrpl, seg::tx), makeStr("hash")); -inline constexpr auto peerId = join(join(seg::xrpl, seg::peer), makeStr("id")); -inline constexpr auto ledgerSeq = join(join(seg::xrpl, seg::ledger), makeStr("seq")); +/// Canonical shared attrs (rule 5 — _ underscore form). +/// +/// Per the naming convention header note: shared cross-span attribute +/// keys use the underscore form, reserving the dotted xrpl.. +/// form for resource attributes set on the OTel resource at startup. +/// Defined once here, aliased by domain-specific headers. These are +/// literal underscore-joined names, not dot-joined via `join()`, since +/// `join()` always inserts `.` between its arguments. +inline constexpr auto txHash = makeStr("tx_hash"); +inline constexpr auto peerId = makeStr("peer_id"); +inline constexpr auto ledgerSeq = makeStr("ledger_seq"); } // namespace attr // ===== Shared attribute values ============================================= diff --git a/include/xrpl/telemetry/TraceContextPropagator.h b/include/xrpl/telemetry/TraceContextPropagator.h index d0fb7d576d..3f400860fb 100644 --- a/include/xrpl/telemetry/TraceContextPropagator.h +++ b/include/xrpl/telemetry/TraceContextPropagator.h @@ -91,6 +91,14 @@ injectToProtobuf(opentelemetry::context::Context const& ctx, protocol::TraceCont // Serialize flags proto.set_trace_flags(spanCtx.trace_flags().flags()); + + // TODO(observability/secure-OTel): the protobuf TraceContext message + // also carries `trace_state` (field 4), which is currently neither + // populated here nor read by extractFromProtobuf above. The field is + // reserved for the secure tracing pipeline outlined in + // OpenTelemetryPlan/secure-OTel.md, where an authenticated token in + // tracestate will let receivers reject spoofed/poisoned trace context. + // Wire trace_state through inject/extract once the consumer lands. } } // namespace telemetry diff --git a/src/libxrpl/telemetry/SpanGuard.cpp b/src/libxrpl/telemetry/SpanGuard.cpp index 6343f91b3d..f24f723c90 100644 --- a/src/libxrpl/telemetry/SpanGuard.cpp +++ b/src/libxrpl/telemetry/SpanGuard.cpp @@ -264,10 +264,10 @@ SpanGuard::linkedSpan(std::string_view name, SpanContext const& linkCtx) SpanGuard SpanGuard::hashSpan( - TraceCategory cat, - std::string_view name, - std::uint8_t const* hashData, - std::size_t hashSize) + TraceCategory const cat, + std::string_view const name, + std::uint8_t const* const hashData, + std::size_t const hashSize) { if (hashSize < 16) return {}; @@ -295,13 +295,13 @@ SpanGuard::hashSpan( SpanGuard SpanGuard::hashSpan( - TraceCategory cat, - std::string_view name, - std::uint8_t const* hashData, - std::size_t hashSize, - std::uint8_t const* parentSpanId, - std::size_t parentSpanSize, - std::uint8_t traceFlags) + TraceCategory const cat, + std::string_view const name, + std::uint8_t const* const hashData, + std::size_t const hashSize, + std::uint8_t const* const parentSpanId, + std::size_t const parentSpanSize, + std::uint8_t const traceFlags) { if (hashSize < 16 || parentSpanSize != 8) return {}; diff --git a/src/xrpld/app/consensus/ConsensusSpanNames.h b/src/xrpld/app/consensus/ConsensusSpanNames.h new file mode 100644 index 0000000000..f400adc7f6 --- /dev/null +++ b/src/xrpld/app/consensus/ConsensusSpanNames.h @@ -0,0 +1,67 @@ +#pragma once + +/** Compile-time span name and attribute constants for consensus tracing. + * + * Used by PeerImp (overlay) and RCLConsensus (consensus) for proposal + * and validation lifecycle spans. Built on StaticStr/join() from + * SpanNames.h and follows the rule-5 underscore form for shared + * cross-span attributes (e.g. `consensus_round`, `ledger_seq`). + * + * Phase 3 introduces the receive-side surface used by PeerImp. + * Phase 4 will extend this with the proposer/validator-side spans + * (`consensus.proposal.send`, `consensus.validation.send`, round + * bookkeeping, etc.). + * + * Span hierarchy (cross-node propagation): + * + * Node A (sender) Node B (receiver) + * +----------------------------+ +-------------------------------+ + * | consensus.proposal/...send | proto | consensus.proposal/...receive | + * | inject trace context | -----> | proposalReceiveSpan() / | + * | (RCLConsensus broadcast) | t_ctx | validationReceiveSpan() | + * +----------------------------+ +-------------------------------+ + */ + +#include + +namespace xrpl::telemetry::consensus_span { + +// ===== Span prefixes ======================================================= + +namespace prefix { +/// "consensus" — root prefix for consensus lifecycle spans. +inline constexpr auto consensus = seg::consensus; +/// "consensus.proposal" — proposal sub-tree. +inline constexpr auto proposal = join(consensus, makeStr("proposal")); +/// "consensus.validation" — validation sub-tree. +inline constexpr auto validation = join(consensus, makeStr("validation")); +} // namespace prefix + +// ===== Span operation suffixes ============================================= + +namespace op { +inline constexpr auto receive = makeStr("receive"); +inline constexpr auto send = makeStr("send"); +} // namespace op + +// ===== Full span names ===================================================== + +inline constexpr auto proposalReceive = join(prefix::proposal, op::receive); +inline constexpr auto validationReceive = join(prefix::validation, op::receive); + +// ===== Attribute keys ====================================================== + +namespace attr { +/// Canonical shared constants (defined in SpanNames.h). +using ::xrpl::telemetry::attr::ledgerSeq; + +/// "trusted" — bare field; whether the proposing/validating node is +/// in our UNL. Used only on consensus spans, no cross-domain collision. +inline constexpr auto trusted = makeStr("trusted"); + +/// "consensus_round" — propose-sequence within a consensus round +/// (rule-5 underscore form, shared across consensus spans). +inline constexpr auto round = makeStr("consensus_round"); +} // namespace attr + +} // namespace xrpl::telemetry::consensus_span diff --git a/src/xrpld/app/consensus/RCLConsensus.cpp b/src/xrpld/app/consensus/RCLConsensus.cpp index eb040f231f..4f34953423 100644 --- a/src/xrpld/app/consensus/RCLConsensus.cpp +++ b/src/xrpld/app/consensus/RCLConsensus.cpp @@ -898,6 +898,15 @@ RCLConsensus::Adaptor::validate(RCLCxLedger const& ledger, RCLTxSet const& txns, val.set_validation(serialized.data(), serialized.size()); // Inject the current thread's active span context so receiving // peers can link their validation.receive span as a child. + // + // TODO(observability/secure-OTel): the trace_context appended below is + // outside the cryptographic signature on `serialized` and is therefore + // unauthenticated. Receivers cannot prove it was not tampered with by + // a relay. A signed trace context (either folded into the validation + // payload or carried by an authenticated trace_state token) is tracked + // as a follow-up — see PR #6425 discussion r3317273388 and + // OpenTelemetryPlan/secure-OTel.md. Until then, downstream consumers + // must treat the validation trace_context as advisory only. #ifdef XRPL_ENABLE_TELEMETRY { auto ctx = opentelemetry::context::RuntimeContext::GetCurrent(); diff --git a/src/xrpld/app/misc/NetworkOPs.cpp b/src/xrpld/app/misc/NetworkOPs.cpp index a956f3413f..76237ee383 100644 --- a/src/xrpld/app/misc/NetworkOPs.cpp +++ b/src/xrpld/app/misc/NetworkOPs.cpp @@ -388,9 +388,15 @@ public: * @param transaction Transaction object. * @param bUnlimited Whether a privileged client connection submitted it. * @param failType fail_hard setting from transaction submission. + * @param span Optional tx.process span to keep alive across the + * batch boundary so its context propagates to peers. */ void - doTransactionSync(std::shared_ptr transaction, bool bUnlimited, FailHard failType); + doTransactionSync( + std::shared_ptr transaction, + bool bUnlimited, + FailHard failType, + std::shared_ptr span = nullptr); /** * For transactions not submitted by a locally connected client, fire and @@ -1337,7 +1343,7 @@ NetworkOPsImp::processTransaction( if (bLocal) { span->setAttribute(tx_span::attr::path, tx_span::val::sync); - doTransactionSync(transaction, bUnlimited, failType); + doTransactionSync(transaction, bUnlimited, failType, std::move(span)); } else { @@ -1374,13 +1380,14 @@ void NetworkOPsImp::doTransactionSync( std::shared_ptr transaction, bool bUnlimited, - FailHard failType) + FailHard failType, + std::shared_ptr span) { std::unique_lock lock(mMutex); if (!transaction->getApplying()) { - mTransactions.emplace_back(transaction, bUnlimited, true, failType); + mTransactions.emplace_back(transaction, bUnlimited, true, failType, std::move(span)); transaction->setApplying(); } diff --git a/src/xrpld/app/misc/TxSpanNames.h b/src/xrpld/app/misc/TxSpanNames.h index 964246fcd6..965b15ddf4 100644 --- a/src/xrpld/app/misc/TxSpanNames.h +++ b/src/xrpld/app/misc/TxSpanNames.h @@ -63,6 +63,17 @@ namespace val { inline constexpr auto sync = makeStr("sync"); inline constexpr auto async = makeStr("async"); inline constexpr auto knownBad = makeStr("known_bad"); +/// Transaction was suppressed via HashRouter (duplicate, not flagged bad). +inline constexpr auto suppressed = makeStr("suppressed"); +/// Transaction was rejected because it carried tfInnerBatchTxn, which +/// must never appear in network-relayed traffic. +inline constexpr auto rejectedInnerBatch = makeStr("rejected_inner_batch"); +/// Transaction was dropped because the validated ledger is too old to +/// confidently apply new transactions (server is out of sync). +inline constexpr auto droppedNoSync = makeStr("dropped_no_sync"); +/// Transaction was dropped because the local job queue for jtTRANSACTION +/// is at MAX_TRANSACTIONS — backpressure on the receive side. +inline constexpr auto droppedQueueFull = makeStr("dropped_queue_full"); } // namespace val } // namespace xrpl::telemetry::tx_span diff --git a/src/xrpld/overlay/detail/PeerImp.cpp b/src/xrpld/overlay/detail/PeerImp.cpp index f281246b1c..eababb8b5a 100644 --- a/src/xrpld/overlay/detail/PeerImp.cpp +++ b/src/xrpld/overlay/detail/PeerImp.cpp @@ -1,5 +1,6 @@ #include +#include #include #include #include @@ -1474,6 +1475,7 @@ PeerImp::handleTransaction( */ if (stx->isFlag(tfInnerBatchTxn)) { + span->setAttribute(tx_span::attr::txStatus, tx_span::val::rejectedInnerBatch); JLOG(p_journal_.warn()) << "Ignoring Network relayed Tx containing " "tfInnerBatchTxn (handleTransaction)."; fee_.update(Resource::feeModerateBurdenPeer, "inner batch txn"); @@ -1494,12 +1496,20 @@ PeerImp::handleTransaction( fee_.update(Resource::feeUselessData, "known bad"); JLOG(p_journal_.debug()) << "Ignoring known bad tx " << txID; } - - // Erase only if the server has seen this tx. If the server has not - // seen this tx then the tx could not has been queued for this peer. - else if (eraseTxQueue && txReduceRelayEnabled()) + else { - removeTxQueue(txID); + // Recently-seen but not flagged bad — this is the plain + // duplicate-suppression path. Mark it explicitly so the + // span never exits as "new". + span->setAttribute(tx_span::attr::txStatus, tx_span::val::suppressed); + + // Erase only if the server has seen this tx. If the server + // has not seen this tx then the tx could not have been + // queued for this peer. + if (eraseTxQueue && txReduceRelayEnabled()) + { + removeTxQueue(txID); + } } overlay_.reportInboundTraffic( @@ -1532,10 +1542,12 @@ PeerImp::handleTransaction( if (app_.getLedgerMaster().getValidatedLedgerAge() > 4min) { + span->setAttribute(tx_span::attr::txStatus, tx_span::val::droppedNoSync); JLOG(p_journal_.trace()) << "No new transactions until synchronized"; } else if (app_.getJobQueue().getJobCount(jtTRANSACTION) > app_.config().MAX_TRANSACTIONS) { + span->setAttribute(tx_span::attr::txStatus, tx_span::val::droppedQueueFull); overlay_.incJqTransOverflow(); JLOG(p_journal_.info()) << "Transaction queue is full"; } @@ -1967,8 +1979,9 @@ PeerImp::onMessage(std::shared_ptr const& m) // Create a receive span that links to the sender's trace context // (if propagated). shared_ptr keeps it alive across the job boundary. auto span = std::make_shared(telemetry::proposalReceiveSpan(set)); - span->setAttribute("xrpl.consensus.trusted", isTrusted); - span->setAttribute("xrpl.consensus.round", static_cast(set.proposeseq())); + span->setAttribute(telemetry::consensus_span::attr::trusted, isTrusted); + span->setAttribute( + telemetry::consensus_span::attr::round, static_cast(set.proposeseq())); std::weak_ptr const weak = shared_from_this(); app_.getJobQueue().addJob( @@ -2552,11 +2565,11 @@ PeerImp::onMessage(std::shared_ptr const& m) // Create a receive span that links to the sender's trace context // (if propagated). shared_ptr keeps it alive across the job boundary. auto span = std::make_shared(telemetry::validationReceiveSpan(*m)); - span->setAttribute("xrpl.consensus.trusted", isTrusted); + span->setAttribute(telemetry::consensus_span::attr::trusted, isTrusted); if (val->isFieldPresent(sfLedgerSequence)) { span->setAttribute( - "xrpl.consensus.ledger.seq", + telemetry::consensus_span::attr::ledgerSeq, static_cast(val->getFieldU32(sfLedgerSequence))); } From 954223958f3cc94a6f2f69141bf45c0f08893151 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Thu, 28 May 2026 16:07:34 +0100 Subject: [PATCH 9/9] renames Signed-off-by: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> --- OpenTelemetryPlan/secure-OTel.md | 38 ++++++++++++++++---------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/OpenTelemetryPlan/secure-OTel.md b/OpenTelemetryPlan/secure-OTel.md index 81bf30b182..ad76b0b3b5 100644 --- a/OpenTelemetryPlan/secure-OTel.md +++ b/OpenTelemetryPlan/secure-OTel.md @@ -153,23 +153,23 @@ Available routes to build on top of: https://github.com/XRPLF/rippled/pull/6425# --- -# Analysis: Applying the Guide to rippled +# Analysis: Applying the Guide to xrpld -The guide above is written for HTTP-fronted web services. rippled is a P2P node daemon, so the threat model and the applicable defenses differ. This section captures how each approach maps to rippled and the chosen direction. +The guide above is written for HTTP-fronted web services. xrpld is a P2P node daemon, so the threat model and the applicable defenses differ. This section captures how each approach maps to xrpld and the chosen direction. ## Threat Model -rippled has **two distinct attack surfaces**, not one. The original guide conflates them under "trace context spoofing"; for rippled they need separate defenses. +xrpld has **two distinct attack surfaces**, not one. The original guide conflates them under "trace context spoofing"; for xrpld they need separate defenses. -| Surface | Attacker | Vector | Defense | -| ------------------------------------------- | -------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------- | -| **Collector ingress** (rippled → collector) | Anyone who can reach `4317`/`4318` on the collector host | Forged OTLP traffic, telemetry exfiltration, DoS on collector | mTLS + network policy | -| **Peer trace context** (peer → rippled) | Malicious peer in the XRPL overlay | Crafted `protocol::TraceContext` field inside peer protobuf messages (TMTransaction, consensus, etc.) — used to forge `trace_id`/`span_id`, pollute p99, attach spans to historical traces | Validate + rate-limit at the receive boundary | +| Surface | Attacker | Vector | Defense | +| ----------------------------------------- | -------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------- | +| **Collector ingress** (xrpld → collector) | Anyone who can reach `4317`/`4318` on the collector host | Forged OTLP traffic, telemetry exfiltration, DoS on collector | mTLS + network policy | +| **Peer trace context** (peer → xrpld) | Malicious peer in the XRPL overlay | Crafted `protocol::TraceContext` field inside peer protobuf messages (TMTransaction, consensus, etc.) — used to forge `trace_id`/`span_id`, pollute p99, attach spans to historical traces | Validate + rate-limit at the receive boundary | -**Deployment context:** Across-network. rippled nodes (potentially run by external operators or in different DCs) ship telemetry to a centrally-hosted collector across an untrusted network. The collector is NOT on the same host or private VPC as every node. +**Deployment context:** Across-network. xrpld nodes (potentially run by external operators or in different DCs) ship telemetry to a centrally-hosted collector across an untrusted network. The collector is NOT on the same host or private VPC as every node. ``` - ┌── peer (untrusted) ── TMTransaction{trace_context} ──▶ rippled + ┌── peer (untrusted) ── TMTransaction{trace_context} ──▶ xrpld │ │ │ [validate + rate-limit] │ │ @@ -186,12 +186,12 @@ rippled has **two distinct attack surfaces**, not one. The original guide confla ## Part 1 Applicability — Peer Trace-Context Validation -The guide's NGINX header stripping and OTTL stale-span filtering target HTTP gateways and post-hoc cleanup. Neither fits rippled directly: +The guide's NGINX header stripping and OTTL stale-span filtering target HTTP gateways and post-hoc cleanup. Neither fits xrpld directly: -- **NGINX header stripping** — N/A. There is no HTTP gateway between peers and rippled; trace context arrives inside protobuf peer messages (`protocol::TraceContext`), not as W3C `traceparent` headers. See [src/xrpld/telemetry/PropagationHelpers.h](../src/xrpld/telemetry/PropagationHelpers.h). +- **NGINX header stripping** — N/A. There is no HTTP gateway between peers and xrpld; trace context arrives inside protobuf peer messages (`protocol::TraceContext`), not as W3C `traceparent` headers. See [src/xrpld/telemetry/PropagationHelpers.h](../src/xrpld/telemetry/PropagationHelpers.h). - **OTTL stale-span filtering** — Weak fit. Post-hoc cleanup at the collector loses peer identity (you can't tell _which_ peer poisoned the trace). Validation at the receive site is stronger. -**rippled-specific Part 1 mitigations:** +**xrpld-specific Part 1 mitigations:** 1. **Validate extracted context at the boundary** in [src/xrpld/telemetry/ConsensusReceiveTracing.h](../src/xrpld/telemetry/ConsensusReceiveTracing.h) and any other peer-message receive site. Reject if `trace_id` is all-zero, wrong length, or fails W3C format checks. Treat invalid context as "no propagated context" — start a fresh span — rather than dropping the message. 2. **Per-peer sample rate limiting** so a hostile peer cannot flood the collector with spans bearing a fabricated `trace_id`. Use probabilistic sampling on the receive path keyed by peer identity. @@ -200,11 +200,11 @@ The guide's NGINX header stripping and OTTL stale-span filtering target HTTP gat Evaluated for the across-network deployment shape: -| Approach | Across-network fit | Cost | Verdict | -| ------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------- | ---------------------------------- | -| **A. NetworkPolicy / firewall** | Necessary baseline (don't expose `4317`/`4318` to the internet), but insufficient on its own when traffic genuinely crosses networks — you cannot NetworkPolicy the public internet. | Cheap. | **Defense-in-depth, not primary.** | -| **B. mTLS** | Strongest fit. Every rippled node holds a client cert; collector verifies with `require_and_verify_client_cert`. Encrypts in transit (raw OTLP over the internet leaks transaction patterns and validator identity). Compromised node = revoke one cert, no shared secret to rotate everywhere. | Cert issuance + rotation pipeline. | **Primary.** | -| **C. Basic Auth** | Worst shape for this topology. Single shared password across all rippled nodes — one leaked node config compromises the whole fleet. Doesn't encrypt; you'd need TLS underneath anyway, at which point you're 80% of the way to mTLS. | Cheap to set up, expensive to operate (rotation across N operators). | **Skip.** | +| Approach | Across-network fit | Cost | Verdict | +| ------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------- | ---------------------------------- | +| **A. NetworkPolicy / firewall** | Necessary baseline (don't expose `4317`/`4318` to the internet), but insufficient on its own when traffic genuinely crosses networks — you cannot NetworkPolicy the public internet. | Cheap. | **Defense-in-depth, not primary.** | +| **B. mTLS** | Strongest fit. Every xrpld node holds a client cert; collector verifies with `require_and_verify_client_cert`. Encrypts in transit (raw OTLP over the internet leaks transaction patterns and validator identity). Compromised node = revoke one cert, no shared secret to rotate everywhere. | Cert issuance + rotation pipeline. | **Primary.** | +| **C. Basic Auth** | Worst shape for this topology. Single shared password across all xrpld nodes — one leaked node config compromises the whole fleet. Doesn't encrypt; you'd need TLS underneath anyway, at which point you're 80% of the way to mTLS. | Cheap to set up, expensive to operate (rotation across N operators). | **Skip.** | ## Decision @@ -214,7 +214,7 @@ Evaluated for the across-network deployment shape: **Skipped:** Basic Auth (Approach C) — wrong shape for an across-network, multi-operator topology. -**Plus rippled-specific Part 1 work:** trace-context validation and per-peer rate limiting at peer-message receive sites. +**Plus xrpld-specific Part 1 work:** trace-context validation and per-peer rate limiting at peer-message receive sites. ## Decisions Made @@ -226,7 +226,7 @@ Evaluated for the across-network deployment shape: ## Out of Scope -- NGINX/Envoy header stripping (no HTTP gateway in front of rippled-to-collector traffic). +- NGINX/Envoy header stripping (no HTTP gateway in front of xrpld-to-collector traffic). - OTTL stale-span filtering at the collector (weaker than source validation; loses peer identity). - Local development docker-compose hardening. - Telemetry backend (Tempo) hardening — separate concern, downstream of the collector.