From fa71280795f5776626ffe706e6d1fe93d65eb840 Mon Sep 17 00:00:00 2001
From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com>
Date: Tue, 9 Jun 2026 18:45:52 +0100
Subject: [PATCH] removed head sampling ratio from config

Signed-off-by: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com>
---
 cfg/xrpld-example.cfg                     | 15 +++--
 include/xrpl/telemetry/SpanGuard.h        | 31 ++++++----
 include/xrpl/telemetry/Telemetry.h        | 15 +++--
 src/libxrpl/telemetry/Telemetry.cpp       | 12 +++-
 src/libxrpl/telemetry/TelemetryConfig.cpp | 73 +++++++++++++++++------
 5 files changed, 99 insertions(+), 47 deletions(-)

diff --git a/cfg/xrpld-example.cfg b/cfg/xrpld-example.cfg
index 34fa023e9e..f86074ae66 100644
--- a/cfg/xrpld-example.cfg
+++ b/cfg/xrpld-example.cfg
@@ -1648,14 +1648,13 @@ validators.txt
 #   protobuf-encoded HTTP POST requests to this URL.
 #   Default: http://localhost:4318/v1/traces.
 #
-# sampling_ratio=1.0
-#
-#   Head-based sampling ratio using TraceIdRatioBasedSampler. The decision
-#   to record or drop a trace is made at span creation time, before the
-#   span starts, based on the trace ID. Values in [0.0, 1.0].
-#   1.0 = trace everything, 0.1 = sample ~10% of traces. Default: 1.0.
-#   For tail-based (post-hoc) filtering — where you decide to drop a span
-#   after inspecting its content — use SpanGuard::discard() in code.
+#   Head sampling is intentionally fixed at 1.0 (sample everything) and is
+#   not configurable. A per-node sampling ratio would let nodes make
+#   divergent keep/drop decisions for the same distributed trace, producing
+#   broken/partial traces. A ParentBasedSampler ensures spans inheriting a
+#   remote parent honor the upstream decision. Reduce volume at the collector
+#   via tail sampling instead; for node-local post-hoc dropping use
+#   SpanGuard::discard() in code.
 #
 # trace_rpc=1
 #
diff --git a/include/xrpl/telemetry/SpanGuard.h b/include/xrpl/telemetry/SpanGuard.h
index 8ff955615c..d5e3a5781f 100644
--- a/include/xrpl/telemetry/SpanGuard.h
+++ b/include/xrpl/telemetry/SpanGuard.h
@@ -42,23 +42,27 @@
 
     Usage examples:
 
+    Span names and attribute keys come from per-module `*SpanNames.h`
+    headers (e.g. RpcSpanNames.h, TxSpanNames.h) as typed compile-time
+    constants — never raw string literals — so the naming spec is
+    enforced at the call site and dashboards stay in sync.
+
     1. Basic RPC tracing (factory method with category):
     @code
-        // Define prefix at class level:
-        static constexpr std::string_view spanPrefix_ = "rpc.command";
+        #include <xrpld/rpc/detail/RpcSpanNames.h>
+        using namespace xrpl::telemetry;
 
-        // At the call site:
         auto span = SpanGuard::span(
-            TraceCategory::Rpc, spanPrefix_, "submit");
-        span.setAttribute("xrpl.rpc.command", "submit");
-        span.setAttribute("xrpl.rpc.status", "success");
+            TraceCategory::Rpc, rpc_span::prefix::command, "submit");
+        span.setAttribute(rpc_span::attr::command, "submit");
+        span.setAttribute(rpc_span::attr::rpcStatus, rpc_span::val::success);
         // span ended automatically on scope exit
     @endcode
 
     2. Error recording:
     @code
         auto span = SpanGuard::span(
-            TraceCategory::Rpc, "rpc.command", "submit");
+            TraceCategory::Rpc, rpc_span::prefix::command, "submit");
         try {
             doWork();
             span.setOk();
@@ -69,29 +73,32 @@
 
     3. Cross-thread context propagation:
     @code
+        #include <xrpld/consensus/ConsensusSpanNames.h>
+        using namespace xrpl::telemetry;
+
         // Thread A: create span and capture context
         auto span = SpanGuard::span(
-            TraceCategory::Consensus, "consensus", "round");
+            TraceCategory::Consensus, seg::consensus, consensus::span::op::round);
         auto ctx = span.captureContext();
 
         // Thread B: create child with captured context
-        auto child = SpanGuard::childSpan("consensus.accept", ctx);
+        auto child = SpanGuard::childSpan(consensus::span::accept, ctx);
     @endcode
 
     4. Conditional check (rarely needed — methods are no-ops on null):
     @code
         auto span = SpanGuard::span(
-            TraceCategory::Rpc, "rpc", "request");
+            TraceCategory::Rpc, rpc_span::prefix::rpc, rpc_span::op::httpRequest);
         if (span) {
             // expensive attribute computation only when active
-            span.setAttribute("xrpl.rpc.payload_size", computeSize());
+            span.setAttribute(rpc_span::attr::requestPayloadSize, computeSize());
         }
     @endcode
 
     5. Tail-based filtering via discard():
     @code
         auto span = SpanGuard::span(
-            TraceCategory::Transactions, "tx", "process");
+            TraceCategory::Transactions, tx_span::prefix::tx, tx_span::op::process);
         auto result = preflight(tx);
         if (result != tesSUCCESS) {
             span.discard();  // drop span, never exported
diff --git a/include/xrpl/telemetry/Telemetry.h b/include/xrpl/telemetry/Telemetry.h
index 4ecd6e76a2..c6f92f4da8 100644
--- a/include/xrpl/telemetry/Telemetry.h
+++ b/include/xrpl/telemetry/Telemetry.h
@@ -150,13 +150,16 @@ public:
         /** Path to a CA certificate bundle for TLS verification. */
         std::string tlsCertPath;
 
-        /** Head-based sampling ratio in [0.0, 1.0]. 1.0 = trace everything.
-            This is a head-based (pre-decision) sampler using
-            TraceIdRatioBasedSampler — the decision to record or drop a
-            trace is made before the root span starts. For post-hoc
-            (tail-based) filtering, see SpanGuard::discard().
+        /** Head-based sampling ratio. Intentionally fixed at 1.0 (sample
+            everything) and NOT read from config. A per-node ratio would let
+            nodes make divergent keep/drop decisions for the same distributed
+            trace, producing broken/partial traces. The ratio sampler is wrapped
+            in a ParentBasedSampler (see Telemetry.cpp) so spans inheriting a
+            remote parent honor the upstream sampled flag. Volume reduction is
+            delegated to the collector's tail sampling; for node-local post-hoc
+            dropping see SpanGuard::discard().
         */
-        double samplingRatio = 1.0;
+        double const samplingRatio = 1.0;
 
         /** Maximum number of spans per batch export. */
         std::uint32_t batchSize = 512;
diff --git a/src/libxrpl/telemetry/Telemetry.cpp b/src/libxrpl/telemetry/Telemetry.cpp
index 2721d749a6..c4425bda84 100644
--- a/src/libxrpl/telemetry/Telemetry.cpp
+++ b/src/libxrpl/telemetry/Telemetry.cpp
@@ -30,6 +30,7 @@
 #include <opentelemetry/sdk/trace/batch_span_processor_options.h>
 #include <opentelemetry/sdk/trace/processor.h>
 #include <opentelemetry/sdk/trace/sampler.h>
+#include <opentelemetry/sdk/trace/samplers/parent_factory.h>
 #include <opentelemetry/sdk/trace/samplers/trace_id_ratio.h>
 #include <opentelemetry/sdk/trace/tracer_provider.h>
 #include <opentelemetry/sdk/trace/tracer_provider_factory.h>
@@ -299,8 +300,15 @@ public:
             {"xrpl.network.type", setup_.networkType},
         });
 
-        // Configure sampler
-        auto sampler = std::make_unique<trace_sdk::TraceIdRatioBasedSampler>(setup_.samplingRatio);
+        // Configure sampler. Head sampling is fixed at 1.0 (sample everything);
+        // setup_.samplingRatio is not config-driven. Wrap the ratio sampler in a
+        // ParentBasedSampler so spans with a remote parent honor the upstream
+        // sampled flag — this keeps keep/drop decisions coherent for a single
+        // distributed trace spanning multiple nodes. Volume reduction is left to
+        // the collector's tail sampling.
+        auto rootSampler =
+            std::make_shared<trace_sdk::TraceIdRatioBasedSampler>(setup_.samplingRatio);
+        auto sampler = trace_sdk::ParentBasedSamplerFactory::Create(std::move(rootSampler));
 
         // Create TracerProvider
         sdkProvider_ = trace_sdk::TracerProviderFactory::Create(
diff --git a/src/libxrpl/telemetry/TelemetryConfig.cpp b/src/libxrpl/telemetry/TelemetryConfig.cpp
index 39f0206ce1..d90e1df05d 100644
--- a/src/libxrpl/telemetry/TelemetryConfig.cpp
+++ b/src/libxrpl/telemetry/TelemetryConfig.cpp
@@ -8,9 +8,10 @@
 */
 
 #include <xrpl/basics/BasicConfig.h>
+#include <xrpl/protocol/SystemParameters.h>
 #include <xrpl/telemetry/Telemetry.h>
 
-#include <algorithm>
+#include <chrono>
 #include <cstdint>
 #include <string>
 
@@ -18,6 +19,38 @@ namespace xrpl::telemetry {
 
 namespace {
 
+/** Config key names for the [telemetry] section.
+
+    Each must match the corresponding option documented in
+    cfg/xrpld-example.cfg verbatim. Defined as `char const*` so they
+    pass to Section::valueOr() (which takes `std::string const&`)
+    without an explicit conversion, exactly as a literal would.
+*/
+namespace key {
+constexpr char const* enabled = "enabled";
+constexpr char const* serviceName = "service_name";
+constexpr char const* serviceInstanceId = "service_instance_id";
+constexpr char const* endpoint = "endpoint";
+constexpr char const* useTls = "use_tls";
+constexpr char const* tlsCaCert = "tls_ca_cert";
+constexpr char const* batchSize = "batch_size";
+constexpr char const* batchDelayMs = "batch_delay_ms";
+constexpr char const* maxQueueSize = "max_queue_size";
+constexpr char const* traceTransactions = "trace_transactions";
+constexpr char const* traceConsensus = "trace_consensus";
+constexpr char const* traceRpc = "trace_rpc";
+constexpr char const* tracePeer = "trace_peer";
+constexpr char const* traceLedger = "trace_ledger";
+}  // namespace key
+
+/** Default values applied when a key is absent from the config. */
+namespace dflt {
+constexpr char const* endpoint = "http://localhost:4318/v1/traces";
+constexpr std::uint32_t batchSize = 512u;
+constexpr std::uint32_t batchDelayMs = 5000u;
+constexpr std::uint32_t maxQueueSize = 2048u;
+}  // namespace dflt
+
 /** Derive a human-readable network type label from the numeric network ID.
     @param networkId  The network identifier from [network_id] config.
     @return "mainnet", "testnet", "devnet", or "unknown" for other values.
@@ -49,33 +82,35 @@ setupTelemetry(
 {
     Telemetry::Setup setup;
 
-    setup.enabled = section.valueOr<int>("enabled", 0) != 0;
-    setup.serviceName = section.valueOr<std::string>("service_name", "xrpld");
+    setup.enabled = section.valueOr<int>(key::enabled, 0) != 0;
+    setup.serviceName = section.valueOr<std::string>(key::serviceName, systemName());
     setup.serviceVersion = version;
-    setup.serviceInstanceId = section.valueOr<std::string>("service_instance_id", nodePublicKey);
+    setup.serviceInstanceId = section.valueOr<std::string>(key::serviceInstanceId, nodePublicKey);
 
-    setup.exporterEndpoint =
-        section.valueOr<std::string>("endpoint", "http://localhost:4318/v1/traces");
+    setup.exporterEndpoint = section.valueOr<std::string>(key::endpoint, dflt::endpoint);
 
-    setup.useTls = section.valueOr<int>("use_tls", 0) != 0;
-    setup.tlsCertPath = section.valueOr<std::string>("tls_ca_cert", "");
+    setup.useTls = section.valueOr<int>(key::useTls, 0) != 0;
+    setup.tlsCertPath = section.valueOr<std::string>(key::tlsCaCert, "");
 
-    setup.samplingRatio = section.valueOr<double>("sampling_ratio", 1.0);
-    setup.samplingRatio = std::clamp(setup.samplingRatio, 0.0, 1.0);
+    // Head sampling is intentionally fixed at 1.0 (sample everything) and is
+    // not read from config. A per-node ratio would let nodes make divergent
+    // keep/drop decisions for the same distributed trace, producing broken
+    // traces; volume reduction is delegated to the collector's tail sampling.
+    // setup.samplingRatio is a const member fixed at 1.0; nothing to parse.
 
-    setup.batchSize = section.valueOr<std::uint32_t>("batch_size", 512u);
-    setup.batchDelay =
-        std::chrono::milliseconds{section.valueOr<std::uint32_t>("batch_delay_ms", 5000u)};
-    setup.maxQueueSize = section.valueOr<std::uint32_t>("max_queue_size", 2048u);
+    setup.batchSize = section.valueOr<std::uint32_t>(key::batchSize, dflt::batchSize);
+    setup.batchDelay = std::chrono::milliseconds{
+        section.valueOr<std::uint32_t>(key::batchDelayMs, dflt::batchDelayMs)};
+    setup.maxQueueSize = section.valueOr<std::uint32_t>(key::maxQueueSize, dflt::maxQueueSize);
 
     setup.networkId = networkId;
     setup.networkType = networkTypeFromId(networkId);
 
-    setup.traceTransactions = section.valueOr<int>("trace_transactions", 1) != 0;
-    setup.traceConsensus = section.valueOr<int>("trace_consensus", 1) != 0;
-    setup.traceRpc = section.valueOr<int>("trace_rpc", 1) != 0;
-    setup.tracePeer = section.valueOr<int>("trace_peer", 1) != 0;
-    setup.traceLedger = section.valueOr<int>("trace_ledger", 1) != 0;
+    setup.traceTransactions = section.valueOr<int>(key::traceTransactions, 1) != 0;
+    setup.traceConsensus = section.valueOr<int>(key::traceConsensus, 1) != 0;
+    setup.traceRpc = section.valueOr<int>(key::traceRpc, 1) != 0;
+    setup.tracePeer = section.valueOr<int>(key::tracePeer, 1) != 0;
+    setup.traceLedger = section.valueOr<int>(key::traceLedger, 1) != 0;
 
     return setup;
 }