From fa71280795f5776626ffe706e6d1fe93d65eb840 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Tue, 9 Jun 2026 18:45:52 +0100 Subject: [PATCH] removed head sampling ratio from config Signed-off-by: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> --- cfg/xrpld-example.cfg | 15 +++-- include/xrpl/telemetry/SpanGuard.h | 31 ++++++---- include/xrpl/telemetry/Telemetry.h | 15 +++-- src/libxrpl/telemetry/Telemetry.cpp | 12 +++- src/libxrpl/telemetry/TelemetryConfig.cpp | 73 +++++++++++++++++------ 5 files changed, 99 insertions(+), 47 deletions(-) diff --git a/cfg/xrpld-example.cfg b/cfg/xrpld-example.cfg index 34fa023e9e..f86074ae66 100644 --- a/cfg/xrpld-example.cfg +++ b/cfg/xrpld-example.cfg @@ -1648,14 +1648,13 @@ validators.txt # protobuf-encoded HTTP POST requests to this URL. # Default: http://localhost:4318/v1/traces. # -# sampling_ratio=1.0 -# -# Head-based sampling ratio using TraceIdRatioBasedSampler. The decision -# to record or drop a trace is made at span creation time, before the -# span starts, based on the trace ID. Values in [0.0, 1.0]. -# 1.0 = trace everything, 0.1 = sample ~10% of traces. Default: 1.0. -# For tail-based (post-hoc) filtering — where you decide to drop a span -# after inspecting its content — use SpanGuard::discard() in code. +# Head sampling is intentionally fixed at 1.0 (sample everything) and is +# not configurable. A per-node sampling ratio would let nodes make +# divergent keep/drop decisions for the same distributed trace, producing +# broken/partial traces. A ParentBasedSampler ensures spans inheriting a +# remote parent honor the upstream decision. Reduce volume at the collector +# via tail sampling instead; for node-local post-hoc dropping use +# SpanGuard::discard() in code. # # trace_rpc=1 # diff --git a/include/xrpl/telemetry/SpanGuard.h b/include/xrpl/telemetry/SpanGuard.h index 8ff955615c..d5e3a5781f 100644 --- a/include/xrpl/telemetry/SpanGuard.h +++ b/include/xrpl/telemetry/SpanGuard.h @@ -42,23 +42,27 @@ Usage examples: + Span names and attribute keys come from per-module `*SpanNames.h` + headers (e.g. RpcSpanNames.h, TxSpanNames.h) as typed compile-time + constants — never raw string literals — so the naming spec is + enforced at the call site and dashboards stay in sync. + 1. Basic RPC tracing (factory method with category): @code - // Define prefix at class level: - static constexpr std::string_view spanPrefix_ = "rpc.command"; + #include + using namespace xrpl::telemetry; - // At the call site: auto span = SpanGuard::span( - TraceCategory::Rpc, spanPrefix_, "submit"); - span.setAttribute("xrpl.rpc.command", "submit"); - span.setAttribute("xrpl.rpc.status", "success"); + TraceCategory::Rpc, rpc_span::prefix::command, "submit"); + span.setAttribute(rpc_span::attr::command, "submit"); + span.setAttribute(rpc_span::attr::rpcStatus, rpc_span::val::success); // span ended automatically on scope exit @endcode 2. Error recording: @code auto span = SpanGuard::span( - TraceCategory::Rpc, "rpc.command", "submit"); + TraceCategory::Rpc, rpc_span::prefix::command, "submit"); try { doWork(); span.setOk(); @@ -69,29 +73,32 @@ 3. Cross-thread context propagation: @code + #include + using namespace xrpl::telemetry; + // Thread A: create span and capture context auto span = SpanGuard::span( - TraceCategory::Consensus, "consensus", "round"); + TraceCategory::Consensus, seg::consensus, consensus::span::op::round); auto ctx = span.captureContext(); // Thread B: create child with captured context - auto child = SpanGuard::childSpan("consensus.accept", ctx); + auto child = SpanGuard::childSpan(consensus::span::accept, ctx); @endcode 4. Conditional check (rarely needed — methods are no-ops on null): @code auto span = SpanGuard::span( - TraceCategory::Rpc, "rpc", "request"); + TraceCategory::Rpc, rpc_span::prefix::rpc, rpc_span::op::httpRequest); if (span) { // expensive attribute computation only when active - span.setAttribute("xrpl.rpc.payload_size", computeSize()); + span.setAttribute(rpc_span::attr::requestPayloadSize, computeSize()); } @endcode 5. Tail-based filtering via discard(): @code auto span = SpanGuard::span( - TraceCategory::Transactions, "tx", "process"); + TraceCategory::Transactions, tx_span::prefix::tx, tx_span::op::process); auto result = preflight(tx); if (result != tesSUCCESS) { span.discard(); // drop span, never exported diff --git a/include/xrpl/telemetry/Telemetry.h b/include/xrpl/telemetry/Telemetry.h index 4ecd6e76a2..c6f92f4da8 100644 --- a/include/xrpl/telemetry/Telemetry.h +++ b/include/xrpl/telemetry/Telemetry.h @@ -150,13 +150,16 @@ public: /** Path to a CA certificate bundle for TLS verification. */ std::string tlsCertPath; - /** Head-based sampling ratio in [0.0, 1.0]. 1.0 = trace everything. - This is a head-based (pre-decision) sampler using - TraceIdRatioBasedSampler — the decision to record or drop a - trace is made before the root span starts. For post-hoc - (tail-based) filtering, see SpanGuard::discard(). + /** Head-based sampling ratio. Intentionally fixed at 1.0 (sample + everything) and NOT read from config. A per-node ratio would let + nodes make divergent keep/drop decisions for the same distributed + trace, producing broken/partial traces. The ratio sampler is wrapped + in a ParentBasedSampler (see Telemetry.cpp) so spans inheriting a + remote parent honor the upstream sampled flag. Volume reduction is + delegated to the collector's tail sampling; for node-local post-hoc + dropping see SpanGuard::discard(). */ - double samplingRatio = 1.0; + double const samplingRatio = 1.0; /** Maximum number of spans per batch export. */ std::uint32_t batchSize = 512; diff --git a/src/libxrpl/telemetry/Telemetry.cpp b/src/libxrpl/telemetry/Telemetry.cpp index 2721d749a6..c4425bda84 100644 --- a/src/libxrpl/telemetry/Telemetry.cpp +++ b/src/libxrpl/telemetry/Telemetry.cpp @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -299,8 +300,15 @@ public: {"xrpl.network.type", setup_.networkType}, }); - // Configure sampler - auto sampler = std::make_unique(setup_.samplingRatio); + // Configure sampler. Head sampling is fixed at 1.0 (sample everything); + // setup_.samplingRatio is not config-driven. Wrap the ratio sampler in a + // ParentBasedSampler so spans with a remote parent honor the upstream + // sampled flag — this keeps keep/drop decisions coherent for a single + // distributed trace spanning multiple nodes. Volume reduction is left to + // the collector's tail sampling. + auto rootSampler = + std::make_shared(setup_.samplingRatio); + auto sampler = trace_sdk::ParentBasedSamplerFactory::Create(std::move(rootSampler)); // Create TracerProvider sdkProvider_ = trace_sdk::TracerProviderFactory::Create( diff --git a/src/libxrpl/telemetry/TelemetryConfig.cpp b/src/libxrpl/telemetry/TelemetryConfig.cpp index 39f0206ce1..d90e1df05d 100644 --- a/src/libxrpl/telemetry/TelemetryConfig.cpp +++ b/src/libxrpl/telemetry/TelemetryConfig.cpp @@ -8,9 +8,10 @@ */ #include +#include #include -#include +#include #include #include @@ -18,6 +19,38 @@ namespace xrpl::telemetry { namespace { +/** Config key names for the [telemetry] section. + + Each must match the corresponding option documented in + cfg/xrpld-example.cfg verbatim. Defined as `char const*` so they + pass to Section::valueOr() (which takes `std::string const&`) + without an explicit conversion, exactly as a literal would. +*/ +namespace key { +constexpr char const* enabled = "enabled"; +constexpr char const* serviceName = "service_name"; +constexpr char const* serviceInstanceId = "service_instance_id"; +constexpr char const* endpoint = "endpoint"; +constexpr char const* useTls = "use_tls"; +constexpr char const* tlsCaCert = "tls_ca_cert"; +constexpr char const* batchSize = "batch_size"; +constexpr char const* batchDelayMs = "batch_delay_ms"; +constexpr char const* maxQueueSize = "max_queue_size"; +constexpr char const* traceTransactions = "trace_transactions"; +constexpr char const* traceConsensus = "trace_consensus"; +constexpr char const* traceRpc = "trace_rpc"; +constexpr char const* tracePeer = "trace_peer"; +constexpr char const* traceLedger = "trace_ledger"; +} // namespace key + +/** Default values applied when a key is absent from the config. */ +namespace dflt { +constexpr char const* endpoint = "http://localhost:4318/v1/traces"; +constexpr std::uint32_t batchSize = 512u; +constexpr std::uint32_t batchDelayMs = 5000u; +constexpr std::uint32_t maxQueueSize = 2048u; +} // namespace dflt + /** Derive a human-readable network type label from the numeric network ID. @param networkId The network identifier from [network_id] config. @return "mainnet", "testnet", "devnet", or "unknown" for other values. @@ -49,33 +82,35 @@ setupTelemetry( { Telemetry::Setup setup; - setup.enabled = section.valueOr("enabled", 0) != 0; - setup.serviceName = section.valueOr("service_name", "xrpld"); + setup.enabled = section.valueOr(key::enabled, 0) != 0; + setup.serviceName = section.valueOr(key::serviceName, systemName()); setup.serviceVersion = version; - setup.serviceInstanceId = section.valueOr("service_instance_id", nodePublicKey); + setup.serviceInstanceId = section.valueOr(key::serviceInstanceId, nodePublicKey); - setup.exporterEndpoint = - section.valueOr("endpoint", "http://localhost:4318/v1/traces"); + setup.exporterEndpoint = section.valueOr(key::endpoint, dflt::endpoint); - setup.useTls = section.valueOr("use_tls", 0) != 0; - setup.tlsCertPath = section.valueOr("tls_ca_cert", ""); + setup.useTls = section.valueOr(key::useTls, 0) != 0; + setup.tlsCertPath = section.valueOr(key::tlsCaCert, ""); - setup.samplingRatio = section.valueOr("sampling_ratio", 1.0); - setup.samplingRatio = std::clamp(setup.samplingRatio, 0.0, 1.0); + // Head sampling is intentionally fixed at 1.0 (sample everything) and is + // not read from config. A per-node ratio would let nodes make divergent + // keep/drop decisions for the same distributed trace, producing broken + // traces; volume reduction is delegated to the collector's tail sampling. + // setup.samplingRatio is a const member fixed at 1.0; nothing to parse. - setup.batchSize = section.valueOr("batch_size", 512u); - setup.batchDelay = - std::chrono::milliseconds{section.valueOr("batch_delay_ms", 5000u)}; - setup.maxQueueSize = section.valueOr("max_queue_size", 2048u); + setup.batchSize = section.valueOr(key::batchSize, dflt::batchSize); + setup.batchDelay = std::chrono::milliseconds{ + section.valueOr(key::batchDelayMs, dflt::batchDelayMs)}; + setup.maxQueueSize = section.valueOr(key::maxQueueSize, dflt::maxQueueSize); setup.networkId = networkId; setup.networkType = networkTypeFromId(networkId); - setup.traceTransactions = section.valueOr("trace_transactions", 1) != 0; - setup.traceConsensus = section.valueOr("trace_consensus", 1) != 0; - setup.traceRpc = section.valueOr("trace_rpc", 1) != 0; - setup.tracePeer = section.valueOr("trace_peer", 1) != 0; - setup.traceLedger = section.valueOr("trace_ledger", 1) != 0; + setup.traceTransactions = section.valueOr(key::traceTransactions, 1) != 0; + setup.traceConsensus = section.valueOr(key::traceConsensus, 1) != 0; + setup.traceRpc = section.valueOr(key::traceRpc, 1) != 0; + setup.tracePeer = section.valueOr(key::tracePeer, 1) != 0; + setup.traceLedger = section.valueOr(key::traceLedger, 1) != 0; return setup; }