removed head sampling ratio from config

Signed-off-by: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com>
This commit is contained in:
Pratik Mankawde
2026-06-09 18:45:52 +01:00
parent bb8f7f0e9a
commit fa71280795
5 changed files with 99 additions and 47 deletions

View File

@@ -1648,14 +1648,13 @@ validators.txt
# protobuf-encoded HTTP POST requests to this URL.
# Default: http://localhost:4318/v1/traces.
#
# sampling_ratio=1.0
#
# Head-based sampling ratio using TraceIdRatioBasedSampler. The decision
# to record or drop a trace is made at span creation time, before the
# span starts, based on the trace ID. Values in [0.0, 1.0].
# 1.0 = trace everything, 0.1 = sample ~10% of traces. Default: 1.0.
# For tail-based (post-hoc) filtering — where you decide to drop a span
# after inspecting its content — use SpanGuard::discard() in code.
# Head sampling is intentionally fixed at 1.0 (sample everything) and is
# not configurable. A per-node sampling ratio would let nodes make
# divergent keep/drop decisions for the same distributed trace, producing
# broken/partial traces. A ParentBasedSampler ensures spans inheriting a
# remote parent honor the upstream decision. Reduce volume at the collector
# via tail sampling instead; for node-local post-hoc dropping use
# SpanGuard::discard() in code.
#
# trace_rpc=1
#

View File

@@ -42,23 +42,27 @@
Usage examples:
Span names and attribute keys come from per-module `*SpanNames.h`
headers (e.g. RpcSpanNames.h, TxSpanNames.h) as typed compile-time
constants — never raw string literals — so the naming spec is
enforced at the call site and dashboards stay in sync.
1. Basic RPC tracing (factory method with category):
@code
// Define prefix at class level:
static constexpr std::string_view spanPrefix_ = "rpc.command";
#include <xrpld/rpc/detail/RpcSpanNames.h>
using namespace xrpl::telemetry;
// At the call site:
auto span = SpanGuard::span(
TraceCategory::Rpc, spanPrefix_, "submit");
span.setAttribute("xrpl.rpc.command", "submit");
span.setAttribute("xrpl.rpc.status", "success");
TraceCategory::Rpc, rpc_span::prefix::command, "submit");
span.setAttribute(rpc_span::attr::command, "submit");
span.setAttribute(rpc_span::attr::rpcStatus, rpc_span::val::success);
// span ended automatically on scope exit
@endcode
2. Error recording:
@code
auto span = SpanGuard::span(
TraceCategory::Rpc, "rpc.command", "submit");
TraceCategory::Rpc, rpc_span::prefix::command, "submit");
try {
doWork();
span.setOk();
@@ -69,29 +73,32 @@
3. Cross-thread context propagation:
@code
#include <xrpld/consensus/ConsensusSpanNames.h>
using namespace xrpl::telemetry;
// Thread A: create span and capture context
auto span = SpanGuard::span(
TraceCategory::Consensus, "consensus", "round");
TraceCategory::Consensus, seg::consensus, consensus::span::op::round);
auto ctx = span.captureContext();
// Thread B: create child with captured context
auto child = SpanGuard::childSpan("consensus.accept", ctx);
auto child = SpanGuard::childSpan(consensus::span::accept, ctx);
@endcode
4. Conditional check (rarely needed — methods are no-ops on null):
@code
auto span = SpanGuard::span(
TraceCategory::Rpc, "rpc", "request");
TraceCategory::Rpc, rpc_span::prefix::rpc, rpc_span::op::httpRequest);
if (span) {
// expensive attribute computation only when active
span.setAttribute("xrpl.rpc.payload_size", computeSize());
span.setAttribute(rpc_span::attr::requestPayloadSize, computeSize());
}
@endcode
5. Tail-based filtering via discard():
@code
auto span = SpanGuard::span(
TraceCategory::Transactions, "tx", "process");
TraceCategory::Transactions, tx_span::prefix::tx, tx_span::op::process);
auto result = preflight(tx);
if (result != tesSUCCESS) {
span.discard(); // drop span, never exported

View File

@@ -150,13 +150,16 @@ public:
/** Path to a CA certificate bundle for TLS verification. */
std::string tlsCertPath;
/** Head-based sampling ratio in [0.0, 1.0]. 1.0 = trace everything.
This is a head-based (pre-decision) sampler using
TraceIdRatioBasedSampler — the decision to record or drop a
trace is made before the root span starts. For post-hoc
(tail-based) filtering, see SpanGuard::discard().
/** Head-based sampling ratio. Intentionally fixed at 1.0 (sample
everything) and NOT read from config. A per-node ratio would let
nodes make divergent keep/drop decisions for the same distributed
trace, producing broken/partial traces. The ratio sampler is wrapped
in a ParentBasedSampler (see Telemetry.cpp) so spans inheriting a
remote parent honor the upstream sampled flag. Volume reduction is
delegated to the collector's tail sampling; for node-local post-hoc
dropping see SpanGuard::discard().
*/
double samplingRatio = 1.0;
double const samplingRatio = 1.0;
/** Maximum number of spans per batch export. */
std::uint32_t batchSize = 512;

View File

@@ -30,6 +30,7 @@
#include <opentelemetry/sdk/trace/batch_span_processor_options.h>
#include <opentelemetry/sdk/trace/processor.h>
#include <opentelemetry/sdk/trace/sampler.h>
#include <opentelemetry/sdk/trace/samplers/parent_factory.h>
#include <opentelemetry/sdk/trace/samplers/trace_id_ratio.h>
#include <opentelemetry/sdk/trace/tracer_provider.h>
#include <opentelemetry/sdk/trace/tracer_provider_factory.h>
@@ -299,8 +300,15 @@ public:
{"xrpl.network.type", setup_.networkType},
});
// Configure sampler
auto sampler = std::make_unique<trace_sdk::TraceIdRatioBasedSampler>(setup_.samplingRatio);
// Configure sampler. Head sampling is fixed at 1.0 (sample everything);
// setup_.samplingRatio is not config-driven. Wrap the ratio sampler in a
// ParentBasedSampler so spans with a remote parent honor the upstream
// sampled flag — this keeps keep/drop decisions coherent for a single
// distributed trace spanning multiple nodes. Volume reduction is left to
// the collector's tail sampling.
auto rootSampler =
std::make_shared<trace_sdk::TraceIdRatioBasedSampler>(setup_.samplingRatio);
auto sampler = trace_sdk::ParentBasedSamplerFactory::Create(std::move(rootSampler));
// Create TracerProvider
sdkProvider_ = trace_sdk::TracerProviderFactory::Create(

View File

@@ -8,9 +8,10 @@
*/
#include <xrpl/basics/BasicConfig.h>
#include <xrpl/protocol/SystemParameters.h>
#include <xrpl/telemetry/Telemetry.h>
#include <algorithm>
#include <chrono>
#include <cstdint>
#include <string>
@@ -18,6 +19,38 @@ namespace xrpl::telemetry {
namespace {
/** Names of the keys recognised in the [telemetry] config section.

    Every string here has to stay character-for-character identical to
    the option name documented in cfg/xrpld-example.cfg. The constants
    are deliberately plain `char const*` values: Section::valueOr()
    takes its key as `std::string const&`, so a `char const*` converts
    implicitly, just as a string literal at the call site would.
*/
namespace key {

// Master switch and service identity.
constexpr char const* enabled{"enabled"};
constexpr char const* serviceName{"service_name"};
constexpr char const* serviceInstanceId{"service_instance_id"};

// Exporter endpoint and TLS options.
constexpr char const* endpoint{"endpoint"};
constexpr char const* useTls{"use_tls"};
constexpr char const* tlsCaCert{"tls_ca_cert"};

// Batch export options.
constexpr char const* batchSize{"batch_size"};
constexpr char const* batchDelayMs{"batch_delay_ms"};
constexpr char const* maxQueueSize{"max_queue_size"};

// Per-category trace switches.
constexpr char const* traceTransactions{"trace_transactions"};
constexpr char const* traceConsensus{"trace_consensus"};
constexpr char const* traceRpc{"trace_rpc"};
constexpr char const* tracePeer{"trace_peer"};
constexpr char const* traceLedger{"trace_ledger"};

}  // namespace key
/** Fallback values used for [telemetry] options that the config omits.

    Only options with a non-trivial default are listed here.
*/
namespace dflt {

// Exporter URL used when no `endpoint` key is present.
constexpr char const* endpoint{"http://localhost:4318/v1/traces"};

// Batch export fallbacks; batchDelayMs is a count of milliseconds.
constexpr std::uint32_t batchSize{512u};
constexpr std::uint32_t batchDelayMs{5000u};
constexpr std::uint32_t maxQueueSize{2048u};

}  // namespace dflt
/** Derive a human-readable network type label from the numeric network ID.
@param networkId The network identifier from [network_id] config.
@return "mainnet", "testnet", "devnet", or "unknown" for other values.
@@ -49,33 +82,35 @@ setupTelemetry(
{
Telemetry::Setup setup;
setup.enabled = section.valueOr<int>("enabled", 0) != 0;
setup.serviceName = section.valueOr<std::string>("service_name", "xrpld");
setup.enabled = section.valueOr<int>(key::enabled, 0) != 0;
setup.serviceName = section.valueOr<std::string>(key::serviceName, systemName());
setup.serviceVersion = version;
setup.serviceInstanceId = section.valueOr<std::string>("service_instance_id", nodePublicKey);
setup.serviceInstanceId = section.valueOr<std::string>(key::serviceInstanceId, nodePublicKey);
setup.exporterEndpoint =
section.valueOr<std::string>("endpoint", "http://localhost:4318/v1/traces");
setup.exporterEndpoint = section.valueOr<std::string>(key::endpoint, dflt::endpoint);
setup.useTls = section.valueOr<int>("use_tls", 0) != 0;
setup.tlsCertPath = section.valueOr<std::string>("tls_ca_cert", "");
setup.useTls = section.valueOr<int>(key::useTls, 0) != 0;
setup.tlsCertPath = section.valueOr<std::string>(key::tlsCaCert, "");
setup.samplingRatio = section.valueOr<double>("sampling_ratio", 1.0);
setup.samplingRatio = std::clamp(setup.samplingRatio, 0.0, 1.0);
// Head sampling is intentionally fixed at 1.0 (sample everything) and is
// not read from config. A per-node ratio would let nodes make divergent
// keep/drop decisions for the same distributed trace, producing broken
// traces; volume reduction is delegated to the collector's tail sampling.
// setup.samplingRatio is a const member fixed at 1.0; nothing to parse.
setup.batchSize = section.valueOr<std::uint32_t>("batch_size", 512u);
setup.batchDelay =
std::chrono::milliseconds{section.valueOr<std::uint32_t>("batch_delay_ms", 5000u)};
setup.maxQueueSize = section.valueOr<std::uint32_t>("max_queue_size", 2048u);
setup.batchSize = section.valueOr<std::uint32_t>(key::batchSize, dflt::batchSize);
setup.batchDelay = std::chrono::milliseconds{
section.valueOr<std::uint32_t>(key::batchDelayMs, dflt::batchDelayMs)};
setup.maxQueueSize = section.valueOr<std::uint32_t>(key::maxQueueSize, dflt::maxQueueSize);
setup.networkId = networkId;
setup.networkType = networkTypeFromId(networkId);
setup.traceTransactions = section.valueOr<int>("trace_transactions", 1) != 0;
setup.traceConsensus = section.valueOr<int>("trace_consensus", 1) != 0;
setup.traceRpc = section.valueOr<int>("trace_rpc", 1) != 0;
setup.tracePeer = section.valueOr<int>("trace_peer", 1) != 0;
setup.traceLedger = section.valueOr<int>("trace_ledger", 1) != 0;
setup.traceTransactions = section.valueOr<int>(key::traceTransactions, 1) != 0;
setup.traceConsensus = section.valueOr<int>(key::traceConsensus, 1) != 0;
setup.traceRpc = section.valueOr<int>(key::traceRpc, 1) != 0;
setup.tracePeer = section.valueOr<int>(key::tracePeer, 1) != 0;
setup.traceLedger = section.valueOr<int>(key::traceLedger, 1) != 0;
return setup;
}