diff --git a/.github/scripts/levelization/results/ordering.txt b/.github/scripts/levelization/results/ordering.txt index 63e46c779d..a8d9e8fb3d 100644 --- a/.github/scripts/levelization/results/ordering.txt +++ b/.github/scripts/levelization/results/ordering.txt @@ -236,6 +236,8 @@ xrpl.shamap > xrpl.basics xrpl.shamap > xrpl.nodestore xrpl.shamap > xrpl.protocol xrpl.telemetry > xrpl.basics +xrpl.telemetry > xrpld.consensus +xrpl.telemetry > xrpld.rpc xrpl.tx > xrpl.basics xrpl.tx > xrpl.core xrpl.tx > xrpl.ledger diff --git a/OpenTelemetryPlan/00-tracing-fundamentals.md b/OpenTelemetryPlan/00-tracing-fundamentals.md index 24322bdd09..d7cc40c5ac 100644 --- a/OpenTelemetryPlan/00-tracing-fundamentals.md +++ b/OpenTelemetryPlan/00-tracing-fundamentals.md @@ -514,12 +514,19 @@ Not every trace needs to be recorded. **Sampling** reduces overhead: ### Head Sampling (at trace start) ``` -Request arrives → Random 10% chance → Record or skip entire trace +Request arrives → Random N% chance → Record or skip entire trace ``` - ✅ Low overhead - ❌ May miss interesting traces +> **xrpld note**: xrpld intentionally fixes head sampling at 100% (sample +> everything) and does not expose a configurable ratio. A per-node ratio +> would let different nodes make divergent keep/drop decisions for the same +> distributed trace, producing broken/partial traces. xrpld uses a +> `ParentBased` sampler so spans with a remote parent honor the upstream +> decision. Volume reduction is delegated to collector-side tail sampling. 
+ ### Tail Sampling (after trace completes) ``` diff --git a/OpenTelemetryPlan/04-code-samples.md b/OpenTelemetryPlan/04-code-samples.md index 8d38f69333..cd5c811f7b 100644 --- a/OpenTelemetryPlan/04-code-samples.md +++ b/OpenTelemetryPlan/04-code-samples.md @@ -53,8 +53,10 @@ public: bool useTls = false; std::string tlsCertPath; - // Sampling configuration - double samplingRatio = 1.0; // 1.0 = 100% sampling + // Head sampling: fixed at 1.0 (sample everything), not config-driven. + // Keeps trace keep/drop decisions coherent across nodes; volume + // reduction is delegated to the collector's tail sampling. + double samplingRatio = 1.0; // Batch processor settings std::uint32_t batchSize = 512; diff --git a/OpenTelemetryPlan/05-configuration-reference.md b/OpenTelemetryPlan/05-configuration-reference.md index 2b88cefbf3..3db4749ce2 100644 --- a/OpenTelemetryPlan/05-configuration-reference.md +++ b/OpenTelemetryPlan/05-configuration-reference.md @@ -37,12 +37,11 @@ Add to `cfg/xrpld-example.cfg`: # # Path to CA certificate for TLS (optional) # # tls_ca_cert=/path/to/ca.crt # -# # Sampling ratio: 0.0-1.0 (default: 1.0 = 100% sampling) -# # Use lower values in production to reduce overhead -# # Default: 1.0 (all traces). For production deployments with high -# # throughput, 0.1 (10%) is recommended to reduce overhead. -# # See Section 7.4.2 for sampling strategy details. -# sampling_ratio=0.1 +# # Head sampling is intentionally fixed at 1.0 (sample everything) and is +# # NOT configurable. A per-node head-sampling ratio would let nodes make +# # divergent keep/drop decisions for the same distributed trace, producing +# # broken/partial traces across the network. Volume reduction is delegated +# # to the collector's tail sampling instead. See Section 7.4.2. 
# # # Batch processor settings # batch_size=512 # Spans per batch (default: 512) @@ -86,7 +85,6 @@ enabled=0 | `endpoint` | string | `http://localhost:4318/v1/traces` | OTLP/HTTP collector endpoint | | `use_tls` | bool | `false` | Enable TLS for exporter connection | | `tls_ca_cert` | string | `""` | Path to CA certificate file | -| `sampling_ratio` | float | `1.0` | Sampling ratio (0.0-1.0) | | `batch_size` | uint | `512` | Spans per export batch | | `batch_delay_ms` | uint | `5000` | Max delay before sending batch (ms) | | `max_queue_size` | uint | `2048` | Maximum queued spans | @@ -153,13 +151,8 @@ setupTelemetry( setup.useTls = section.value_or("use_tls", false); setup.tlsCertPath = section.value_or("tls_ca_cert", ""); - // Sampling - setup.samplingRatio = section.value_or("sampling_ratio", 1.0); - if (setup.samplingRatio < 0.0 || setup.samplingRatio > 1.0) - { - Throw( - "telemetry.sampling_ratio must be between 0.0 and 1.0"); - } + // Head sampling is fixed at 1.0 (sample everything) and is not read from + // config — see Section 7.4.2. setup.samplingRatio stays at its 1.0 default. // Batch processor setup.batchSize = section.value_or("batch_size", 512u); diff --git a/OpenTelemetryPlan/07-observability-backends.md b/OpenTelemetryPlan/07-observability-backends.md index a1c303b545..5d1638670a 100644 --- a/OpenTelemetryPlan/07-observability-backends.md +++ b/OpenTelemetryPlan/07-observability-backends.md @@ -171,7 +171,7 @@ flowchart TB ```mermaid flowchart LR subgraph head["Head Sampling (Node)"] - hs[Node-level head sampling<br/>configurable, default: 100%<br/>recommended production: 10%] + hs[Node-level head sampling<br/>fixed at 100%<br/>not configurable] end subgraph tail["Tail Sampling (Collector)"] @@ -197,7 +197,7 @@ flowchart LR **Reading the diagram:** -- **Head Sampling (Node)**: The first filter -- each xrpld node decides whether to sample a trace at creation time (default 100%, recommended 10% in production). This controls the volume leaving the node. +- **Head Sampling (Node)**: xrpld pins head sampling at 100% (sample everything) and does not expose a configurable ratio. This is intentional: a per-node ratio would let different nodes make divergent keep/drop decisions for the same distributed trace, producing broken/partial traces. xrpld uses a `ParentBased` sampler so spans inheriting a remote parent honor the upstream decision. Volume reduction is delegated to the collector's tail sampling. - **Tail Sampling (Collector)**: The second filter -- the collector inspects completed traces and applies rules: keep all errors, keep anything slower than 5 seconds, and keep 10% of the remainder. - **Arrow head → tail**: All head-sampled traces flow to the collector, where tail sampling further reduces volume while preserving the most valuable data. - **Final Traces**: The output after both sampling stages; this is what gets stored and queried. The two-stage approach balances cost with debuggability. diff --git a/OpenTelemetryPlan/OpenTelemetryPlan.md b/OpenTelemetryPlan/OpenTelemetryPlan.md index f606bde86c..82af33dbf4 100644 --- a/OpenTelemetryPlan/OpenTelemetryPlan.md +++ b/OpenTelemetryPlan/OpenTelemetryPlan.md @@ -152,7 +152,7 @@ Span naming follows a hierarchical `.` convention (e.g., ` The telemetry code is organized under `include/xrpl/telemetry/` for headers and `src/libxrpl/telemetry/` for implementation.
Key principles include RAII-based span management via `SpanGuard` (with `discard()` for dropping unwanted spans), a `FilteringSpanProcessor` that intercepts `OnEnd()` to prevent discarded spans from entering the export pipeline, conditional compilation with `XRPL_ENABLE_TELEMETRY`, and minimal runtime overhead through batch processing and efficient sampling. -Performance optimization strategies include probabilistic head sampling (10% default), tail-based sampling at the collector for errors and slow traces, batch export to reduce network overhead, and conditional instrumentation that compiles to no-ops when disabled. +Performance optimization strategies include head sampling fixed at 100% (intentionally not configurable, so trace keep/drop decisions stay coherent across nodes), tail-based sampling at the collector to reduce volume while still retaining errors and slow traces, batch export to reduce network overhead, and conditional instrumentation that compiles to no-ops when disabled. ➡️ **[Read full Implementation Strategy](./03-implementation-strategy.md)** diff --git a/cfg/xrpld-example.cfg b/cfg/xrpld-example.cfg index 9be3aa32f5..836e6d10aa 100644 --- a/cfg/xrpld-example.cfg +++ b/cfg/xrpld-example.cfg @@ -1664,14 +1664,13 @@ validators.txt # Path to a PEM-encoded CA certificate bundle for TLS verification. # Only used when use_tls=1. Default: empty (system CA store). # -# sampling_ratio=1.0 -# -# Head-based sampling ratio using TraceIdRatioBasedSampler. The decision -# to record or drop a trace is made at span creation time, before the -# span starts, based on the trace ID. Values in [0.0, 1.0]. -# 1.0 = trace everything, 0.1 = sample ~10% of traces. Default: 1.0. -# For tail-based (post-hoc) filtering — where you decide to drop a span -# after inspecting its content — use SpanGuard::discard() in code. +# Head sampling is intentionally fixed at 1.0 (sample everything) and is +# not configurable.
A per-node sampling ratio would let nodes make +# divergent keep/drop decisions for the same distributed trace, producing +# broken/partial traces. A ParentBasedSampler ensures spans inheriting a +# remote parent honor the upstream decision. Reduce volume at the collector +# via tail sampling instead; for node-local post-hoc dropping use +# SpanGuard::discard() in code. # # trace_rpc=1 # diff --git a/docs/build/telemetry.md b/docs/build/telemetry.md index 1b4beae2ec..fe5f7934aa 100644 --- a/docs/build/telemetry.md +++ b/docs/build/telemetry.md @@ -10,22 +10,11 @@ This document explains how to build xrpld with OpenTelemetry distributed tracing - [Install dependencies](#install-dependencies) - [Call CMake](#call-cmake) - [Build](#build) - - [Building without telemetry](#building-without-telemetry) - - [Runtime Configuration](#runtime-configuration) - - [Configuration options](#configuration-options) - - [Observability Stack](#observability-stack) - - [Start the stack](#start-the-stack) - - [Verify the stack](#verify-the-stack) - - [View traces in Grafana Explore](#view-traces-in-grafana-explore) - - [Running Tests](#running-tests) + - [Building without telemetry](#building-without-telemetry) - [Troubleshooting](#troubleshooting) - - [No traces appear in Grafana](#no-traces-appear-in-grafana) - [Conan lockfile error](#conan-lockfile-error) - [CMake target not found](#cmake-target-not-found) - - [Architecture](#architecture) - - [Key files](#key-files) - - [Span discard mechanism](#span-discard-mechanism) - - [Conditional compilation](#conditional-compilation) + - [Conditional compilation](#conditional-compilation) ## Overview @@ -106,7 +95,7 @@ You should see in the CMake output: cmake --build . --parallel $(nproc) ``` -### Building without telemetry +## Building without telemetry Omit the `-o telemetry=True` option (or pass `-o telemetry=False`). 
The `opentelemetry-cpp` dependency will not be downloaded, @@ -114,127 +103,8 @@ the `XRPL_ENABLE_TELEMETRY` preprocessor define will not be set, and all tracing macros will compile to no-ops. The resulting binary is identical to one built before telemetry support was added. -## Runtime Configuration - -Add a `[telemetry]` section to your `xrpld.cfg` file: - -```ini -[telemetry] -enabled=1 -endpoint=http://localhost:4318/v1/traces -sampling_ratio=1.0 -trace_rpc=1 -trace_transactions=1 -trace_consensus=1 -trace_peer=1 -trace_ledger=1 -``` - -### Configuration options - -| Option | Type | Default | Description | -| --------------------- | ------ | --------------------------------- | -------------------------------------------------- | -| `enabled` | int | `0` | Enable (`1`) or disable (`0`) telemetry at runtime | -| `service_name` | string | `xrpld` | Service name reported in traces | -| `service_instance_id` | string | node public key | Unique instance identifier | -| `endpoint` | string | `http://localhost:4318/v1/traces` | OTLP/HTTP collector endpoint | -| `use_tls` | int | `0` | Enable TLS for the exporter connection | -| `tls_ca_cert` | string | (empty) | Path to CA certificate for TLS | -| `sampling_ratio` | double | `1.0` | Head-based sampling ratio (`0.0` to `1.0`) | -| `batch_size` | uint32 | `512` | Maximum spans per export batch | -| `batch_delay_ms` | uint32 | `5000` | Maximum delay (ms) before flushing a batch | -| `max_queue_size` | uint32 | `2048` | Maximum spans queued in memory | -| `trace_rpc` | int | `1` | Enable RPC request tracing | -| `trace_transactions` | int | `1` | Enable transaction lifecycle tracing | -| `trace_consensus` | int | `1` | Enable consensus round tracing | -| `trace_peer` | int | `1` | Enable peer message tracing (high volume) | -| `trace_ledger` | int | `1` | Enable ledger close tracing | - -## Observability Stack - -A Docker Compose stack is provided in `docker/telemetry/` with three services: - -| Service | Port | Purpose | 
-| ------------------ | ---------------------------------------------- | --------------------------------------------------- | -| **OTel Collector** | `4317` (gRPC), `4318` (HTTP), `13133` (health) | Receives OTLP spans, batches, and forwards to Tempo | -| **Tempo** | `3200` (HTTP API) | Trace storage backend | -| **Grafana** | `3000` | Dashboards (Tempo pre-configured as datasource) | - -### Start the stack - -```bash -docker compose -f docker/telemetry/docker-compose.yml up -d -``` - -### Verify the stack - -```bash -# Collector health -curl http://localhost:13133 - -# Grafana (Explore -> Tempo for traces) -open http://localhost:3000 -``` - -### View traces in Grafana Explore - -1. Open `http://localhost:3000` in a browser. -2. Navigate to **Explore** and select the **Tempo** datasource. -3. Use **Search** or **TraceQL** to find traces by service name (e.g. `xrpld`). -4. Click into any trace to see the span tree and attributes. - -Traced RPC operations produce a span hierarchy like: - -``` -rpc.request - └── rpc.command.server_info (command=server_info, rpc_status=success) -``` - -Each span includes attributes: - -- `command` — the RPC method name -- `version` — API version -- `rpc_role` — `admin` or `user` -- `rpc_status` — `success` or `error` - -## Running Tests - -Unit tests run with the telemetry-enabled build regardless of whether the -observability stack is running. When no collector is available, the exporter -silently drops spans with no impact on test results. 
- -```bash -# Run all RPC tests -./xrpld --unittest=RPCCall,ServerInfo,AccountTx,LedgerRPC,Transaction --unittest-jobs $(nproc) - -# Run the full test suite -./xrpld --unittest --unittest-jobs $(nproc) -``` - -To generate traces during manual testing, start xrpld in standalone mode: - -```bash -./xrpld --conf /path/to/xrpld.cfg --standalone --start -``` - -Then send RPC requests: - -```bash -curl -s -X POST http://127.0.0.1:5005/ \ - -H "Content-Type: application/json" \ - -d '{"method":"server_info","params":[{}]}' -``` - ## Troubleshooting -### No traces appear in Grafana - -1. Confirm the OTel Collector is running: `docker compose -f docker/telemetry/docker-compose.yml ps` -2. Check collector logs for errors: `docker compose -f docker/telemetry/docker-compose.yml logs otel-collector` -3. Confirm `[telemetry] enabled=1` is set in the xrpld config. -4. Confirm `endpoint` points to the correct collector address (`http://localhost:4318/v1/traces`). -5. Wait for the batch delay to elapse (default `5000` ms) before checking Grafana Explore. - ### Conan lockfile error If you see `ERROR: Requirement 'opentelemetry-cpp/1.26.0' not in lockfile 'requires'`, @@ -249,48 +119,7 @@ Conan-generated toolchain file is being used. The Conan package provides a single umbrella target `opentelemetry-cpp::opentelemetry-cpp` (not individual component targets). 
-## Architecture - -### Key files - -| File | Purpose | -| --------------------------------------------- | ------------------------------------------------------------ | -| `include/xrpl/telemetry/Telemetry.h` | Abstract telemetry interface and `Setup` struct | -| `include/xrpl/telemetry/SpanGuard.h` | RAII span guard with `discard()` for dropping unwanted spans | -| `include/xrpl/telemetry/DiscardFlag.h` | Thread-local discard flag (zero-dependency header) | -| `src/libxrpl/telemetry/Telemetry.cpp` | OTel SDK setup, `FilteringSpanProcessor`, provider lifecycle | -| `src/libxrpl/telemetry/TelemetryConfig.cpp` | Config parser (`setupTelemetry()`) | -| `src/libxrpl/telemetry/NullTelemetry.cpp` | No-op implementation (used when disabled) | -| `src/libxrpl/telemetry/SpanGuard.cpp` | Pimpl implementation for SpanGuard (all OTel types confined) | -| `src/xrpld/rpc/detail/ServerHandler.cpp` | RPC entry point instrumentation | -| `src/xrpld/rpc/detail/RPCHandler.cpp` | Per-command instrumentation | -| `docker/telemetry/docker-compose.yml` | Observability stack (Collector + Tempo + Grafana) | -| `docker/telemetry/otel-collector-config.yaml` | OTel Collector pipeline configuration | - -### Span discard mechanism - -`SpanGuard::discard()` allows callers to silently drop spans that turn out to be -uninteresting (e.g., failed preflight transactions). This saves both network bandwidth -and storage by preventing the span from being exported. - -The mechanism uses a thread-local flag (`tl_discardCurrentSpan` in `DiscardFlag.h`) as a -side-channel to the `FilteringSpanProcessor` (in `Telemetry.cpp`): - -1. `SpanGuard::discard()` sets the thread-local flag and calls `Span::End()` -2. The OTel SDK calls `FilteringSpanProcessor::OnEnd()` synchronously on the same thread -3. 
The processor checks the flag, clears it, and drops the span before it enters the batch queue - -```cpp -SpanGuard guard(telemetry.startSpan("tx.process")); -auto result = preflight(tx); -if (result != tesSUCCESS) -{ - guard.discard(); // span is dropped, never exported - return result; -} -``` - -### Conditional compilation +## Conditional compilation All OpenTelemetry SDK types are hidden behind the pimpl idiom in `SpanGuard.cpp`. When `XRPL_ENABLE_TELEMETRY` is not defined, `SpanGuard.h` provides an all-inline diff --git a/include/xrpl/telemetry/SpanGuard.h b/include/xrpl/telemetry/SpanGuard.h index d944e12406..864d3aba54 100644 --- a/include/xrpl/telemetry/SpanGuard.h +++ b/include/xrpl/telemetry/SpanGuard.h @@ -45,11 +45,16 @@ Usage examples: + Span names and attribute keys come from per-module `*SpanNames.h` + headers (e.g. RpcSpanNames.h, TxSpanNames.h) as typed compile-time + constants — never raw string literals — so the naming spec is + enforced at the call site and dashboards stay in sync. + 1. Basic RPC tracing (factory method with category): @code #include + using namespace xrpl::telemetry; - // At the call site (constants from RpcSpanNames.h): auto span = SpanGuard::span( TraceCategory::Rpc, rpc_span::prefix::command, "submit"); span.setAttribute(rpc_span::attr::command, "submit"); @@ -71,19 +76,22 @@ 3. Cross-thread context propagation: @code + #include + using namespace xrpl::telemetry; + // Thread A: create span and capture context auto span = SpanGuard::span( - TraceCategory::Consensus, seg::consensus, "round"); + TraceCategory::Consensus, seg::consensus, consensus::span::op::round); auto ctx = span.captureContext(); // Thread B: create child with captured context - auto child = SpanGuard::childSpan("consensus.accept", ctx); + auto child = SpanGuard::childSpan(consensus::span::accept, ctx); @endcode 4. 
Conditional check (rarely needed — methods are no-ops on null): @code auto span = SpanGuard::span( - TraceCategory::Rpc, rpc_span::prefix::rpc, "request"); + TraceCategory::Rpc, rpc_span::prefix::rpc, rpc_span::op::httpRequest); if (span) { // expensive attribute computation only when active span.setAttribute(rpc_span::attr::requestPayloadSize, computeSize()); @@ -93,7 +101,7 @@ 5. Tail-based filtering via discard(): @code auto span = SpanGuard::span( - TraceCategory::Transactions, seg::tx, "process"); + TraceCategory::Transactions, tx_span::prefix::tx, tx_span::op::process); auto result = preflight(tx); if (result != tesSUCCESS) { span.discard(); // drop span, never exported diff --git a/include/xrpl/telemetry/Telemetry.h b/include/xrpl/telemetry/Telemetry.h index 4bdfd0c840..2ba3de8409 100644 --- a/include/xrpl/telemetry/Telemetry.h +++ b/include/xrpl/telemetry/Telemetry.h @@ -163,13 +163,16 @@ public: /** Path to a CA certificate bundle for TLS verification. */ std::string tlsCertPath; - /** Head-based sampling ratio in [0.0, 1.0]. 1.0 = trace everything. - This is a head-based (pre-decision) sampler using - TraceIdRatioBasedSampler — the decision to record or drop a - trace is made before the root span starts. For post-hoc - (tail-based) filtering, see SpanGuard::discard(). + /** Head-based sampling ratio. Intentionally fixed at 1.0 (sample + everything) and NOT read from config. A per-node ratio would let + nodes make divergent keep/drop decisions for the same distributed + trace, producing broken/partial traces. The ratio sampler is wrapped + in a ParentBasedSampler (see Telemetry.cpp) so spans inheriting a + remote parent honor the upstream sampled flag. Volume reduction is + delegated to the collector's tail sampling; for node-local post-hoc + dropping see SpanGuard::discard(). */ - double samplingRatio = 1.0; + double const samplingRatio = 1.0; /** Maximum number of spans per batch export. 
*/ std::uint32_t batchSize = 512; diff --git a/src/libxrpl/telemetry/Telemetry.cpp b/src/libxrpl/telemetry/Telemetry.cpp index 2a61b5a7e5..4338746789 100644 --- a/src/libxrpl/telemetry/Telemetry.cpp +++ b/src/libxrpl/telemetry/Telemetry.cpp @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -301,8 +302,15 @@ public: {std::string(attr::networkType), setup_.networkType}, // LCOV_EXCL_LINE }); - // Configure sampler - auto sampler = std::make_unique(setup_.samplingRatio); + // Configure sampler. Head sampling is fixed at 1.0 (sample everything); + // setup_.samplingRatio is not config-driven. Wrap the ratio sampler in a + // ParentBasedSampler so spans with a remote parent honor the upstream + // sampled flag — this keeps keep/drop decisions coherent for a single + // distributed trace spanning multiple nodes. Volume reduction is left to + // the collector's tail sampling. + auto rootSampler = + std::make_shared(setup_.samplingRatio); + auto sampler = trace_sdk::ParentBasedSamplerFactory::Create(std::move(rootSampler)); // Create TracerProvider sdkProvider_ = trace_sdk::TracerProviderFactory::Create( diff --git a/src/libxrpl/telemetry/TelemetryConfig.cpp b/src/libxrpl/telemetry/TelemetryConfig.cpp index 39f0206ce1..0863623f66 100644 --- a/src/libxrpl/telemetry/TelemetryConfig.cpp +++ b/src/libxrpl/telemetry/TelemetryConfig.cpp @@ -10,7 +10,7 @@ #include #include -#include +#include #include #include @@ -18,6 +18,45 @@ namespace xrpl::telemetry { namespace { +/** Config key names for the [telemetry] section. + + Each must match the corresponding option documented in + cfg/xrpld-example.cfg verbatim. Defined as `char const*` so they + pass to Section::valueOr() (which takes `std::string const&`) + without an explicit conversion, exactly as a literal would. 
+*/ +namespace key { +constexpr char const* enabled = "enabled"; +constexpr char const* serviceName = "service_name"; +constexpr char const* serviceInstanceId = "service_instance_id"; +constexpr char const* endpoint = "endpoint"; +constexpr char const* useTls = "use_tls"; +constexpr char const* tlsCaCert = "tls_ca_cert"; +constexpr char const* batchSize = "batch_size"; +constexpr char const* batchDelayMs = "batch_delay_ms"; +constexpr char const* maxQueueSize = "max_queue_size"; +constexpr char const* traceTransactions = "trace_transactions"; +constexpr char const* traceConsensus = "trace_consensus"; +constexpr char const* traceRpc = "trace_rpc"; +constexpr char const* tracePeer = "trace_peer"; +constexpr char const* traceLedger = "trace_ledger"; +} // namespace key + +/** Default values applied when a key is absent from the config. + + @note serviceName mirrors SystemParameters' systemName() ("xrpld") but + is duplicated here as a literal: the telemetry module deliberately does + not link xrpl.libxrpl.protocol, so including SystemParameters.h would + introduce an undeclared cross-module dependency. +*/ +namespace dflt { +constexpr char const* serviceName = "xrpld"; +constexpr char const* endpoint = "http://localhost:4318/v1/traces"; +constexpr std::uint32_t batchSize = 512u; +constexpr std::uint32_t batchDelayMs = 5000u; +constexpr std::uint32_t maxQueueSize = 2048u; +} // namespace dflt + /** Derive a human-readable network type label from the numeric network ID. @param networkId The network identifier from [network_id] config. @return "mainnet", "testnet", "devnet", or "unknown" for other values. 
@@ -49,33 +88,35 @@ setupTelemetry( { Telemetry::Setup setup; - setup.enabled = section.valueOr("enabled", 0) != 0; - setup.serviceName = section.valueOr("service_name", "xrpld"); + setup.enabled = section.valueOr(key::enabled, 0) != 0; + setup.serviceName = section.valueOr(key::serviceName, dflt::serviceName); setup.serviceVersion = version; - setup.serviceInstanceId = section.valueOr("service_instance_id", nodePublicKey); + setup.serviceInstanceId = section.valueOr(key::serviceInstanceId, nodePublicKey); - setup.exporterEndpoint = - section.valueOr("endpoint", "http://localhost:4318/v1/traces"); + setup.exporterEndpoint = section.valueOr(key::endpoint, dflt::endpoint); - setup.useTls = section.valueOr("use_tls", 0) != 0; - setup.tlsCertPath = section.valueOr("tls_ca_cert", ""); + setup.useTls = section.valueOr(key::useTls, 0) != 0; + setup.tlsCertPath = section.valueOr(key::tlsCaCert, ""); - setup.samplingRatio = section.valueOr("sampling_ratio", 1.0); - setup.samplingRatio = std::clamp(setup.samplingRatio, 0.0, 1.0); + // Head sampling is intentionally fixed at 1.0 (sample everything) and is + // not read from config. A per-node ratio would let nodes make divergent + // keep/drop decisions for the same distributed trace, producing broken + // traces; volume reduction is delegated to the collector's tail sampling. + // setup.samplingRatio is a const member fixed at 1.0; nothing to parse. 
- setup.batchSize = section.valueOr("batch_size", 512u); - setup.batchDelay = - std::chrono::milliseconds{section.valueOr("batch_delay_ms", 5000u)}; - setup.maxQueueSize = section.valueOr("max_queue_size", 2048u); + setup.batchSize = section.valueOr(key::batchSize, dflt::batchSize); + setup.batchDelay = std::chrono::milliseconds{ + section.valueOr(key::batchDelayMs, dflt::batchDelayMs)}; + setup.maxQueueSize = section.valueOr(key::maxQueueSize, dflt::maxQueueSize); setup.networkId = networkId; setup.networkType = networkTypeFromId(networkId); - setup.traceTransactions = section.valueOr("trace_transactions", 1) != 0; - setup.traceConsensus = section.valueOr("trace_consensus", 1) != 0; - setup.traceRpc = section.valueOr("trace_rpc", 1) != 0; - setup.tracePeer = section.valueOr("trace_peer", 1) != 0; - setup.traceLedger = section.valueOr("trace_ledger", 1) != 0; + setup.traceTransactions = section.valueOr(key::traceTransactions, 1) != 0; + setup.traceConsensus = section.valueOr(key::traceConsensus, 1) != 0; + setup.traceRpc = section.valueOr(key::traceRpc, 1) != 0; + setup.tracePeer = section.valueOr(key::tracePeer, 1) != 0; + setup.traceLedger = section.valueOr(key::traceLedger, 1) != 0; return setup; }