Merge branch 'pratik/otel-phase2-rpc-tracing' into pratik/otel-phase3-tx-tracing

Signed-off-by: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com>
2026-07-26 00:20:41 +00:00 · 2026-06-09 19:05:40 +01:00
parent be67ad25e7 d3c09fd3f4
commit 8fe3f06999
12 changed files with 127 additions and 235 deletions
--- a/.github/scripts/levelization/results/ordering.txt
+++ b/.github/scripts/levelization/results/ordering.txt
@@ -236,6 +236,8 @@ xrpl.shamap > xrpl.basics
 xrpl.shamap > xrpl.nodestore
 xrpl.shamap > xrpl.protocol
 xrpl.telemetry > xrpl.basics
+xrpl.telemetry > xrpld.consensus
+xrpl.telemetry > xrpld.rpc
 xrpl.tx > xrpl.basics
 xrpl.tx > xrpl.core
 xrpl.tx > xrpl.ledger
--- a/OpenTelemetryPlan/00-tracing-fundamentals.md
+++ b/OpenTelemetryPlan/00-tracing-fundamentals.md
@@ -514,12 +514,19 @@ Not every trace needs to be recorded. **Sampling** reduces overhead:
 ### Head Sampling (at trace start)

 ```
-Request arrives → Random 10% chance → Record or skip entire trace
+Request arrives → Random N% chance → Record or skip entire trace
 ```

 - ✅ Low overhead
 - ❌ May miss interesting traces

+> **xrpld note**: xrpld intentionally fixes head sampling at 100% (sample
+> everything) and does not expose a configurable ratio. A per-node ratio
+> would let different nodes make divergent keep/drop decisions for the same
+> distributed trace, producing broken/partial traces. xrpld uses a
+> `ParentBased` sampler so spans with a remote parent honor the upstream
+> decision. Volume reduction is delegated to collector-side tail sampling.
+
 ### Tail Sampling (after trace completes)

 ```
--- a/OpenTelemetryPlan/04-code-samples.md
+++ b/OpenTelemetryPlan/04-code-samples.md
@@ -53,8 +53,10 @@ public:
        bool useTls = false;
        std::string tlsCertPath;

-        // Sampling configuration
-        double samplingRatio = 1.0;  // 1.0 = 100% sampling
+        // Head sampling: fixed at 1.0 (sample everything), not config-driven.
+        // Keeps trace keep/drop decisions coherent across nodes; volume
+        // reduction is delegated to the collector's tail sampling.
+        double const samplingRatio = 1.0;

        // Batch processor settings
        std::uint32_t batchSize = 512;
--- a/OpenTelemetryPlan/05-configuration-reference.md
+++ b/OpenTelemetryPlan/05-configuration-reference.md
@@ -37,12 +37,11 @@ Add to `cfg/xrpld-example.cfg`:
 # # Path to CA certificate for TLS (optional)
 # # tls_ca_cert=/path/to/ca.crt
 #
-# # Sampling ratio: 0.0-1.0 (default: 1.0 = 100% sampling)
-# # Use lower values in production to reduce overhead
-# # Default: 1.0 (all traces). For production deployments with high
-# # throughput, 0.1 (10%) is recommended to reduce overhead.
-# # See Section 7.4.2 for sampling strategy details.
-# sampling_ratio=0.1
+# # Head sampling is intentionally fixed at 1.0 (sample everything) and is
+# # NOT configurable. A per-node head-sampling ratio would let nodes make
+# # divergent keep/drop decisions for the same distributed trace, producing
+# # broken/partial traces across the network. Volume reduction is delegated
+# # to the collector's tail sampling instead. See Section 7.4.2.
 #
 # # Batch processor settings
 # batch_size=512           # Spans per batch (default: 512)
@@ -86,7 +85,6 @@ enabled=0
 | `endpoint`                 | string | `http://localhost:4318/v1/traces` | OTLP/HTTP collector endpoint                                                                               |
 | `use_tls`                  | bool   | `false`                           | Enable TLS for exporter connection                                                                         |
 | `tls_ca_cert`              | string | `""`                              | Path to CA certificate file                                                                                |
-| `sampling_ratio`           | float  | `1.0`                             | Sampling ratio (0.0-1.0)                                                                                   |
 | `batch_size`               | uint   | `512`                             | Spans per export batch                                                                                     |
 | `batch_delay_ms`           | uint   | `5000`                            | Max delay before sending batch (ms)                                                                        |
 | `max_queue_size`           | uint   | `2048`                            | Maximum queued spans                                                                                       |
@@ -153,13 +151,8 @@ setupTelemetry(
    setup.useTls = section.value_or("use_tls", false);
    setup.tlsCertPath = section.value_or("tls_ca_cert", "");

-    // Sampling
-    setup.samplingRatio = section.value_or("sampling_ratio", 1.0);
-    if (setup.samplingRatio < 0.0 || setup.samplingRatio > 1.0)
-    {
-        Throw<std::runtime_error>(
-            "telemetry.sampling_ratio must be between 0.0 and 1.0");
-    }
+    // Head sampling is fixed at 1.0 (sample everything) and is not read from
+    // config — see Section 7.4.2. setup.samplingRatio stays at its 1.0 default.

    // Batch processor
    setup.batchSize = section.value_or("batch_size", 512u);
--- a/OpenTelemetryPlan/07-observability-backends.md
+++ b/OpenTelemetryPlan/07-observability-backends.md
@@ -171,7 +171,7 @@ flowchart TB
 ```mermaid
 flowchart LR
    subgraph head["Head Sampling (Node)"]
-        hs[Node-level head sampling<br/>configurable, default: 100%<br/>recommended production: 10%]
+        hs[Node-level head sampling<br/>fixed at 100%<br/>not configurable]
    end

    subgraph tail["Tail Sampling (Collector)"]
@@ -197,7 +197,7 @@ flowchart LR

 **Reading the diagram:**

-- **Head Sampling (Node)**: The first filter -- each xrpld node decides whether to sample a trace at creation time (default 100%, recommended 10% in production). This controls the volume leaving the node.
+- **Head Sampling (Node)**: xrpld pins head sampling at 100% (sample everything) and does not expose a configurable ratio. This is intentional: a per-node ratio would let different nodes make divergent keep/drop decisions for the same distributed trace, producing broken/partial traces. xrpld uses a `ParentBased` sampler so spans inheriting a remote parent honor the upstream decision. Volume reduction is delegated to the collector's tail sampling.
 - **Tail Sampling (Collector)**: The second filter -- the collector inspects completed traces and applies rules: keep all errors, keep anything slower than 5 seconds, and keep 10% of the remainder.
 - **Arrow head → tail**: All head-sampled traces flow to the collector, where tail sampling further reduces volume while preserving the most valuable data.
 - **Final Traces**: The output after both sampling stages; this is what gets stored and queried. The two-stage approach balances cost with debuggability.
--- a/OpenTelemetryPlan/OpenTelemetryPlan.md
+++ b/OpenTelemetryPlan/OpenTelemetryPlan.md
@@ -152,7 +152,7 @@ Span naming follows a hierarchical `<component>.<operation>` convention (e.g., `

 The telemetry code is organized under `include/xrpl/telemetry/` for headers and `src/libxrpl/telemetry/` for implementation. Key principles include RAII-based span management via `SpanGuard` (with `discard()` for dropping unwanted spans), a `FilteringSpanProcessor` that intercepts `OnEnd()` to prevent discarded spans from entering the export pipeline, conditional compilation with `XRPL_ENABLE_TELEMETRY`, and minimal runtime overhead through batch processing and efficient sampling.

-Performance optimization strategies include probabilistic head sampling (10% default), tail-based sampling at the collector for errors and slow traces, batch export to reduce network overhead, and conditional instrumentation that compiles to no-ops when disabled.
+Performance optimization strategies include tail-based sampling at the collector (keeping errors and slow traces) to reduce volume, batch export to reduce network overhead, and conditional instrumentation that compiles to no-ops when disabled. Head sampling is deliberately not used as a volume control: it is fixed at 100% and intentionally not configurable, so trace keep/drop decisions stay coherent across nodes.

 ➡️ **[Read full Implementation Strategy](./03-implementation-strategy.md)**

--- a/cfg/xrpld-example.cfg
+++ b/cfg/xrpld-example.cfg
@@ -1664,14 +1664,13 @@ validators.txt
 #   Path to a PEM-encoded CA certificate bundle for TLS verification.
 #   Only used when use_tls=1. Default: empty (system CA store).
 #
-# sampling_ratio=1.0
-#
-#   Head-based sampling ratio using TraceIdRatioBasedSampler. The decision
-#   to record or drop a trace is made at span creation time, before the
-#   span starts, based on the trace ID. Values in [0.0, 1.0].
-#   1.0 = trace everything, 0.1 = sample ~10% of traces. Default: 1.0.
-#   For tail-based (post-hoc) filtering — where you decide to drop a span
-#   after inspecting its content — use SpanGuard::discard() in code.
+#   Note on sampling: head sampling is intentionally fixed at 1.0 (sample
+#   everything) and is not configurable. A per-node sampling ratio would let
+#   nodes make divergent keep/drop decisions for the same distributed trace,
+#   producing broken/partial traces. A ParentBasedSampler ensures spans
+#   inheriting a remote parent honor the upstream decision. Reduce volume at
+#   the collector via tail sampling instead; for node-local post-hoc dropping
+#   use SpanGuard::discard() in code.
 #
 # trace_rpc=1
 #
--- a/docs/build/telemetry.md
+++ b/docs/build/telemetry.md
@@ -10,22 +10,11 @@ This document explains how to build xrpld with OpenTelemetry distributed tracing
      - [Install dependencies](#install-dependencies)
      - [Call CMake](#call-cmake)
      - [Build](#build)
-    - [Building without telemetry](#building-without-telemetry)
-  - [Runtime Configuration](#runtime-configuration)
-    - [Configuration options](#configuration-options)
-  - [Observability Stack](#observability-stack)
-    - [Start the stack](#start-the-stack)
-    - [Verify the stack](#verify-the-stack)
-    - [View traces in Grafana Explore](#view-traces-in-grafana-explore)
-  - [Running Tests](#running-tests)
+  - [Building without telemetry](#building-without-telemetry)
  - [Troubleshooting](#troubleshooting)
-    - [No traces appear in Grafana](#no-traces-appear-in-grafana)
    - [Conan lockfile error](#conan-lockfile-error)
    - [CMake target not found](#cmake-target-not-found)
-  - [Architecture](#architecture)
-    - [Key files](#key-files)
-    - [Span discard mechanism](#span-discard-mechanism)
-    - [Conditional compilation](#conditional-compilation)
+  - [Conditional compilation](#conditional-compilation)

 ## Overview

@@ -106,7 +95,7 @@ You should see in the CMake output:
 cmake --build . --parallel $(nproc)
 ```

-### Building without telemetry
+## Building without telemetry

 Omit the `-o telemetry=True` option (or pass `-o telemetry=False`).
 The `opentelemetry-cpp` dependency will not be downloaded,
@@ -114,127 +103,8 @@ the `XRPL_ENABLE_TELEMETRY` preprocessor define will not be set,
 and all tracing macros will compile to no-ops.
 The resulting binary is identical to one built before telemetry support was added.

-## Runtime Configuration
-
-Add a `[telemetry]` section to your `xrpld.cfg` file:
-
-```ini
-[telemetry]
-enabled=1
-endpoint=http://localhost:4318/v1/traces
-sampling_ratio=1.0
-trace_rpc=1
-trace_transactions=1
-trace_consensus=1
-trace_peer=1
-trace_ledger=1
-```
-
-### Configuration options
-
-| Option                | Type   | Default                           | Description                                        |
-| --------------------- | ------ | --------------------------------- | -------------------------------------------------- |
-| `enabled`             | int    | `0`                               | Enable (`1`) or disable (`0`) telemetry at runtime |
-| `service_name`        | string | `xrpld`                           | Service name reported in traces                    |
-| `service_instance_id` | string | node public key                   | Unique instance identifier                         |
-| `endpoint`            | string | `http://localhost:4318/v1/traces` | OTLP/HTTP collector endpoint                       |
-| `use_tls`             | int    | `0`                               | Enable TLS for the exporter connection             |
-| `tls_ca_cert`         | string | (empty)                           | Path to CA certificate for TLS                     |
-| `sampling_ratio`      | double | `1.0`                             | Head-based sampling ratio (`0.0` to `1.0`)         |
-| `batch_size`          | uint32 | `512`                             | Maximum spans per export batch                     |
-| `batch_delay_ms`      | uint32 | `5000`                            | Maximum delay (ms) before flushing a batch         |
-| `max_queue_size`      | uint32 | `2048`                            | Maximum spans queued in memory                     |
-| `trace_rpc`           | int    | `1`                               | Enable RPC request tracing                         |
-| `trace_transactions`  | int    | `1`                               | Enable transaction lifecycle tracing               |
-| `trace_consensus`     | int    | `1`                               | Enable consensus round tracing                     |
-| `trace_peer`          | int    | `1`                               | Enable peer message tracing (high volume)          |
-| `trace_ledger`        | int    | `1`                               | Enable ledger close tracing                        |
-
-## Observability Stack
-
-A Docker Compose stack is provided in `docker/telemetry/` with three services:
-
-| Service            | Port                                           | Purpose                                             |
-| ------------------ | ---------------------------------------------- | --------------------------------------------------- |
-| **OTel Collector** | `4317` (gRPC), `4318` (HTTP), `13133` (health) | Receives OTLP spans, batches, and forwards to Tempo |
-| **Tempo**          | `3200` (HTTP API)                              | Trace storage backend                               |
-| **Grafana**        | `3000`                                         | Dashboards (Tempo pre-configured as datasource)     |
-
-### Start the stack
-
-```bash
-docker compose -f docker/telemetry/docker-compose.yml up -d
-```
-
-### Verify the stack
-
-```bash
-# Collector health
-curl http://localhost:13133
-
-# Grafana (Explore -> Tempo for traces)
-open http://localhost:3000
-```
-
-### View traces in Grafana Explore
-
-1. Open `http://localhost:3000` in a browser.
-2. Navigate to **Explore** and select the **Tempo** datasource.
-3. Use **Search** or **TraceQL** to find traces by service name (e.g. `xrpld`).
-4. Click into any trace to see the span tree and attributes.
-
-Traced RPC operations produce a span hierarchy like:
-
-```
-rpc.request
-  └── rpc.command.server_info  (command=server_info, rpc_status=success)
-```
-
-Each span includes attributes:
-
-- `command` — the RPC method name
-- `version` — API version
-- `rpc_role` — `admin` or `user`
-- `rpc_status` — `success` or `error`
-
-## Running Tests
-
-Unit tests run with the telemetry-enabled build regardless of whether the
-observability stack is running. When no collector is available, the exporter
-silently drops spans with no impact on test results.
-
-```bash
-# Run all RPC tests
-./xrpld --unittest=RPCCall,ServerInfo,AccountTx,LedgerRPC,Transaction --unittest-jobs $(nproc)
-
-# Run the full test suite
-./xrpld --unittest --unittest-jobs $(nproc)
-```
-
-To generate traces during manual testing, start xrpld in standalone mode:
-
-```bash
-./xrpld --conf /path/to/xrpld.cfg --standalone --start
-```
-
-Then send RPC requests:
-
-```bash
-curl -s -X POST http://127.0.0.1:5005/ \
-    -H "Content-Type: application/json" \
-    -d '{"method":"server_info","params":[{}]}'
-```
-
 ## Troubleshooting

-### No traces appear in Grafana
-
-1. Confirm the OTel Collector is running: `docker compose -f docker/telemetry/docker-compose.yml ps`
-2. Check collector logs for errors: `docker compose -f docker/telemetry/docker-compose.yml logs otel-collector`
-3. Confirm `[telemetry] enabled=1` is set in the xrpld config.
-4. Confirm `endpoint` points to the correct collector address (`http://localhost:4318/v1/traces`).
-5. Wait for the batch delay to elapse (default `5000` ms) before checking Grafana Explore.
-
 ### Conan lockfile error

 If you see `ERROR: Requirement 'opentelemetry-cpp/1.26.0' not in lockfile 'requires'`,
@@ -249,48 +119,7 @@ Conan-generated toolchain file is being used.
 The Conan package provides a single umbrella target
 `opentelemetry-cpp::opentelemetry-cpp` (not individual component targets).

-## Architecture
-
-### Key files
-
-| File                                          | Purpose                                                      |
-| --------------------------------------------- | ------------------------------------------------------------ |
-| `include/xrpl/telemetry/Telemetry.h`          | Abstract telemetry interface and `Setup` struct              |
-| `include/xrpl/telemetry/SpanGuard.h`          | RAII span guard with `discard()` for dropping unwanted spans |
-| `include/xrpl/telemetry/DiscardFlag.h`        | Thread-local discard flag (zero-dependency header)           |
-| `src/libxrpl/telemetry/Telemetry.cpp`         | OTel SDK setup, `FilteringSpanProcessor`, provider lifecycle |
-| `src/libxrpl/telemetry/TelemetryConfig.cpp`   | Config parser (`setupTelemetry()`)                           |
-| `src/libxrpl/telemetry/NullTelemetry.cpp`     | No-op implementation (used when disabled)                    |
-| `src/libxrpl/telemetry/SpanGuard.cpp`         | Pimpl implementation for SpanGuard (all OTel types confined) |
-| `src/xrpld/rpc/detail/ServerHandler.cpp`      | RPC entry point instrumentation                              |
-| `src/xrpld/rpc/detail/RPCHandler.cpp`         | Per-command instrumentation                                  |
-| `docker/telemetry/docker-compose.yml`         | Observability stack (Collector + Tempo + Grafana)            |
-| `docker/telemetry/otel-collector-config.yaml` | OTel Collector pipeline configuration                        |
-
-### Span discard mechanism
-
-`SpanGuard::discard()` allows callers to silently drop spans that turn out to be
-uninteresting (e.g., failed preflight transactions). This saves both network bandwidth
-and storage by preventing the span from being exported.
-
-The mechanism uses a thread-local flag (`tl_discardCurrentSpan` in `DiscardFlag.h`) as a
-side-channel to the `FilteringSpanProcessor` (in `Telemetry.cpp`):
-
-1. `SpanGuard::discard()` sets the thread-local flag and calls `Span::End()`
-2. The OTel SDK calls `FilteringSpanProcessor::OnEnd()` synchronously on the same thread
-3. The processor checks the flag, clears it, and drops the span before it enters the batch queue
-
-```cpp
-SpanGuard guard(telemetry.startSpan("tx.process"));
-auto result = preflight(tx);
-if (result != tesSUCCESS)
-{
-    guard.discard();  // span is dropped, never exported
-    return result;
-}
-```
-
-### Conditional compilation
+## Conditional compilation

 All OpenTelemetry SDK types are hidden behind the pimpl idiom in `SpanGuard.cpp`.
 When `XRPL_ENABLE_TELEMETRY` is not defined, `SpanGuard.h` provides an all-inline
--- a/include/xrpl/telemetry/SpanGuard.h
+++ b/include/xrpl/telemetry/SpanGuard.h
@@ -45,11 +45,16 @@

    Usage examples:

+    Span names and attribute keys come from per-module `*SpanNames.h`
+    headers (e.g. RpcSpanNames.h, TxSpanNames.h) as typed compile-time
+    constants — never raw string literals — so the naming spec is
+    enforced at the call site and dashboards stay in sync.
+
    1. Basic RPC tracing (factory method with category):
    @code
        #include <xrpld/rpc/detail/RpcSpanNames.h>
+        using namespace xrpl::telemetry;

-        // At the call site (constants from RpcSpanNames.h):
        auto span = SpanGuard::span(
            TraceCategory::Rpc, rpc_span::prefix::command, "submit");
        span.setAttribute(rpc_span::attr::command, "submit");
@@ -71,19 +76,22 @@

    3. Cross-thread context propagation:
    @code
+        #include <xrpld/consensus/ConsensusSpanNames.h>
+        using namespace xrpl::telemetry;
+
        // Thread A: create span and capture context
        auto span = SpanGuard::span(
-            TraceCategory::Consensus, seg::consensus, "round");
+            TraceCategory::Consensus, seg::consensus, consensus::span::op::round);
        auto ctx = span.captureContext();

        // Thread B: create child with captured context
-        auto child = SpanGuard::childSpan("consensus.accept", ctx);
+        auto child = SpanGuard::childSpan(consensus::span::accept, ctx);
    @endcode

    4. Conditional check (rarely needed — methods are no-ops on null):
    @code
        auto span = SpanGuard::span(
-            TraceCategory::Rpc, rpc_span::prefix::rpc, "request");
+            TraceCategory::Rpc, rpc_span::prefix::rpc, rpc_span::op::httpRequest);
        if (span) {
            // expensive attribute computation only when active
            span.setAttribute(rpc_span::attr::requestPayloadSize, computeSize());
@@ -93,7 +101,7 @@
    5. Tail-based filtering via discard():
    @code
        auto span = SpanGuard::span(
-            TraceCategory::Transactions, seg::tx, "process");
+            TraceCategory::Transactions, tx_span::prefix::tx, tx_span::op::process);
        auto result = preflight(tx);
        if (result != tesSUCCESS) {
            span.discard();  // drop span, never exported
--- a/include/xrpl/telemetry/Telemetry.h
+++ b/include/xrpl/telemetry/Telemetry.h
@@ -163,13 +163,16 @@ public:
        /** Path to a CA certificate bundle for TLS verification. */
        std::string tlsCertPath;

-        /** Head-based sampling ratio in [0.0, 1.0]. 1.0 = trace everything.
-            This is a head-based (pre-decision) sampler using
-            TraceIdRatioBasedSampler — the decision to record or drop a
-            trace is made before the root span starts. For post-hoc
-            (tail-based) filtering, see SpanGuard::discard().
+        /** Head-based sampling ratio. Intentionally fixed at 1.0 (sample everything) and
+            NOT read from config. A per-node ratio would let nodes make divergent keep/drop
+            decisions for the same distributed trace, producing broken/partial traces. The
+            ratio sampler is wrapped in a ParentBasedSampler (see Telemetry.cpp) so spans
+            inheriting a remote parent honor the upstream sampled flag. Volume reduction is
+            delegated to the collector's tail sampling; for node-local post-hoc dropping
+            see SpanGuard::discard().
+            @note As a const member it deletes Setup's implicit copy/move assignment.
        */
-        double samplingRatio = 1.0;
+        double const samplingRatio = 1.0;

        /** Maximum number of spans per batch export. */
        std::uint32_t batchSize = 512;
--- a/src/libxrpl/telemetry/Telemetry.cpp
+++ b/src/libxrpl/telemetry/Telemetry.cpp
@@ -31,6 +31,7 @@
 #include <opentelemetry/sdk/trace/batch_span_processor_options.h>
 #include <opentelemetry/sdk/trace/processor.h>
 #include <opentelemetry/sdk/trace/sampler.h>
+#include <opentelemetry/sdk/trace/samplers/parent_factory.h>
 #include <opentelemetry/sdk/trace/samplers/trace_id_ratio.h>
 #include <opentelemetry/sdk/trace/tracer_provider.h>
 #include <opentelemetry/sdk/trace/tracer_provider_factory.h>
@@ -301,8 +302,15 @@ public:
            {std::string(attr::networkType), setup_.networkType},  // LCOV_EXCL_LINE
        });

-        // Configure sampler
-        auto sampler = std::make_unique<trace_sdk::TraceIdRatioBasedSampler>(setup_.samplingRatio);
+        // Configure sampler. Head sampling is fixed at 1.0 (sample everything);
+        // setup_.samplingRatio is not config-driven. Wrap the ratio sampler in a
+        // ParentBasedSampler so spans with a remote parent honor the upstream
+        // sampled flag — this keeps keep/drop decisions coherent for a single
+        // distributed trace spanning multiple nodes. Volume reduction is left to
+        // the collector's tail sampling.
+        auto rootSampler =
+            std::make_shared<trace_sdk::TraceIdRatioBasedSampler>(setup_.samplingRatio);
+        auto sampler = trace_sdk::ParentBasedSamplerFactory::Create(std::move(rootSampler));

        // Create TracerProvider
        sdkProvider_ = trace_sdk::TracerProviderFactory::Create(
--- a/src/libxrpl/telemetry/TelemetryConfig.cpp
+++ b/src/libxrpl/telemetry/TelemetryConfig.cpp
@@ -10,7 +10,7 @@
 #include <xrpl/basics/BasicConfig.h>
 #include <xrpl/telemetry/Telemetry.h>

-#include <algorithm>
+#include <chrono>
 #include <cstdint>
 #include <string>

@@ -18,6 +18,45 @@ namespace xrpl::telemetry {

 namespace {

+/** Config key names for the [telemetry] section.
+
+    Each must match the corresponding option documented in
+    cfg/xrpld-example.cfg verbatim. Defined as `char const*` so they
+    pass to Section::valueOr() (which takes `std::string const&`)
+    without an explicit conversion, exactly as a literal would.
+*/
+namespace key {
+constexpr char const* enabled = "enabled";
+constexpr char const* serviceName = "service_name";
+constexpr char const* serviceInstanceId = "service_instance_id";
+constexpr char const* endpoint = "endpoint";
+constexpr char const* useTls = "use_tls";
+constexpr char const* tlsCaCert = "tls_ca_cert";
+constexpr char const* batchSize = "batch_size";
+constexpr char const* batchDelayMs = "batch_delay_ms";
+constexpr char const* maxQueueSize = "max_queue_size";
+constexpr char const* traceTransactions = "trace_transactions";
+constexpr char const* traceConsensus = "trace_consensus";
+constexpr char const* traceRpc = "trace_rpc";
+constexpr char const* tracePeer = "trace_peer";
+constexpr char const* traceLedger = "trace_ledger";
+}  // namespace key
+
+/** Default values applied when a key is absent from the config.
+
+    @note serviceName mirrors SystemParameters' systemName() ("xrpld") but
+    is duplicated here as a literal: the telemetry module deliberately does
+    not link xrpl.libxrpl.protocol, so including SystemParameters.h would
+    introduce an undeclared cross-module dependency.
+*/
+namespace dflt {
+constexpr char const* serviceName = "xrpld";
+constexpr char const* endpoint = "http://localhost:4318/v1/traces";
+constexpr std::uint32_t batchSize = 512u;
+constexpr std::uint32_t batchDelayMs = 5000u;
+constexpr std::uint32_t maxQueueSize = 2048u;
+}  // namespace dflt
+
 /** Derive a human-readable network type label from the numeric network ID.
    @param networkId  The network identifier from [network_id] config.
    @return "mainnet", "testnet", "devnet", or "unknown" for other values.
@@ -49,33 +88,35 @@ setupTelemetry(
 {
    Telemetry::Setup setup;

-    setup.enabled = section.valueOr<int>("enabled", 0) != 0;
-    setup.serviceName = section.valueOr<std::string>("service_name", "xrpld");
+    setup.enabled = section.valueOr<int>(key::enabled, 0) != 0;
+    setup.serviceName = section.valueOr<std::string>(key::serviceName, dflt::serviceName);
    setup.serviceVersion = version;
-    setup.serviceInstanceId = section.valueOr<std::string>("service_instance_id", nodePublicKey);
+    setup.serviceInstanceId = section.valueOr<std::string>(key::serviceInstanceId, nodePublicKey);

-    setup.exporterEndpoint =
-        section.valueOr<std::string>("endpoint", "http://localhost:4318/v1/traces");
+    setup.exporterEndpoint = section.valueOr<std::string>(key::endpoint, dflt::endpoint);

-    setup.useTls = section.valueOr<int>("use_tls", 0) != 0;
-    setup.tlsCertPath = section.valueOr<std::string>("tls_ca_cert", "");
+    setup.useTls = section.valueOr<int>(key::useTls, 0) != 0;
+    setup.tlsCertPath = section.valueOr<std::string>(key::tlsCaCert, "");

-    setup.samplingRatio = section.valueOr<double>("sampling_ratio", 1.0);
-    setup.samplingRatio = std::clamp(setup.samplingRatio, 0.0, 1.0);
+    // Head sampling is intentionally fixed at 1.0 (sample everything) and is
+    // not read from config. A per-node ratio would let nodes make divergent
+    // keep/drop decisions for the same distributed trace, producing broken
+    // traces; volume reduction is delegated to the collector's tail sampling.
+    // setup.samplingRatio is a const member fixed at 1.0; nothing to parse.

-    setup.batchSize = section.valueOr<std::uint32_t>("batch_size", 512u);
-    setup.batchDelay =
-        std::chrono::milliseconds{section.valueOr<std::uint32_t>("batch_delay_ms", 5000u)};
-    setup.maxQueueSize = section.valueOr<std::uint32_t>("max_queue_size", 2048u);
+    setup.batchSize = section.valueOr<std::uint32_t>(key::batchSize, dflt::batchSize);
+    setup.batchDelay = std::chrono::milliseconds{
+        section.valueOr<std::uint32_t>(key::batchDelayMs, dflt::batchDelayMs)};
+    setup.maxQueueSize = section.valueOr<std::uint32_t>(key::maxQueueSize, dflt::maxQueueSize);

    setup.networkId = networkId;
    setup.networkType = networkTypeFromId(networkId);

-    setup.traceTransactions = section.valueOr<int>("trace_transactions", 1) != 0;
-    setup.traceConsensus = section.valueOr<int>("trace_consensus", 1) != 0;
-    setup.traceRpc = section.valueOr<int>("trace_rpc", 1) != 0;
-    setup.tracePeer = section.valueOr<int>("trace_peer", 1) != 0;
-    setup.traceLedger = section.valueOr<int>("trace_ledger", 1) != 0;
+    setup.traceTransactions = section.valueOr<int>(key::traceTransactions, 1) != 0;
+    setup.traceConsensus = section.valueOr<int>(key::traceConsensus, 1) != 0;
+    setup.traceRpc = section.valueOr<int>(key::traceRpc, 1) != 0;
+    setup.tracePeer = section.valueOr<int>(key::tracePeer, 1) != 0;
+    setup.traceLedger = section.valueOr<int>(key::traceLedger, 1) != 0;

    return setup;
 }