mirror of
https://github.com/XRPLF/rippled.git
synced 2026-06-10 12:16:49 +00:00
Merge branch 'pratik/otel-phase2-rpc-tracing' into pratik/otel-phase3-tx-tracing
Signed-off-by: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com>
This commit is contained in:
@@ -236,6 +236,8 @@ xrpl.shamap > xrpl.basics
|
||||
xrpl.shamap > xrpl.nodestore
|
||||
xrpl.shamap > xrpl.protocol
|
||||
xrpl.telemetry > xrpl.basics
|
||||
xrpl.telemetry > xrpld.consensus
|
||||
xrpl.telemetry > xrpld.rpc
|
||||
xrpl.tx > xrpl.basics
|
||||
xrpl.tx > xrpl.core
|
||||
xrpl.tx > xrpl.ledger
|
||||
|
||||
@@ -514,12 +514,19 @@ Not every trace needs to be recorded. **Sampling** reduces overhead:
|
||||
### Head Sampling (at trace start)
|
||||
|
||||
```
|
||||
Request arrives → Random 10% chance → Record or skip entire trace
|
||||
Request arrives → Random N% chance → Record or skip entire trace
|
||||
```
|
||||
|
||||
- ✅ Low overhead
|
||||
- ❌ May miss interesting traces
|
||||
|
||||
> **xrpld note**: xrpld intentionally pins head sampling at 100% (sample
|
||||
> everything) and does not expose a configurable ratio. A per-node ratio
|
||||
> would let different nodes make divergent keep/drop decisions for the same
|
||||
> distributed trace, producing broken/partial traces. xrpld uses a
|
||||
> `ParentBased` sampler so spans with a remote parent honor the upstream
|
||||
> decision. Volume reduction is delegated to collector-side tail sampling.
|
||||
|
||||
### Tail Sampling (after trace completes)
|
||||
|
||||
```
|
||||
|
||||
@@ -53,8 +53,10 @@ public:
|
||||
bool useTls = false;
|
||||
std::string tlsCertPath;
|
||||
|
||||
// Sampling configuration
|
||||
double samplingRatio = 1.0; // 1.0 = 100% sampling
|
||||
// Head sampling: fixed at 1.0 (sample everything), not config-driven.
|
||||
// Keeps trace keep/drop decisions coherent across nodes; volume
|
||||
// reduction is delegated to the collector's tail sampling.
|
||||
double samplingRatio = 1.0;
|
||||
|
||||
// Batch processor settings
|
||||
std::uint32_t batchSize = 512;
|
||||
|
||||
@@ -37,12 +37,11 @@ Add to `cfg/xrpld-example.cfg`:
|
||||
# # Path to CA certificate for TLS (optional)
|
||||
# # tls_ca_cert=/path/to/ca.crt
|
||||
#
|
||||
# # Sampling ratio: 0.0-1.0 (default: 1.0 = 100% sampling)
|
||||
# # Use lower values in production to reduce overhead
|
||||
# # Default: 1.0 (all traces). For production deployments with high
|
||||
# # throughput, 0.1 (10%) is recommended to reduce overhead.
|
||||
# # See Section 7.4.2 for sampling strategy details.
|
||||
# sampling_ratio=0.1
|
||||
# # Head sampling is intentionally fixed at 1.0 (sample everything) and is
|
||||
# # NOT configurable. A per-node head-sampling ratio would let nodes make
|
||||
# # divergent keep/drop decisions for the same distributed trace, producing
|
||||
# # broken/partial traces across the network. Volume reduction is delegated
|
||||
# # to the collector's tail sampling instead. See Section 7.4.2.
|
||||
#
|
||||
# # Batch processor settings
|
||||
# batch_size=512 # Spans per batch (default: 512)
|
||||
@@ -86,7 +85,6 @@ enabled=0
|
||||
| `endpoint` | string | `http://localhost:4318/v1/traces` | OTLP/HTTP collector endpoint |
|
||||
| `use_tls` | bool | `false` | Enable TLS for exporter connection |
|
||||
| `tls_ca_cert` | string | `""` | Path to CA certificate file |
|
||||
| `sampling_ratio` | float | `1.0` | Sampling ratio (0.0-1.0) |
|
||||
| `batch_size` | uint | `512` | Spans per export batch |
|
||||
| `batch_delay_ms` | uint | `5000` | Max delay before sending batch (ms) |
|
||||
| `max_queue_size` | uint | `2048` | Maximum queued spans |
|
||||
@@ -153,13 +151,8 @@ setupTelemetry(
|
||||
setup.useTls = section.value_or("use_tls", false);
|
||||
setup.tlsCertPath = section.value_or("tls_ca_cert", "");
|
||||
|
||||
// Sampling
|
||||
setup.samplingRatio = section.value_or("sampling_ratio", 1.0);
|
||||
if (setup.samplingRatio < 0.0 || setup.samplingRatio > 1.0)
|
||||
{
|
||||
Throw<std::runtime_error>(
|
||||
"telemetry.sampling_ratio must be between 0.0 and 1.0");
|
||||
}
|
||||
// Head sampling is fixed at 1.0 (sample everything) and is not read from
|
||||
// config — see Section 7.4.2. setup.samplingRatio stays at its 1.0 default.
|
||||
|
||||
// Batch processor
|
||||
setup.batchSize = section.value_or("batch_size", 512u);
|
||||
|
||||
@@ -171,7 +171,7 @@ flowchart TB
|
||||
```mermaid
|
||||
flowchart LR
|
||||
subgraph head["Head Sampling (Node)"]
|
||||
hs[Node-level head sampling<br/>configurable, default: 100%<br/>recommended production: 10%]
|
||||
hs[Node-level head sampling<br/>fixed at 100%<br/>not configurable]
|
||||
end
|
||||
|
||||
subgraph tail["Tail Sampling (Collector)"]
|
||||
@@ -197,7 +197,7 @@ flowchart LR
|
||||
|
||||
**Reading the diagram:**
|
||||
|
||||
- **Head Sampling (Node)**: The first filter -- each xrpld node decides whether to sample a trace at creation time (default 100%, recommended 10% in production). This controls the volume leaving the node.
|
||||
- **Head Sampling (Node)**: xrpld pins head sampling at 100% (sample everything) and does not expose a configurable ratio. This is intentional: a per-node ratio would let different nodes make divergent keep/drop decisions for the same distributed trace, producing broken/partial traces. xrpld uses a `ParentBased` sampler so spans inheriting a remote parent honor the upstream decision. Volume reduction is delegated to the collector's tail sampling.
|
||||
- **Tail Sampling (Collector)**: The second filter -- the collector inspects completed traces and applies rules: keep all errors, keep anything slower than 5 seconds, and keep 10% of the remainder.
|
||||
- **Arrow head → tail**: Because head sampling keeps 100% of traces, every trace produced by the node flows to the collector, where tail sampling is the stage that actually reduces volume while preserving the most valuable data.
|
||||
- **Final Traces**: The output after both sampling stages; this is what gets stored and queried. The two-stage approach balances cost with debuggability.
|
||||
|
||||
@@ -152,7 +152,7 @@ Span naming follows a hierarchical `<component>.<operation>` convention (e.g., `
|
||||
|
||||
The telemetry code is organized under `include/xrpl/telemetry/` for headers and `src/libxrpl/telemetry/` for implementation. Key principles include RAII-based span management via `SpanGuard` (with `discard()` for dropping unwanted spans), a `FilteringSpanProcessor` that intercepts `OnEnd()` to prevent discarded spans from entering the export pipeline, conditional compilation with `XRPL_ENABLE_TELEMETRY`, and minimal runtime overhead through batch processing and efficient sampling.
|
||||
|
||||
Performance optimization strategies include probabilistic head sampling (10% default), tail-based sampling at the collector for errors and slow traces, batch export to reduce network overhead, and conditional instrumentation that compiles to no-ops when disabled.
|
||||
Performance optimization strategies include head sampling fixed at 100% (intentionally not configurable, so trace keep/drop decisions stay coherent across nodes), tail-based sampling at the collector for errors and slow traces to reduce volume, batch export to reduce network overhead, and conditional instrumentation that compiles to no-ops when disabled.
|
||||
|
||||
➡️ **[Read full Implementation Strategy](./03-implementation-strategy.md)**
|
||||
|
||||
|
||||
@@ -1664,14 +1664,13 @@ validators.txt
|
||||
# Path to a PEM-encoded CA certificate bundle for TLS verification.
|
||||
# Only used when use_tls=1. Default: empty (system CA store).
|
||||
#
|
||||
# sampling_ratio=1.0
|
||||
#
|
||||
# Head-based sampling ratio using TraceIdRatioBasedSampler. The decision
|
||||
# to record or drop a trace is made at span creation time, before the
|
||||
# span starts, based on the trace ID. Values in [0.0, 1.0].
|
||||
# 1.0 = trace everything, 0.1 = sample ~10% of traces. Default: 1.0.
|
||||
# For tail-based (post-hoc) filtering — where you decide to drop a span
|
||||
# after inspecting its content — use SpanGuard::discard() in code.
|
||||
# Head sampling is intentionally fixed at 1.0 (sample everything) and is
|
||||
# not configurable. A per-node sampling ratio would let nodes make
|
||||
# divergent keep/drop decisions for the same distributed trace, producing
|
||||
# broken/partial traces. A ParentBasedSampler ensures spans inheriting a
|
||||
# remote parent honor the upstream decision. Reduce volume at the collector
|
||||
# via tail sampling instead; for node-local post-hoc dropping use
|
||||
# SpanGuard::discard() in code.
|
||||
#
|
||||
# trace_rpc=1
|
||||
#
|
||||
|
||||
179
docs/build/telemetry.md
vendored
179
docs/build/telemetry.md
vendored
@@ -10,22 +10,11 @@ This document explains how to build xrpld with OpenTelemetry distributed tracing
|
||||
- [Install dependencies](#install-dependencies)
|
||||
- [Call CMake](#call-cmake)
|
||||
- [Build](#build)
|
||||
- [Building without telemetry](#building-without-telemetry)
|
||||
- [Runtime Configuration](#runtime-configuration)
|
||||
- [Configuration options](#configuration-options)
|
||||
- [Observability Stack](#observability-stack)
|
||||
- [Start the stack](#start-the-stack)
|
||||
- [Verify the stack](#verify-the-stack)
|
||||
- [View traces in Grafana Explore](#view-traces-in-grafana-explore)
|
||||
- [Running Tests](#running-tests)
|
||||
- [Building without telemetry](#building-without-telemetry)
|
||||
- [Troubleshooting](#troubleshooting)
|
||||
- [No traces appear in Grafana](#no-traces-appear-in-grafana)
|
||||
- [Conan lockfile error](#conan-lockfile-error)
|
||||
- [CMake target not found](#cmake-target-not-found)
|
||||
- [Architecture](#architecture)
|
||||
- [Key files](#key-files)
|
||||
- [Span discard mechanism](#span-discard-mechanism)
|
||||
- [Conditional compilation](#conditional-compilation)
|
||||
- [Conditional compilation](#conditional-compilation)
|
||||
|
||||
## Overview
|
||||
|
||||
@@ -106,7 +95,7 @@ You should see in the CMake output:
|
||||
cmake --build . --parallel $(nproc)
|
||||
```
|
||||
|
||||
### Building without telemetry
|
||||
## Building without telemetry
|
||||
|
||||
Omit the `-o telemetry=True` option (or pass `-o telemetry=False`).
|
||||
The `opentelemetry-cpp` dependency will not be downloaded,
|
||||
@@ -114,127 +103,8 @@ the `XRPL_ENABLE_TELEMETRY` preprocessor define will not be set,
|
||||
and all tracing macros will compile to no-ops.
|
||||
The resulting binary is identical to one built before telemetry support was added.
|
||||
|
||||
## Runtime Configuration
|
||||
|
||||
Add a `[telemetry]` section to your `xrpld.cfg` file:
|
||||
|
||||
```ini
|
||||
[telemetry]
|
||||
enabled=1
|
||||
endpoint=http://localhost:4318/v1/traces
|
||||
sampling_ratio=1.0
|
||||
trace_rpc=1
|
||||
trace_transactions=1
|
||||
trace_consensus=1
|
||||
trace_peer=1
|
||||
trace_ledger=1
|
||||
```
|
||||
|
||||
### Configuration options
|
||||
|
||||
| Option | Type | Default | Description |
|
||||
| --------------------- | ------ | --------------------------------- | -------------------------------------------------- |
|
||||
| `enabled` | int | `0` | Enable (`1`) or disable (`0`) telemetry at runtime |
|
||||
| `service_name` | string | `xrpld` | Service name reported in traces |
|
||||
| `service_instance_id` | string | node public key | Unique instance identifier |
|
||||
| `endpoint` | string | `http://localhost:4318/v1/traces` | OTLP/HTTP collector endpoint |
|
||||
| `use_tls` | int | `0` | Enable TLS for the exporter connection |
|
||||
| `tls_ca_cert` | string | (empty) | Path to CA certificate for TLS |
|
||||
| `sampling_ratio` | double | `1.0` | Head-based sampling ratio (`0.0` to `1.0`) |
|
||||
| `batch_size` | uint32 | `512` | Maximum spans per export batch |
|
||||
| `batch_delay_ms` | uint32 | `5000` | Maximum delay (ms) before flushing a batch |
|
||||
| `max_queue_size` | uint32 | `2048` | Maximum spans queued in memory |
|
||||
| `trace_rpc` | int | `1` | Enable RPC request tracing |
|
||||
| `trace_transactions` | int | `1` | Enable transaction lifecycle tracing |
|
||||
| `trace_consensus` | int | `1` | Enable consensus round tracing |
|
||||
| `trace_peer` | int | `1` | Enable peer message tracing (high volume) |
|
||||
| `trace_ledger` | int | `1` | Enable ledger close tracing |
|
||||
|
||||
## Observability Stack
|
||||
|
||||
A Docker Compose stack is provided in `docker/telemetry/` with three services:
|
||||
|
||||
| Service | Port | Purpose |
|
||||
| ------------------ | ---------------------------------------------- | --------------------------------------------------- |
|
||||
| **OTel Collector** | `4317` (gRPC), `4318` (HTTP), `13133` (health) | Receives OTLP spans, batches, and forwards to Tempo |
|
||||
| **Tempo** | `3200` (HTTP API) | Trace storage backend |
|
||||
| **Grafana** | `3000` | Dashboards (Tempo pre-configured as datasource) |
|
||||
|
||||
### Start the stack
|
||||
|
||||
```bash
|
||||
docker compose -f docker/telemetry/docker-compose.yml up -d
|
||||
```
|
||||
|
||||
### Verify the stack
|
||||
|
||||
```bash
|
||||
# Collector health
|
||||
curl http://localhost:13133
|
||||
|
||||
# Grafana (Explore -> Tempo for traces)
|
||||
open http://localhost:3000
|
||||
```
|
||||
|
||||
### View traces in Grafana Explore
|
||||
|
||||
1. Open `http://localhost:3000` in a browser.
|
||||
2. Navigate to **Explore** and select the **Tempo** datasource.
|
||||
3. Use **Search** or **TraceQL** to find traces by service name (e.g. `xrpld`).
|
||||
4. Click into any trace to see the span tree and attributes.
|
||||
|
||||
Traced RPC operations produce a span hierarchy like:
|
||||
|
||||
```
|
||||
rpc.request
|
||||
└── rpc.command.server_info (command=server_info, rpc_status=success)
|
||||
```
|
||||
|
||||
Each span includes attributes:
|
||||
|
||||
- `command` — the RPC method name
|
||||
- `version` — API version
|
||||
- `rpc_role` — `admin` or `user`
|
||||
- `rpc_status` — `success` or `error`
|
||||
|
||||
## Running Tests
|
||||
|
||||
Unit tests run with the telemetry-enabled build regardless of whether the
|
||||
observability stack is running. When no collector is available, the exporter
|
||||
silently drops spans with no impact on test results.
|
||||
|
||||
```bash
|
||||
# Run all RPC tests
|
||||
./xrpld --unittest=RPCCall,ServerInfo,AccountTx,LedgerRPC,Transaction --unittest-jobs $(nproc)
|
||||
|
||||
# Run the full test suite
|
||||
./xrpld --unittest --unittest-jobs $(nproc)
|
||||
```
|
||||
|
||||
To generate traces during manual testing, start xrpld in standalone mode:
|
||||
|
||||
```bash
|
||||
./xrpld --conf /path/to/xrpld.cfg --standalone --start
|
||||
```
|
||||
|
||||
Then send RPC requests:
|
||||
|
||||
```bash
|
||||
curl -s -X POST http://127.0.0.1:5005/ \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"method":"server_info","params":[{}]}'
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### No traces appear in Grafana
|
||||
|
||||
1. Confirm the OTel Collector is running: `docker compose -f docker/telemetry/docker-compose.yml ps`
|
||||
2. Check collector logs for errors: `docker compose -f docker/telemetry/docker-compose.yml logs otel-collector`
|
||||
3. Confirm `[telemetry] enabled=1` is set in the xrpld config.
|
||||
4. Confirm `endpoint` points to the correct collector address (`http://localhost:4318/v1/traces`).
|
||||
5. Wait for the batch delay to elapse (default `5000` ms) before checking Grafana Explore.
|
||||
|
||||
### Conan lockfile error
|
||||
|
||||
If you see `ERROR: Requirement 'opentelemetry-cpp/1.26.0' not in lockfile 'requires'`,
|
||||
@@ -249,48 +119,7 @@ Conan-generated toolchain file is being used.
|
||||
The Conan package provides a single umbrella target
|
||||
`opentelemetry-cpp::opentelemetry-cpp` (not individual component targets).
|
||||
|
||||
## Architecture
|
||||
|
||||
### Key files
|
||||
|
||||
| File | Purpose |
|
||||
| --------------------------------------------- | ------------------------------------------------------------ |
|
||||
| `include/xrpl/telemetry/Telemetry.h` | Abstract telemetry interface and `Setup` struct |
|
||||
| `include/xrpl/telemetry/SpanGuard.h` | RAII span guard with `discard()` for dropping unwanted spans |
|
||||
| `include/xrpl/telemetry/DiscardFlag.h` | Thread-local discard flag (zero-dependency header) |
|
||||
| `src/libxrpl/telemetry/Telemetry.cpp` | OTel SDK setup, `FilteringSpanProcessor`, provider lifecycle |
|
||||
| `src/libxrpl/telemetry/TelemetryConfig.cpp` | Config parser (`setupTelemetry()`) |
|
||||
| `src/libxrpl/telemetry/NullTelemetry.cpp` | No-op implementation (used when disabled) |
|
||||
| `src/libxrpl/telemetry/SpanGuard.cpp` | Pimpl implementation for SpanGuard (all OTel types confined) |
|
||||
| `src/xrpld/rpc/detail/ServerHandler.cpp` | RPC entry point instrumentation |
|
||||
| `src/xrpld/rpc/detail/RPCHandler.cpp` | Per-command instrumentation |
|
||||
| `docker/telemetry/docker-compose.yml` | Observability stack (Collector + Tempo + Grafana) |
|
||||
| `docker/telemetry/otel-collector-config.yaml` | OTel Collector pipeline configuration |
|
||||
|
||||
### Span discard mechanism
|
||||
|
||||
`SpanGuard::discard()` allows callers to silently drop spans that turn out to be
|
||||
uninteresting (e.g., failed preflight transactions). This saves both network bandwidth
|
||||
and storage by preventing the span from being exported.
|
||||
|
||||
The mechanism uses a thread-local flag (`tl_discardCurrentSpan` in `DiscardFlag.h`) as a
|
||||
side-channel to the `FilteringSpanProcessor` (in `Telemetry.cpp`):
|
||||
|
||||
1. `SpanGuard::discard()` sets the thread-local flag and calls `Span::End()`
|
||||
2. The OTel SDK calls `FilteringSpanProcessor::OnEnd()` synchronously on the same thread
|
||||
3. The processor checks the flag, clears it, and drops the span before it enters the batch queue
|
||||
|
||||
```cpp
|
||||
SpanGuard guard(telemetry.startSpan("tx.process"));
|
||||
auto result = preflight(tx);
|
||||
if (result != tesSUCCESS)
|
||||
{
|
||||
guard.discard(); // span is dropped, never exported
|
||||
return result;
|
||||
}
|
||||
```
|
||||
|
||||
### Conditional compilation
|
||||
## Conditional compilation
|
||||
|
||||
All OpenTelemetry SDK types are hidden behind the pimpl idiom in `SpanGuard.cpp`.
|
||||
When `XRPL_ENABLE_TELEMETRY` is not defined, `SpanGuard.h` provides an all-inline
|
||||
|
||||
@@ -45,11 +45,16 @@
|
||||
|
||||
Usage examples:
|
||||
|
||||
Span names and attribute keys come from per-module `*SpanNames.h`
|
||||
headers (e.g. RpcSpanNames.h, TxSpanNames.h) as typed compile-time
|
||||
constants — never raw string literals — so the naming spec is
|
||||
enforced at the call site and dashboards stay in sync.
|
||||
|
||||
1. Basic RPC tracing (factory method with category):
|
||||
@code
|
||||
#include <xrpld/rpc/detail/RpcSpanNames.h>
|
||||
using namespace xrpl::telemetry;
|
||||
|
||||
// At the call site (constants from RpcSpanNames.h):
|
||||
auto span = SpanGuard::span(
|
||||
TraceCategory::Rpc, rpc_span::prefix::command, "submit");
|
||||
span.setAttribute(rpc_span::attr::command, "submit");
|
||||
@@ -71,19 +76,22 @@
|
||||
|
||||
3. Cross-thread context propagation:
|
||||
@code
|
||||
#include <xrpld/consensus/ConsensusSpanNames.h>
|
||||
using namespace xrpl::telemetry;
|
||||
|
||||
// Thread A: create span and capture context
|
||||
auto span = SpanGuard::span(
|
||||
TraceCategory::Consensus, seg::consensus, "round");
|
||||
TraceCategory::Consensus, seg::consensus, consensus::span::op::round);
|
||||
auto ctx = span.captureContext();
|
||||
|
||||
// Thread B: create child with captured context
|
||||
auto child = SpanGuard::childSpan("consensus.accept", ctx);
|
||||
auto child = SpanGuard::childSpan(consensus::span::accept, ctx);
|
||||
@endcode
|
||||
|
||||
4. Conditional check (rarely needed — methods are no-ops on null):
|
||||
@code
|
||||
auto span = SpanGuard::span(
|
||||
TraceCategory::Rpc, rpc_span::prefix::rpc, "request");
|
||||
TraceCategory::Rpc, rpc_span::prefix::rpc, rpc_span::op::httpRequest);
|
||||
if (span) {
|
||||
// expensive attribute computation only when active
|
||||
span.setAttribute(rpc_span::attr::requestPayloadSize, computeSize());
|
||||
@@ -93,7 +101,7 @@
|
||||
5. Tail-based filtering via discard():
|
||||
@code
|
||||
auto span = SpanGuard::span(
|
||||
TraceCategory::Transactions, seg::tx, "process");
|
||||
TraceCategory::Transactions, tx_span::prefix::tx, tx_span::op::process);
|
||||
auto result = preflight(tx);
|
||||
if (result != tesSUCCESS) {
|
||||
span.discard(); // drop span, never exported
|
||||
|
||||
@@ -163,13 +163,16 @@ public:
|
||||
/** Path to a CA certificate bundle for TLS verification. */
|
||||
std::string tlsCertPath;
|
||||
|
||||
/** Head-based sampling ratio in [0.0, 1.0]. 1.0 = trace everything.
|
||||
This is a head-based (pre-decision) sampler using
|
||||
TraceIdRatioBasedSampler — the decision to record or drop a
|
||||
trace is made before the root span starts. For post-hoc
|
||||
(tail-based) filtering, see SpanGuard::discard().
|
||||
/** Head-based sampling ratio. Intentionally fixed at 1.0 (sample
|
||||
everything) and NOT read from config. A per-node ratio would let
|
||||
nodes make divergent keep/drop decisions for the same distributed
|
||||
trace, producing broken/partial traces. The ratio sampler is wrapped
|
||||
in a ParentBasedSampler (see Telemetry.cpp) so spans inheriting a
|
||||
remote parent honor the upstream sampled flag. Volume reduction is
|
||||
delegated to the collector's tail sampling; for node-local post-hoc
|
||||
dropping see SpanGuard::discard().
|
||||
*/
|
||||
double samplingRatio = 1.0;
|
||||
double const samplingRatio = 1.0;
|
||||
|
||||
/** Maximum number of spans per batch export. */
|
||||
std::uint32_t batchSize = 512;
|
||||
|
||||
@@ -31,6 +31,7 @@
|
||||
#include <opentelemetry/sdk/trace/batch_span_processor_options.h>
|
||||
#include <opentelemetry/sdk/trace/processor.h>
|
||||
#include <opentelemetry/sdk/trace/sampler.h>
|
||||
#include <opentelemetry/sdk/trace/samplers/parent_factory.h>
|
||||
#include <opentelemetry/sdk/trace/samplers/trace_id_ratio.h>
|
||||
#include <opentelemetry/sdk/trace/tracer_provider.h>
|
||||
#include <opentelemetry/sdk/trace/tracer_provider_factory.h>
|
||||
@@ -301,8 +302,15 @@ public:
|
||||
{std::string(attr::networkType), setup_.networkType}, // LCOV_EXCL_LINE
|
||||
});
|
||||
|
||||
// Configure sampler
|
||||
auto sampler = std::make_unique<trace_sdk::TraceIdRatioBasedSampler>(setup_.samplingRatio);
|
||||
// Configure sampler. Head sampling is fixed at 1.0 (sample everything);
|
||||
// setup_.samplingRatio is not config-driven. Wrap the ratio sampler in a
|
||||
// ParentBasedSampler so spans with a remote parent honor the upstream
|
||||
// sampled flag — this keeps keep/drop decisions coherent for a single
|
||||
// distributed trace spanning multiple nodes. Volume reduction is left to
|
||||
// the collector's tail sampling.
|
||||
auto rootSampler =
|
||||
std::make_shared<trace_sdk::TraceIdRatioBasedSampler>(setup_.samplingRatio);
|
||||
auto sampler = trace_sdk::ParentBasedSamplerFactory::Create(std::move(rootSampler));
|
||||
|
||||
// Create TracerProvider
|
||||
sdkProvider_ = trace_sdk::TracerProviderFactory::Create(
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
#include <xrpl/basics/BasicConfig.h>
|
||||
#include <xrpl/telemetry/Telemetry.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <chrono>
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
|
||||
@@ -18,6 +18,45 @@ namespace xrpl::telemetry {
|
||||
|
||||
namespace {
|
||||
|
||||
/** Config key names for the [telemetry] section.
|
||||
|
||||
Each must match the corresponding option documented in
|
||||
cfg/xrpld-example.cfg verbatim. Defined as `char const*` so they
|
||||
pass to Section::valueOr() (which takes `std::string const&`)
|
||||
without an explicit conversion, exactly as a literal would.
|
||||
*/
|
||||
namespace key {
|
||||
constexpr char const* enabled = "enabled";
|
||||
constexpr char const* serviceName = "service_name";
|
||||
constexpr char const* serviceInstanceId = "service_instance_id";
|
||||
constexpr char const* endpoint = "endpoint";
|
||||
constexpr char const* useTls = "use_tls";
|
||||
constexpr char const* tlsCaCert = "tls_ca_cert";
|
||||
constexpr char const* batchSize = "batch_size";
|
||||
constexpr char const* batchDelayMs = "batch_delay_ms";
|
||||
constexpr char const* maxQueueSize = "max_queue_size";
|
||||
constexpr char const* traceTransactions = "trace_transactions";
|
||||
constexpr char const* traceConsensus = "trace_consensus";
|
||||
constexpr char const* traceRpc = "trace_rpc";
|
||||
constexpr char const* tracePeer = "trace_peer";
|
||||
constexpr char const* traceLedger = "trace_ledger";
|
||||
} // namespace key
|
||||
|
||||
/** Default values applied when a key is absent from the config.
|
||||
|
||||
@note serviceName mirrors SystemParameters' systemName() ("xrpld") but
|
||||
is duplicated here as a literal: the telemetry module deliberately does
|
||||
not link xrpl.libxrpl.protocol, so including SystemParameters.h would
|
||||
introduce an undeclared cross-module dependency.
|
||||
*/
|
||||
namespace dflt {
|
||||
constexpr char const* serviceName = "xrpld";
|
||||
constexpr char const* endpoint = "http://localhost:4318/v1/traces";
|
||||
constexpr std::uint32_t batchSize = 512u;
|
||||
constexpr std::uint32_t batchDelayMs = 5000u;
|
||||
constexpr std::uint32_t maxQueueSize = 2048u;
|
||||
} // namespace dflt
|
||||
|
||||
/** Derive a human-readable network type label from the numeric network ID.
|
||||
@param networkId The network identifier from [network_id] config.
|
||||
@return "mainnet", "testnet", "devnet", or "unknown" for other values.
|
||||
@@ -49,33 +88,35 @@ setupTelemetry(
|
||||
{
|
||||
Telemetry::Setup setup;
|
||||
|
||||
setup.enabled = section.valueOr<int>("enabled", 0) != 0;
|
||||
setup.serviceName = section.valueOr<std::string>("service_name", "xrpld");
|
||||
setup.enabled = section.valueOr<int>(key::enabled, 0) != 0;
|
||||
setup.serviceName = section.valueOr<std::string>(key::serviceName, dflt::serviceName);
|
||||
setup.serviceVersion = version;
|
||||
setup.serviceInstanceId = section.valueOr<std::string>("service_instance_id", nodePublicKey);
|
||||
setup.serviceInstanceId = section.valueOr<std::string>(key::serviceInstanceId, nodePublicKey);
|
||||
|
||||
setup.exporterEndpoint =
|
||||
section.valueOr<std::string>("endpoint", "http://localhost:4318/v1/traces");
|
||||
setup.exporterEndpoint = section.valueOr<std::string>(key::endpoint, dflt::endpoint);
|
||||
|
||||
setup.useTls = section.valueOr<int>("use_tls", 0) != 0;
|
||||
setup.tlsCertPath = section.valueOr<std::string>("tls_ca_cert", "");
|
||||
setup.useTls = section.valueOr<int>(key::useTls, 0) != 0;
|
||||
setup.tlsCertPath = section.valueOr<std::string>(key::tlsCaCert, "");
|
||||
|
||||
setup.samplingRatio = section.valueOr<double>("sampling_ratio", 1.0);
|
||||
setup.samplingRatio = std::clamp(setup.samplingRatio, 0.0, 1.0);
|
||||
// Head sampling is intentionally fixed at 1.0 (sample everything) and is
|
||||
// not read from config. A per-node ratio would let nodes make divergent
|
||||
// keep/drop decisions for the same distributed trace, producing broken
|
||||
// traces; volume reduction is delegated to the collector's tail sampling.
|
||||
// setup.samplingRatio is a const member fixed at 1.0; nothing to parse.
|
||||
|
||||
setup.batchSize = section.valueOr<std::uint32_t>("batch_size", 512u);
|
||||
setup.batchDelay =
|
||||
std::chrono::milliseconds{section.valueOr<std::uint32_t>("batch_delay_ms", 5000u)};
|
||||
setup.maxQueueSize = section.valueOr<std::uint32_t>("max_queue_size", 2048u);
|
||||
setup.batchSize = section.valueOr<std::uint32_t>(key::batchSize, dflt::batchSize);
|
||||
setup.batchDelay = std::chrono::milliseconds{
|
||||
section.valueOr<std::uint32_t>(key::batchDelayMs, dflt::batchDelayMs)};
|
||||
setup.maxQueueSize = section.valueOr<std::uint32_t>(key::maxQueueSize, dflt::maxQueueSize);
|
||||
|
||||
setup.networkId = networkId;
|
||||
setup.networkType = networkTypeFromId(networkId);
|
||||
|
||||
setup.traceTransactions = section.valueOr<int>("trace_transactions", 1) != 0;
|
||||
setup.traceConsensus = section.valueOr<int>("trace_consensus", 1) != 0;
|
||||
setup.traceRpc = section.valueOr<int>("trace_rpc", 1) != 0;
|
||||
setup.tracePeer = section.valueOr<int>("trace_peer", 1) != 0;
|
||||
setup.traceLedger = section.valueOr<int>("trace_ledger", 1) != 0;
|
||||
setup.traceTransactions = section.valueOr<int>(key::traceTransactions, 1) != 0;
|
||||
setup.traceConsensus = section.valueOr<int>(key::traceConsensus, 1) != 0;
|
||||
setup.traceRpc = section.valueOr<int>(key::traceRpc, 1) != 0;
|
||||
setup.tracePeer = section.valueOr<int>(key::tracePeer, 1) != 0;
|
||||
setup.traceLedger = section.valueOr<int>(key::traceLedger, 1) != 0;
|
||||
|
||||
return setup;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user