Phase 1b: Telemetry core infrastructure

Add the OpenTelemetry telemetry library and supporting infrastructure:

Build system:
- Conan opentelemetry-cpp dependency with OTLP/gRPC exporter
- CMake integration for xrpl_telemetry library target
- Levelization ordering updates

Core library (libxrpl):
- Telemetry class: provider lifecycle, span creation, sampling config
- SpanGuard: RAII span management with attribute/exception helpers
- TelemetryConfig: parse [telemetry] config section
- NullTelemetry: no-op implementation when telemetry is disabled

Application integration:
- Telemetry member in ApplicationImp with start/stop lifecycle
- getTelemetry() interface on Application
- ServiceRegistry telemetry accessor

Docker observability stack:
- OTel Collector, Jaeger, Grafana docker-compose setup
- Collector config with OTLP gRPC receiver and Jaeger exporter

Config and docs:
- Example telemetry config section in xrpld-example.cfg
- Build documentation for telemetry setup

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Pratik Mankawde
2026-02-26 16:17:27 +00:00
parent fd18cf9e01
commit 1c72714e02
17 changed files with 1313 additions and 4 deletions

View File

@@ -32,6 +32,8 @@ libxrpl.server > xrpl.server
libxrpl.shamap > xrpl.basics
libxrpl.shamap > xrpl.protocol
libxrpl.shamap > xrpl.shamap
libxrpl.telemetry > xrpl.basics
libxrpl.telemetry > xrpl.telemetry
libxrpl.tx > xrpl.basics
libxrpl.tx > xrpl.conditions
libxrpl.tx > xrpl.core
@@ -206,6 +208,7 @@ xrpl.server > xrpl.shamap
xrpl.shamap > xrpl.basics
xrpl.shamap > xrpl.nodestore
xrpl.shamap > xrpl.protocol
xrpl.telemetry > xrpl.basics
xrpl.tx > xrpl.basics
xrpl.tx > xrpl.core
xrpl.tx > xrpl.ledger
@@ -224,6 +227,7 @@ xrpld.app > xrpl.rdb
xrpld.app > xrpl.resource
xrpld.app > xrpl.server
xrpld.app > xrpl.shamap
xrpld.app > xrpl.telemetry
xrpld.app > xrpl.tx
xrpld.consensus > xrpl.basics
xrpld.consensus > xrpl.json

View File

@@ -117,6 +117,18 @@ if(rocksdb)
target_link_libraries(xrpl_libs INTERFACE RocksDB::rocksdb)
endif()
# OpenTelemetry distributed tracing (optional).
# When ON, links against opentelemetry-cpp and defines XRPL_ENABLE_TELEMETRY
# so that tracing macros in TracingInstrumentation.h are compiled in.
# When OFF (default), all tracing code compiles to no-ops with zero overhead.
# Enable via: conan install -o telemetry=True, or cmake -Dtelemetry=ON.
option(telemetry "Enable OpenTelemetry tracing" OFF)
if (telemetry)
find_package(opentelemetry-cpp CONFIG REQUIRED)
add_compile_definitions(XRPL_ENABLE_TELEMETRY)
message(STATUS "OpenTelemetry tracing enabled")
endif ()
# Work around changes to Conan recipe for now.
if(TARGET nudb::core)
set(nudb nudb::core)

View File

@@ -1529,3 +1529,46 @@ validators.txt
# set to ssl_verify to 0.
[ssl_verify]
1
#-------------------------------------------------------------------------------
#
# 11. Telemetry (OpenTelemetry Tracing)
#
#-------------------------------------------------------------------------------
#
# Enables distributed tracing via OpenTelemetry. Requires building with
# -DXRPL_ENABLE_TELEMETRY=ON (telemetry Conan option).
#
# [telemetry]
#
# enabled=0
#
# Enable or disable telemetry at runtime. Default: 0 (disabled).
#
# endpoint=http://localhost:4318/v1/traces
#
# The OpenTelemetry Collector endpoint (OTLP/HTTP). Default: http://localhost:4318/v1/traces.
#
# exporter=otlp_http
#
# Exporter type: otlp_http. Default: otlp_http.
#
# sampling_ratio=1.0
#
# Fraction of traces to sample (0.0 to 1.0). Default: 1.0 (all traces).
#
# trace_rpc=1
#
# Enable RPC request tracing. Default: 1.
#
# trace_transactions=1
#
# Enable transaction lifecycle tracing. Default: 1.
#
# trace_consensus=1
#
# Enable consensus round tracing. Default: 1.
#
# trace_peer=0
#
# Enable peer message tracing (high volume). Default: 0.
#

View File

@@ -180,6 +180,17 @@ target_link_libraries(
add_module(xrpl tx)
target_link_libraries(xrpl.libxrpl.tx PUBLIC xrpl.libxrpl.ledger)
# Telemetry module — OpenTelemetry distributed tracing support.
# Sources: include/xrpl/telemetry/ (headers), src/libxrpl/telemetry/ (impl).
# When telemetry=ON, links the Conan-provided umbrella target
# opentelemetry-cpp::opentelemetry-cpp (individual component targets like
# ::api, ::sdk are not available in the Conan package).
add_module(xrpl telemetry)
target_link_libraries(xrpl.libxrpl.telemetry PUBLIC xrpl.libxrpl.basics xrpl.libxrpl.beast)
if (telemetry)
target_link_libraries(xrpl.libxrpl.telemetry PUBLIC opentelemetry-cpp::opentelemetry-cpp)
endif ()
add_library(xrpl.libxrpl)
set_target_properties(xrpl.libxrpl PROPERTIES OUTPUT_NAME xrpl)
@@ -210,6 +221,7 @@ target_link_modules(
resource
server
shamap
telemetry
tx
)

View File

@@ -10,10 +10,13 @@
"rocksdb/10.5.1#4a197eca381a3e5ae8adf8cffa5aacd0%1765850186.86",
"re2/20230301#ca3b241baec15bd31ea9187150e0b333%1765850148.103",
"protobuf/6.32.1#f481fd276fc23a33b85a3ed1e898b693%1765850161.038",
"openssl/3.5.5#05a4ac5b7323f7a329b2db1391d9941f%1769599205.414",
"opentelemetry-cpp/1.18.0#efd9851e173f8a13b9c7d35232de8cf1%1750409186.472",
"openssl/3.5.5#05a4ac5b7323f7a329b2db1391d9941f%1770229825.601",
"nudb/2.0.9#0432758a24204da08fee953ec9ea03cb%1769436073.32",
"nlohmann_json/3.11.3#45828be26eb619a2e04ca517bb7b828d%1701220705.259",
"lz4/1.10.0#59fc63cac7f10fbe8e05c7e62c2f3504%1765850143.914",
"libiconv/1.17#1e65319e945f2d31941a9d28cc13c058%1765842973.492",
"libcurl/8.18.0#364bc3755cb9ef84ed9a7ae9c7efc1c1%1770984390.024",
"libbacktrace/cci.20210118#a7691bfccd8caaf66309df196790a5a1%1765842973.03",
"libarchive/3.8.1#ffee18995c706e02bf96e7a2f7042e0d%1765850144.736",
"jemalloc/5.3.0#e951da9cf599e956cebc117880d2d9f8%1729241615.244",
@@ -30,9 +33,15 @@
"zlib/1.3.1#b8bc2603263cf7eccbd6e17e66b0ed76%1765850150.075",
"strawberryperl/5.32.1.1#707032463aa0620fa17ec0d887f5fe41%1765850165.196",
"protobuf/6.32.1#f481fd276fc23a33b85a3ed1e898b693%1765850161.038",
"pkgconf/2.5.1#93c2051284cba1279494a43a4fcfeae2%1757684701.089",
"opentelemetry-proto/1.4.0#4096a3b05916675ef9628f3ffd571f51%1732731336.11",
"ninja/1.13.2#c8c5dc2a52ed6e4e42a66d75b4717ceb%1764096931.974",
"nasm/2.16.01#31e26f2ee3c4346ecd347911bd126904%1765850144.707",
"msys2/cci.latest#eea83308ad7e9023f7318c60d5a9e6cb%1770199879.083",
"meson/1.10.0#60786758ea978964c24525de19603cf4%1768294926.103",
"m4/1.4.19#70dc8bbb33e981d119d2acc0175cf381%1763158052.846",
"libtool/2.4.7#14e7739cc128bc1623d2ed318008e47e%1755679003.847",
"gnu-config/cci.20210814#466e9d4d7779e1c142443f7ea44b4284%1762363589.329",
"cmake/4.2.0#ae0a44f44a1ef9ab68fd4b3e9a1f8671%1765850153.937",
"cmake/3.31.10#313d16a1aa16bbdb2ca0792467214b76%1765850153.479",
"b2/5.3.3#107c15377719889654eb9a162a673975%1765850144.355",

View File

@@ -22,6 +22,7 @@ class Xrpl(ConanFile):
"rocksdb": [True, False],
"shared": [True, False],
"static": [True, False],
"telemetry": [True, False],
"tests": [True, False],
"unity": [True, False],
"xrpld": [True, False],
@@ -54,6 +55,7 @@ class Xrpl(ConanFile):
"rocksdb": True,
"shared": False,
"static": True,
"telemetry": True,
"tests": False,
"unity": False,
"xrpld": False,
@@ -140,6 +142,10 @@ class Xrpl(ConanFile):
self.requires("jemalloc/5.3.0")
if self.options.rocksdb:
self.requires("rocksdb/10.5.1")
# OpenTelemetry C++ SDK for distributed tracing (optional).
# Provides OTLP/HTTP exporter, batch span processor, and trace API.
if self.options.telemetry:
self.requires("opentelemetry-cpp/1.18.0")
self.requires("xxhash/0.8.3", **transitive_headers_opt)
exports_sources = (
@@ -168,6 +174,7 @@ class Xrpl(ConanFile):
tc.variables["rocksdb"] = self.options.rocksdb
tc.variables["BUILD_SHARED_LIBS"] = self.options.shared
tc.variables["static"] = self.options.static
tc.variables["telemetry"] = self.options.telemetry
tc.variables["unity"] = self.options.unity
tc.variables["xrpld"] = self.options.xrpld
tc.generate()
@@ -220,3 +227,5 @@ class Xrpl(ConanFile):
]
if self.options.rocksdb:
libxrpl.requires.append("rocksdb::librocksdb")
if self.options.telemetry:
libxrpl.requires.append("opentelemetry-cpp::opentelemetry-cpp")

View File

@@ -0,0 +1,60 @@
# Docker Compose stack for rippled OpenTelemetry observability.
#
# Provides three services for local development:
# - otel-collector: receives OTLP traces from rippled, batches and
# forwards them to Jaeger. Listens on ports 4317 (gRPC) and 4318 (HTTP).
# - jaeger: all-in-one tracing backend with UI on port 16686.
# - grafana: dashboards on port 3000, pre-configured with Jaeger datasource.
#
# Usage:
# docker compose -f docker/telemetry/docker-compose.yml up -d
#
# Configure rippled to export traces by adding to xrpld.cfg:
# [telemetry]
# enabled=1
# endpoint=http://localhost:4318/v1/traces
version: "3.8"
services:
otel-collector:
image: otel/opentelemetry-collector-contrib:latest
command: ["--config=/etc/otel-collector-config.yaml"]
ports:
- "4317:4317" # OTLP gRPC
- "4318:4318" # OTLP HTTP
- "13133:13133" # Health check
volumes:
- ./otel-collector-config.yaml:/etc/otel-collector-config.yaml:ro
depends_on:
- jaeger
networks:
- rippled-telemetry
jaeger:
image: jaegertracing/all-in-one:latest
environment:
- COLLECTOR_OTLP_ENABLED=true
ports:
- "16686:16686" # Jaeger UI
- "14250:14250" # gRPC
networks:
- rippled-telemetry
grafana:
image: grafana/grafana:latest
environment:
- GF_AUTH_ANONYMOUS_ENABLED=true
- GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
ports:
- "3000:3000"
volumes:
- ./grafana/provisioning:/etc/grafana/provisioning:ro
depends_on:
- jaeger
networks:
- rippled-telemetry
networks:
rippled-telemetry:
driver: bridge

View File

@@ -0,0 +1,12 @@
# Grafana datasource provisioning for the rippled telemetry stack.
# Auto-configures Jaeger as a trace data source on Grafana startup.
# Access Grafana at http://localhost:3000, then use Explore -> Jaeger
# to browse rippled traces.
apiVersion: 1
datasources:
- name: Jaeger
type: jaeger
access: proxy
url: http://jaeger:16686

View File

@@ -0,0 +1,33 @@
# OpenTelemetry Collector configuration for rippled development.
#
# Pipeline: OTLP receiver -> batch processor -> debug exporter + Jaeger.
# rippled sends traces via OTLP/HTTP to port 4318. The collector batches
# them and forwards to Jaeger via OTLP/gRPC on the Docker network.
receivers:
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
processors:
batch:
timeout: 1s
send_batch_size: 100
exporters:
debug:
verbosity: detailed
otlp/jaeger:
endpoint: jaeger:4317
tls:
insecure: true
service:
pipelines:
traces:
receivers: [otlp]
processors: [batch]
exporters: [debug, otlp/jaeger]

278
docs/build/telemetry.md vendored Normal file
View File

@@ -0,0 +1,278 @@
# OpenTelemetry Tracing for Rippled
This document explains how to build rippled with OpenTelemetry distributed tracing support, configure the runtime telemetry options, and set up the observability backend to view traces.
- [OpenTelemetry Tracing for Rippled](#opentelemetry-tracing-for-rippled)
- [Overview](#overview)
- [Building with Telemetry](#building-with-telemetry)
- [Summary](#summary)
- [Build steps](#build-steps)
- [Install dependencies](#install-dependencies)
- [Call CMake](#call-cmake)
- [Build](#build)
- [Building without telemetry](#building-without-telemetry)
- [Runtime Configuration](#runtime-configuration)
- [Configuration options](#configuration-options)
- [Observability Stack](#observability-stack)
- [Start the stack](#start-the-stack)
- [Verify the stack](#verify-the-stack)
- [View traces in Jaeger](#view-traces-in-jaeger)
- [Running Tests](#running-tests)
- [Troubleshooting](#troubleshooting)
- [No traces appear in Jaeger](#no-traces-appear-in-jaeger)
- [Conan lockfile error](#conan-lockfile-error)
- [CMake target not found](#cmake-target-not-found)
- [Architecture](#architecture)
- [Key files](#key-files)
- [Conditional compilation](#conditional-compilation)
## Overview
Rippled supports optional [OpenTelemetry](https://opentelemetry.io/) distributed tracing.
When enabled, it instruments RPC requests with trace spans that are exported via
OTLP/HTTP to an OpenTelemetry Collector, which forwards them to a tracing backend
such as Jaeger.
Telemetry is **off by default** at both compile time and runtime:
- **Compile time**: The Conan option `telemetry` and CMake option `telemetry` must be set to `True`/`ON`.
When disabled, all tracing macros compile to `((void)0)` with zero overhead.
- **Runtime**: The `[telemetry]` config section must set `enabled=1`.
When disabled at runtime, a no-op implementation is used.
## Building with Telemetry
### Summary
Follow the same instructions as mentioned in [BUILD.md](../../BUILD.md) but with the following changes:
1. Pass `-o telemetry=True` to `conan install` to pull the `opentelemetry-cpp` dependency.
2. CMake will automatically pick up `telemetry=ON` from the Conan-generated toolchain.
3. Build as usual.
---
### Build steps
```bash
cd /path/to/rippled
rm -rf .build
mkdir .build
cd .build
```
#### Install dependencies
The `telemetry` option adds `opentelemetry-cpp/1.18.0` as a dependency.
If the Conan lockfile does not yet include this package, bypass it with `--lockfile=""`.
```bash
conan install .. \
--output-folder . \
--build missing \
--settings build_type=Debug \
-o telemetry=True \
-o tests=True \
-o xrpld=True \
--lockfile=""
```
> **Note**: The first build with telemetry may take longer as `opentelemetry-cpp`
> and its transitive dependencies are compiled from source.
#### Call CMake
The Conan-generated toolchain file sets `telemetry=ON` automatically.
No additional CMake flags are needed beyond the standard ones.
```bash
cmake .. -G Ninja \
-DCMAKE_TOOLCHAIN_FILE:FILEPATH=build/generators/conan_toolchain.cmake \
-DCMAKE_BUILD_TYPE=Debug \
-Dtests=ON -Dxrpld=ON
```
You should see in the CMake output:
```
-- OpenTelemetry tracing enabled
```
#### Build
```bash
cmake --build . --parallel $(nproc)
```
### Building without telemetry
Omit the `-o telemetry=True` option (or pass `-o telemetry=False`).
The `opentelemetry-cpp` dependency will not be downloaded,
the `XRPL_ENABLE_TELEMETRY` preprocessor define will not be set,
and all tracing macros will compile to no-ops.
The resulting binary is identical to one built before telemetry support was added.
## Runtime Configuration
Add a `[telemetry]` section to your `xrpld.cfg` file:
```ini
[telemetry]
enabled=1
service_name=rippled
endpoint=http://localhost:4318/v1/traces
sampling_ratio=1.0
trace_rpc=1
trace_transactions=1
trace_consensus=1
trace_peer=0
```
### Configuration options
| Option | Type | Default | Description |
| --------------------- | ------ | --------------------------------- | -------------------------------------------------- |
| `enabled` | int | `0` | Enable (`1`) or disable (`0`) telemetry at runtime |
| `service_name` | string | `rippled` | Service name reported in traces |
| `service_instance_id` | string | node public key | Unique instance identifier |
| `exporter` | string | `otlp_http` | Exporter type |
| `endpoint` | string | `http://localhost:4318/v1/traces` | OTLP/HTTP collector endpoint |
| `use_tls` | int | `0` | Enable TLS for the exporter connection |
| `tls_ca_cert` | string | (empty) | Path to CA certificate for TLS |
| `sampling_ratio` | double | `1.0` | Fraction of traces to sample (`0.0` to `1.0`) |
| `batch_size` | uint32 | `512` | Maximum spans per export batch |
| `batch_delay_ms` | uint32 | `5000` | Maximum delay (ms) before flushing a batch |
| `max_queue_size` | uint32 | `2048` | Maximum spans queued in memory |
| `trace_rpc` | int | `1` | Enable RPC request tracing |
| `trace_transactions` | int | `1` | Enable transaction lifecycle tracing |
| `trace_consensus` | int | `1` | Enable consensus round tracing |
| `trace_peer` | int | `0` | Enable peer message tracing (high volume) |
| `trace_ledger` | int | `1` | Enable ledger close tracing |
## Observability Stack
A Docker Compose stack is provided in `docker/telemetry/` with three services:
| Service | Port | Purpose |
| ------------------ | ---------------------------------------------- | ---------------------------------------------------- |
| **OTel Collector** | `4317` (gRPC), `4318` (HTTP), `13133` (health) | Receives OTLP spans, batches, and forwards to Jaeger |
| **Jaeger** | `16686` (UI) | Trace storage and visualization |
| **Grafana** | `3000` | Dashboards (Jaeger pre-configured as datasource) |
### Start the stack
```bash
docker compose -f docker/telemetry/docker-compose.yml up -d
```
### Verify the stack
```bash
# Collector health
curl http://localhost:13133
# Jaeger UI
open http://localhost:16686
# Grafana
open http://localhost:3000
```
### View traces in Jaeger
1. Open `http://localhost:16686` in a browser.
2. Select the service name (e.g. `rippled`) from the **Service** dropdown.
3. Click **Find Traces**.
4. Click into any trace to see the span tree and attributes.
Traced RPC operations produce a span hierarchy like:
```
rpc.request
└── rpc.command.server_info (xrpl.rpc.command=server_info, xrpl.rpc.status=success)
```
Each span includes attributes:
- `xrpl.rpc.command` — the RPC method name
- `xrpl.rpc.version` — API version
- `xrpl.rpc.role``admin` or `user`
- `xrpl.rpc.status``success` or `error`
## Running Tests
Unit tests run with the telemetry-enabled build regardless of whether the
observability stack is running. When no collector is available, the exporter
silently drops spans with no impact on test results.
```bash
# Run all RPC tests
./xrpld --unittest=RPCCall,ServerInfo,AccountTx,LedgerRPC,Transaction --unittest-jobs $(nproc)
# Run the full test suite
./xrpld --unittest --unittest-jobs $(nproc)
```
To generate traces during manual testing, start rippled in standalone mode:
```bash
./xrpld --conf /path/to/xrpld.cfg --standalone --start
```
Then send RPC requests:
```bash
curl -s -X POST http://127.0.0.1:5005/ \
-H "Content-Type: application/json" \
-d '{"method":"server_info","params":[{}]}'
```
## Troubleshooting
### No traces appear in Jaeger
1. Confirm the OTel Collector is running: `docker compose -f docker/telemetry/docker-compose.yml ps`
2. Check collector logs for errors: `docker compose -f docker/telemetry/docker-compose.yml logs otel-collector`
3. Confirm `[telemetry] enabled=1` is set in the rippled config.
4. Confirm `endpoint` points to the correct collector address (`http://localhost:4318/v1/traces`).
5. Wait for the batch delay to elapse (default `5000` ms) before checking Jaeger.
### Conan lockfile error
If you see `ERROR: Requirement 'opentelemetry-cpp/1.18.0' not in lockfile 'requires'`,
the lockfile was generated without the telemetry dependency.
Pass `--lockfile=""` to bypass the lockfile, or regenerate it with telemetry enabled.
### CMake target not found
If CMake reports that `opentelemetry-cpp` targets are not found,
ensure you ran `conan install` with `-o telemetry=True` and that the
Conan-generated toolchain file is being used.
The Conan package provides a single umbrella target
`opentelemetry-cpp::opentelemetry-cpp` (not individual component targets).
## Architecture
### Key files
| File | Purpose |
| ---------------------------------------------- | ----------------------------------------------------------- |
| `include/xrpl/telemetry/Telemetry.h` | Abstract telemetry interface and `Setup` struct |
| `include/xrpl/telemetry/SpanGuard.h` | RAII span guard (activates scope, ends span on destruction) |
| `src/libxrpl/telemetry/Telemetry.cpp` | OTel-backed implementation (`TelemetryImpl`) |
| `src/libxrpl/telemetry/TelemetryConfig.cpp` | Config parser (`setup_Telemetry()`) |
| `src/libxrpl/telemetry/NullTelemetry.cpp` | No-op implementation (used when disabled) |
| `src/xrpld/telemetry/TracingInstrumentation.h` | Convenience macros (`XRPL_TRACE_RPC`, etc.) |
| `src/xrpld/rpc/detail/ServerHandler.cpp` | RPC entry point instrumentation |
| `src/xrpld/rpc/detail/RPCHandler.cpp` | Per-command instrumentation |
| `docker/telemetry/docker-compose.yml` | Observability stack (Collector + Jaeger + Grafana) |
| `docker/telemetry/otel-collector-config.yaml` | OTel Collector pipeline configuration |
### Conditional compilation
All OpenTelemetry SDK headers are guarded behind `#ifdef XRPL_ENABLE_TELEMETRY`.
The instrumentation macros in `TracingInstrumentation.h` compile to `((void)0)` when
the define is absent.
At runtime, if `enabled=0` is set in config (or the section is omitted), a
`NullTelemetry` implementation is used that returns no-op spans.
This two-layer approach ensures zero overhead when telemetry is not wanted.

View File

@@ -19,6 +19,9 @@ class Manager;
namespace perf {
class PerfLog;
}
namespace telemetry {
class Telemetry;
}
// This is temporary until we migrate all code to use ServiceRegistry.
class Application;
@@ -205,6 +208,9 @@ public:
virtual perf::PerfLog&
getPerfLog() = 0;
virtual telemetry::Telemetry&
getTelemetry() = 0;
// Configuration and state
virtual bool
isStopping() const = 0;

View File

@@ -0,0 +1,155 @@
#pragma once
/** RAII guard for OpenTelemetry trace spans.
Wraps an OTel Span and Scope together. On construction, the span is
activated on the current thread's context (via Scope). On destruction,
the span is ended and the previous context is restored.
Used by the XRPL_TRACE_* macros in TracingInstrumentation.h. Can also
be stored in std::optional for conditional tracing (move-constructible).
Only compiled when XRPL_ENABLE_TELEMETRY is defined.
*/
#ifdef XRPL_ENABLE_TELEMETRY
#include <opentelemetry/context/runtime_context.h>
#include <opentelemetry/nostd/shared_ptr.h>
#include <opentelemetry/trace/scope.h>
#include <opentelemetry/trace/span.h>
#include <exception>
#include <string_view>
namespace xrpl {
namespace telemetry {
/** RAII wrapper that activates a span on construction and ends it on
destruction. Non-copyable but move-constructible so it can be held
in std::optional for conditional tracing.
*/
class SpanGuard
{
/** The OTel span being guarded. Set to nullptr after move. */
opentelemetry::nostd::shared_ptr<opentelemetry::trace::Span> span_;
/** Scope that activates span_ on the current thread's context stack. */
opentelemetry::trace::Scope scope_;
public:
/** Construct a guard that activates @p span on the current context.
@param span The span to guard. Ended in the destructor.
*/
explicit SpanGuard(opentelemetry::nostd::shared_ptr<opentelemetry::trace::Span> span)
: span_(std::move(span)), scope_(span_)
{
}
/** Non-copyable. Move-constructible to support std::optional.
The move constructor creates a new Scope from the transferred span,
because Scope is not movable.
*/
SpanGuard(SpanGuard const&) = delete;
SpanGuard&
operator=(SpanGuard const&) = delete;
SpanGuard(SpanGuard&& other) noexcept : span_(std::move(other.span_)), scope_(span_)
{
other.span_ = nullptr;
}
SpanGuard&
operator=(SpanGuard&&) = delete;
~SpanGuard()
{
if (span_)
span_->End();
}
/** @return A mutable reference to the underlying span. */
opentelemetry::trace::Span&
span()
{
return *span_;
}
/** @return A const reference to the underlying span. */
opentelemetry::trace::Span const&
span() const
{
return *span_;
}
/** Mark the span status as OK. */
void
setOk()
{
span_->SetStatus(opentelemetry::trace::StatusCode::kOk);
}
/** Set an explicit status code on the span.
@param code The OTel status code.
@param description Optional human-readable status description.
*/
void
setStatus(opentelemetry::trace::StatusCode code, std::string_view description = "")
{
span_->SetStatus(code, std::string(description));
}
/** Set a key-value attribute on the span.
@param key Attribute name (e.g. "xrpl.rpc.command").
@param value Attribute value (string, int, bool, etc.).
*/
template <typename T>
void
setAttribute(std::string_view key, T&& value)
{
span_->SetAttribute(
opentelemetry::nostd::string_view(key.data(), key.size()), std::forward<T>(value));
}
/** Add a named event to the span's timeline.
@param name Event name.
*/
void
addEvent(std::string_view name)
{
span_->AddEvent(std::string(name));
}
/** Record an exception as a span event following OTel semantic
conventions, and mark the span status as error.
@param e The exception to record.
*/
void
recordException(std::exception const& e)
{
span_->AddEvent(
"exception",
{{"exception.type", "std::exception"}, {"exception.message", std::string(e.what())}});
span_->SetStatus(opentelemetry::trace::StatusCode::kError, e.what());
}
/** Return the current OTel context.
Useful for creating child spans on a different thread by passing
this context to Telemetry::startSpan(name, parentContext).
*/
opentelemetry::context::Context
context() const
{
return opentelemetry::context::RuntimeContext::GetCurrent();
}
};
} // namespace telemetry
} // namespace xrpl
#endif // XRPL_ENABLE_TELEMETRY

View File

@@ -0,0 +1,209 @@
#pragma once
/** Abstract interface for OpenTelemetry distributed tracing.
Provides the Telemetry base class that all components use to create trace
spans. Two implementations exist:
- TelemetryImpl (Telemetry.cpp): real OTel SDK integration, compiled
only when XRPL_ENABLE_TELEMETRY is defined and enabled at runtime.
- NullTelemetry (NullTelemetry.cpp): no-op stub used when telemetry is
disabled at compile time or runtime.
The Setup struct holds all configuration parsed from the [telemetry]
section of xrpld.cfg. See TelemetryConfig.cpp for the parser and
cfg/xrpld-example.cfg for the available options.
OTel SDK headers are conditionally included behind XRPL_ENABLE_TELEMETRY
so that builds without telemetry have zero dependency on opentelemetry-cpp.
*/
#include <xrpl/basics/BasicConfig.h>
#include <xrpl/beast/utility/Journal.h>
#include <chrono>
#include <memory>
#include <string>
#include <string_view>
#ifdef XRPL_ENABLE_TELEMETRY
#include <opentelemetry/context/context.h>
#include <opentelemetry/nostd/shared_ptr.h>
#include <opentelemetry/trace/span.h>
#include <opentelemetry/trace/tracer.h>
#endif
namespace xrpl {
namespace telemetry {
class Telemetry
{
public:
/** Configuration parsed from the [telemetry] section of xrpld.cfg.
All fields have sensible defaults so the section can be minimal
or omitted entirely. See TelemetryConfig.cpp for the parser.
*/
struct Setup
{
/** Master switch: true to enable tracing at runtime. */
bool enabled = false;
/** OTel resource attribute `service.name`. */
std::string serviceName = "rippled";
/** OTel resource attribute `service.version` (set from BuildInfo). */
std::string serviceVersion;
/** OTel resource attribute `service.instance.id` (defaults to node
public key). */
std::string serviceInstanceId;
/** Exporter type: currently only "otlp_http" is supported. */
std::string exporterType = "otlp_http";
/** OTLP/HTTP endpoint URL where spans are sent. */
std::string exporterEndpoint = "http://localhost:4318/v1/traces";
/** Whether to use TLS for the exporter connection. */
bool useTls = false;
/** Path to a CA certificate bundle for TLS verification. */
std::string tlsCertPath;
/** Head-based sampling ratio in [0.0, 1.0]. 1.0 = trace everything. */
double samplingRatio = 1.0;
/** Maximum number of spans per batch export. */
std::uint32_t batchSize = 512;
/** Delay between batch exports. */
std::chrono::milliseconds batchDelay{5000};
/** Maximum number of spans queued before dropping. */
std::uint32_t maxQueueSize = 2048;
/** Network identifier, added as an OTel resource attribute. */
std::uint32_t networkId = 0;
/** Network type label (e.g. "mainnet", "testnet", "devnet"). */
std::string networkType = "mainnet";
/** Enable tracing for transaction processing. */
bool traceTransactions = true;
/** Enable tracing for consensus rounds. */
bool traceConsensus = true;
/** Enable tracing for RPC request handling. */
bool traceRpc = true;
/** Enable tracing for peer-to-peer messages (disabled by default
due to high volume). */
bool tracePeer = false;
/** Enable tracing for ledger close/accept. */
bool traceLedger = true;
};
virtual ~Telemetry() = default;
/** Initialize the tracing pipeline (exporter, processor, provider).
Call after construction.
*/
virtual void
start() = 0;
/** Flush pending spans and shut down the tracing pipeline.
Call before destruction.
*/
virtual void
stop() = 0;
/** @return true if this instance is actively exporting spans. */
virtual bool
isEnabled() const = 0;
/** @return true if transaction processing should be traced. */
virtual bool
shouldTraceTransactions() const = 0;
/** @return true if consensus rounds should be traced. */
virtual bool
shouldTraceConsensus() const = 0;
/** @return true if RPC request handling should be traced. */
virtual bool
shouldTraceRpc() const = 0;
/** @return true if peer-to-peer messages should be traced. */
virtual bool
shouldTracePeer() const = 0;
#ifdef XRPL_ENABLE_TELEMETRY
/** Get or create a named tracer instance.
@param name Tracer name used to identify the instrumentation library.
@return A shared pointer to the Tracer.
*/
virtual opentelemetry::nostd::shared_ptr<opentelemetry::trace::Tracer>
getTracer(std::string_view name = "rippled") = 0;
/** Start a new span on the current thread's context.
The span becomes a child of the current active span (if any) via
OpenTelemetry's context propagation.
@param name Span name (typically "rpc.command.<cmd>").
@param kind The span kind (defaults to kInternal).
@return A shared pointer to the new Span.
*/
virtual opentelemetry::nostd::shared_ptr<opentelemetry::trace::Span>
startSpan(
std::string_view name,
opentelemetry::trace::SpanKind kind = opentelemetry::trace::SpanKind::kInternal) = 0;
/** Start a new span with an explicit parent context.
Use this overload when the parent span is not on the current
thread's context stack (e.g. cross-thread trace propagation).
@param name Span name.
@param parentContext The parent span's context.
@param kind The span kind (defaults to kInternal).
@return A shared pointer to the new Span.
*/
virtual opentelemetry::nostd::shared_ptr<opentelemetry::trace::Span>
startSpan(
std::string_view name,
opentelemetry::context::Context const& parentContext,
opentelemetry::trace::SpanKind kind = opentelemetry::trace::SpanKind::kInternal) = 0;
#endif
};
/** Create a Telemetry instance.
Returns a TelemetryImpl when setup.enabled is true, or a
NullTelemetry no-op stub otherwise.
@param setup Configuration from the [telemetry] config section.
@param journal Journal for log output during initialization.
*/
std::unique_ptr<Telemetry>
make_Telemetry(Telemetry::Setup const& setup, beast::Journal journal);
/** Parse the [telemetry] config section into a Setup struct.
@param section The [telemetry] config section.
@param nodePublicKey Node public key, used as default instance ID.
@param version Build version string.
@return A populated Setup struct with defaults for missing values.
*/
Telemetry::Setup
setup_Telemetry(
Section const& section,
std::string const& nodePublicKey,
std::string const& version);
} // namespace telemetry
} // namespace xrpl

View File

@@ -0,0 +1,121 @@
/** No-op implementation of the Telemetry interface.
Always compiled (regardless of XRPL_ENABLE_TELEMETRY). Provides the
make_Telemetry() factory when telemetry is compiled out (#ifndef), which
unconditionally returns a NullTelemetry that does nothing.
When XRPL_ENABLE_TELEMETRY IS defined, the OTel virtual methods
(getTracer, startSpan) return noop tracers/spans. The make_Telemetry()
factory in this file is not used in that case -- Telemetry.cpp provides
its own factory that can return the real TelemetryImpl.
*/
#include <xrpl/telemetry/Telemetry.h>
#ifdef XRPL_ENABLE_TELEMETRY
#include <opentelemetry/trace/noop.h>
#endif
namespace xrpl {
namespace telemetry {
namespace {
/** No-op Telemetry that returns immediately from every method.
Used as the sole implementation when XRPL_ENABLE_TELEMETRY is not
defined, or as a fallback when it is defined but enabled=0.
*/
class NullTelemetry : public Telemetry
{
/** Retained configuration (unused, kept for diagnostic access). */
Setup const setup_;
public:
explicit NullTelemetry(Setup const& setup) : setup_(setup)
{
}
void
start() override
{
}
void
stop() override
{
}
bool
isEnabled() const override
{
return false;
}
bool
shouldTraceTransactions() const override
{
return false;
}
bool
shouldTraceConsensus() const override
{
return false;
}
bool
shouldTraceRpc() const override
{
return false;
}
bool
shouldTracePeer() const override
{
return false;
}
#ifdef XRPL_ENABLE_TELEMETRY
opentelemetry::nostd::shared_ptr<opentelemetry::trace::Tracer>
getTracer(std::string_view) override
{
static auto noopTracer = opentelemetry::nostd::shared_ptr<opentelemetry::trace::Tracer>(
new opentelemetry::trace::NoopTracer());
return noopTracer;
}
opentelemetry::nostd::shared_ptr<opentelemetry::trace::Span>
startSpan(std::string_view, opentelemetry::trace::SpanKind) override
{
return opentelemetry::nostd::shared_ptr<opentelemetry::trace::Span>(
new opentelemetry::trace::NoopSpan(nullptr));
}
opentelemetry::nostd::shared_ptr<opentelemetry::trace::Span>
startSpan(
std::string_view,
opentelemetry::context::Context const&,
opentelemetry::trace::SpanKind) override
{
return opentelemetry::nostd::shared_ptr<opentelemetry::trace::Span>(
new opentelemetry::trace::NoopSpan(nullptr));
}
#endif
};
} // namespace
/** Factory used when XRPL_ENABLE_TELEMETRY is not defined.
Unconditionally returns a NullTelemetry instance.
*/
#ifndef XRPL_ENABLE_TELEMETRY
std::unique_ptr<Telemetry>
make_Telemetry(Telemetry::Setup const& setup, beast::Journal)
{
return std::make_unique<NullTelemetry>(setup);
}
#endif
} // namespace telemetry
} // namespace xrpl

View File

@@ -0,0 +1,279 @@
/** OpenTelemetry SDK implementation of the Telemetry interface.
Compiled only when XRPL_ENABLE_TELEMETRY is defined (via CMake
telemetry=ON). Contains:
- TelemetryImpl: configures the OTel SDK with an OTLP/HTTP exporter,
batch span processor, trace-ID-ratio sampler, and resource attributes.
- NullTelemetryOtel: no-op fallback used when telemetry is compiled in
but disabled at runtime (enabled=0 in config).
- make_Telemetry(): factory that selects the appropriate implementation.
*/
#ifdef XRPL_ENABLE_TELEMETRY
#include <xrpl/basics/Log.h>
#include <xrpl/telemetry/Telemetry.h>
#include <opentelemetry/exporters/otlp/otlp_http_exporter_factory.h>
#include <opentelemetry/exporters/otlp/otlp_http_exporter_options.h>
#include <opentelemetry/sdk/resource/semantic_conventions.h>
#include <opentelemetry/sdk/trace/batch_span_processor_factory.h>
#include <opentelemetry/sdk/trace/batch_span_processor_options.h>
#include <opentelemetry/sdk/trace/sampler.h>
#include <opentelemetry/sdk/trace/samplers/trace_id_ratio.h>
#include <opentelemetry/sdk/trace/tracer_provider.h>
#include <opentelemetry/sdk/trace/tracer_provider_factory.h>
#include <opentelemetry/trace/noop.h>
#include <opentelemetry/trace/provider.h>
namespace xrpl {
namespace telemetry {
namespace {
namespace trace_api = opentelemetry::trace;
namespace trace_sdk = opentelemetry::sdk::trace;
namespace otlp_http = opentelemetry::exporter::otlp;
namespace resource = opentelemetry::sdk::resource;
/** No-op implementation used when XRPL_ENABLE_TELEMETRY is defined but
setup.enabled is false at runtime.
Lives in the anonymous namespace so there is no ODR conflict with the
NullTelemetry in NullTelemetry.cpp.
*/
class NullTelemetryOtel : public Telemetry
{
/** Retained configuration (unused, kept for diagnostic access). */
Setup const setup_;
public:
explicit NullTelemetryOtel(Setup const& setup) : setup_(setup)
{
}
void
start() override
{
}
void
stop() override
{
}
bool
isEnabled() const override
{
return false;
}
bool
shouldTraceTransactions() const override
{
return false;
}
bool
shouldTraceConsensus() const override
{
return false;
}
bool
shouldTraceRpc() const override
{
return false;
}
bool
shouldTracePeer() const override
{
return false;
}
opentelemetry::nostd::shared_ptr<trace_api::Tracer>
getTracer(std::string_view) override
{
static auto noopTracer =
opentelemetry::nostd::shared_ptr<trace_api::Tracer>(new trace_api::NoopTracer());
return noopTracer;
}
opentelemetry::nostd::shared_ptr<trace_api::Span>
startSpan(std::string_view, trace_api::SpanKind) override
{
return opentelemetry::nostd::shared_ptr<trace_api::Span>(new trace_api::NoopSpan(nullptr));
}
opentelemetry::nostd::shared_ptr<trace_api::Span>
startSpan(std::string_view, opentelemetry::context::Context const&, trace_api::SpanKind)
override
{
return opentelemetry::nostd::shared_ptr<trace_api::Span>(new trace_api::NoopSpan(nullptr));
}
};
/** Full OTel SDK implementation that exports trace spans via OTLP/HTTP.
Configures an OTLP/HTTP exporter, batch span processor,
TraceIdRatioBasedSampler, and resource attributes on start().
*/
class TelemetryImpl : public Telemetry
{
/** Configuration from the [telemetry] config section. */
Setup const setup_;
/** Journal used for log output during start/stop. */
beast::Journal const journal_;
/** The SDK TracerProvider that owns the export pipeline.
Held as std::shared_ptr so we can call ForceFlush() on shutdown.
Wrapped in a nostd::shared_ptr when registered as the global provider.
*/
std::shared_ptr<trace_sdk::TracerProvider> sdkProvider_;
public:
TelemetryImpl(Setup const& setup, beast::Journal journal) : setup_(setup), journal_(journal)
{
}
void
start() override
{
JLOG(journal_.info()) << "Telemetry starting: endpoint=" << setup_.exporterEndpoint
<< " sampling=" << setup_.samplingRatio;
// Configure OTLP HTTP exporter
otlp_http::OtlpHttpExporterOptions exporterOpts;
exporterOpts.url = setup_.exporterEndpoint;
if (setup_.useTls)
exporterOpts.ssl_ca_cert_path = setup_.tlsCertPath;
auto exporter = otlp_http::OtlpHttpExporterFactory::Create(exporterOpts);
// Configure batch processor
trace_sdk::BatchSpanProcessorOptions processorOpts;
processorOpts.max_queue_size = setup_.maxQueueSize;
processorOpts.schedule_delay_millis = std::chrono::milliseconds(setup_.batchDelay);
processorOpts.max_export_batch_size = setup_.batchSize;
auto processor =
trace_sdk::BatchSpanProcessorFactory::Create(std::move(exporter), processorOpts);
// Configure resource attributes
auto resourceAttrs = resource::Resource::Create({
{resource::SemanticConventions::kServiceName, setup_.serviceName},
{resource::SemanticConventions::kServiceVersion, setup_.serviceVersion},
{resource::SemanticConventions::kServiceInstanceId, setup_.serviceInstanceId},
{"xrpl.network.id", static_cast<int64_t>(setup_.networkId)},
{"xrpl.network.type", setup_.networkType},
});
// Configure sampler
auto sampler = std::make_unique<trace_sdk::TraceIdRatioBasedSampler>(setup_.samplingRatio);
// Create TracerProvider
sdkProvider_ = trace_sdk::TracerProviderFactory::Create(
std::move(processor), resourceAttrs, std::move(sampler));
// Set as global provider
trace_api::Provider::SetTracerProvider(
opentelemetry::nostd::shared_ptr<trace_api::TracerProvider>(sdkProvider_));
JLOG(journal_.info()) << "Telemetry started successfully";
}
void
stop() override
{
JLOG(journal_.info()) << "Telemetry stopping";
if (sdkProvider_)
{
// Force flush before shutdown
sdkProvider_->ForceFlush();
sdkProvider_.reset();
trace_api::Provider::SetTracerProvider(
opentelemetry::nostd::shared_ptr<trace_api::TracerProvider>(
new trace_api::NoopTracerProvider()));
}
JLOG(journal_.info()) << "Telemetry stopped";
}
bool
isEnabled() const override
{
return true;
}
bool
shouldTraceTransactions() const override
{
return setup_.traceTransactions;
}
bool
shouldTraceConsensus() const override
{
return setup_.traceConsensus;
}
bool
shouldTraceRpc() const override
{
return setup_.traceRpc;
}
bool
shouldTracePeer() const override
{
return setup_.tracePeer;
}
opentelemetry::nostd::shared_ptr<trace_api::Tracer>
getTracer(std::string_view name) override
{
if (!sdkProvider_)
return trace_api::Provider::GetTracerProvider()->GetTracer(std::string(name));
return sdkProvider_->GetTracer(std::string(name));
}
opentelemetry::nostd::shared_ptr<trace_api::Span>
startSpan(std::string_view name, trace_api::SpanKind kind) override
{
auto tracer = getTracer("rippled");
trace_api::StartSpanOptions opts;
opts.kind = kind;
return tracer->StartSpan(std::string(name), opts);
}
opentelemetry::nostd::shared_ptr<trace_api::Span>
startSpan(
std::string_view name,
opentelemetry::context::Context const& parentContext,
trace_api::SpanKind kind) override
{
auto tracer = getTracer("rippled");
trace_api::StartSpanOptions opts;
opts.kind = kind;
opts.parent = parentContext;
return tracer->StartSpan(std::string(name), opts);
}
};
} // namespace
std::unique_ptr<Telemetry>
make_Telemetry(Telemetry::Setup const& setup, beast::Journal journal)
{
if (setup.enabled)
return std::make_unique<TelemetryImpl>(setup, journal);
return std::make_unique<NullTelemetryOtel>(setup);
}
} // namespace telemetry
} // namespace xrpl
#endif // XRPL_ENABLE_TELEMETRY

View File

@@ -0,0 +1,52 @@
/** Parser for the [telemetry] section of xrpld.cfg.
Reads configuration values from the config file and populates a
Telemetry::Setup struct. All options have sensible defaults so the
section can be minimal or omitted entirely.
See cfg/xrpld-example.cfg for the full list of available options.
*/
#include <xrpl/telemetry/Telemetry.h>
namespace xrpl {
namespace telemetry {
Telemetry::Setup
setup_Telemetry(
Section const& section,
std::string const& nodePublicKey,
std::string const& version)
{
Telemetry::Setup setup;
setup.enabled = section.value_or<int>("enabled", 0) != 0;
setup.serviceName = section.value_or<std::string>("service_name", "rippled");
setup.serviceVersion = version;
setup.serviceInstanceId = section.value_or<std::string>("service_instance_id", nodePublicKey);
setup.exporterType = section.value_or<std::string>("exporter", "otlp_http");
setup.exporterEndpoint =
section.value_or<std::string>("endpoint", "http://localhost:4318/v1/traces");
setup.useTls = section.value_or<int>("use_tls", 0) != 0;
setup.tlsCertPath = section.value_or<std::string>("tls_ca_cert", "");
setup.samplingRatio = section.value_or<double>("sampling_ratio", 1.0);
setup.batchSize = section.value_or<std::uint32_t>("batch_size", 512u);
setup.batchDelay =
std::chrono::milliseconds{section.value_or<std::uint32_t>("batch_delay_ms", 5000u)};
setup.maxQueueSize = section.value_or<std::uint32_t>("max_queue_size", 2048u);
setup.traceTransactions = section.value_or<int>("trace_transactions", 1) != 0;
setup.traceConsensus = section.value_or<int>("trace_consensus", 1) != 0;
setup.traceRpc = section.value_or<int>("trace_rpc", 1) != 0;
setup.tracePeer = section.value_or<int>("trace_peer", 0) != 0;
setup.traceLedger = section.value_or<int>("trace_ledger", 1) != 0;
return setup;
}
} // namespace telemetry
} // namespace xrpl

View File

@@ -31,7 +31,6 @@
#include <xrpld/shamap/NodeFamily.h>
#include <xrpl/basics/ByteUtilities.h>
#include <xrpl/basics/MallocTrim.h>
#include <xrpl/basics/ResolverAsio.h>
#include <xrpl/basics/random.h>
#include <xrpl/beast/asio/io_latency_probe.h>
@@ -52,6 +51,7 @@
#include <xrpl/resource/Fees.h>
#include <xrpl/server/LoadFeeTrack.h>
#include <xrpl/server/Wallet.h>
#include <xrpl/telemetry/Telemetry.h>
#include <xrpl/tx/apply.h>
#include <boost/algorithm/string/predicate.hpp>
@@ -147,6 +147,7 @@ public:
beast::Journal m_journal;
std::unique_ptr<perf::PerfLog> perfLog_;
std::unique_ptr<telemetry::Telemetry> telemetry_;
Application::MutexType m_masterMutex;
// Required by the SHAMapStore
@@ -258,6 +259,14 @@ public:
logs_->journal("PerfLog"),
[this] { signalStop("PerfLog"); }))
, telemetry_(
telemetry::make_Telemetry(
telemetry::setup_Telemetry(
config_->section("telemetry"),
"", // nodePublicKey not yet available at this point
BuildInfo::getVersionString()),
logs_->journal("Telemetry")))
, m_txMaster(*this)
, m_collectorManager(
@@ -618,6 +627,12 @@ public:
return *perfLog_;
}
telemetry::Telemetry&
getTelemetry() override
{
return *telemetry_;
}
NodeCache&
getTempNodeCache() override
{
@@ -1054,8 +1069,6 @@ public:
<< "; size after: " << cachedSLEs_.size();
}
mallocTrim("doSweep", m_journal);
// Set timer to do another sweep later.
setSweepTimer();
}
@@ -1472,6 +1485,7 @@ ApplicationImp::start(bool withTimers)
ledgerCleaner_->start();
perfLog_->start();
telemetry_->start();
}
void
@@ -1562,6 +1576,7 @@ ApplicationImp::run()
ledgerCleaner_->stop();
m_nodeStore->stop();
perfLog_->stop();
telemetry_->stop();
JLOG(m_journal.info()) << "Done.";
}