diff --git a/CMakeLists.txt b/CMakeLists.txt index 764e917498..953896012b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -126,6 +126,13 @@ if (rocksdb) target_link_libraries(xrpl_libs INTERFACE RocksDB::rocksdb) endif () +option(telemetry "Enable OpenTelemetry tracing" OFF) +if (telemetry) + find_package(opentelemetry-cpp CONFIG REQUIRED) + add_compile_definitions(XRPL_ENABLE_TELEMETRY) + message(STATUS "OpenTelemetry tracing enabled") +endif () + # Work around changes to Conan recipe for now. if (TARGET nudb::core) set(nudb nudb::core) diff --git a/OpenTelemetryPlan/POC_taskList.md b/OpenTelemetryPlan/POC_taskList.md new file mode 100644 index 0000000000..41f79ae097 --- /dev/null +++ b/OpenTelemetryPlan/POC_taskList.md @@ -0,0 +1,500 @@ +# OpenTelemetry POC Task List + +> **Goal**: Build a minimal end-to-end proof of concept that demonstrates distributed tracing in rippled. A successful POC will show RPC request traces flowing from rippled through an OTel Collector into Jaeger, viewable in a browser UI. +> +> **Scope**: RPC tracing only (highest value, lowest risk per the [CRAWL phase](./06-implementation-phases.md#6102-quick-wins-immediate-value) in the implementation phases). No cross-node P2P context propagation or consensus tracing in the POC. 
+ +### Related Plan Documents + +| Document | Relevance to POC | +|----------|-----------------| +| [00-tracing-fundamentals.md](./00-tracing-fundamentals.md) | Core concepts: traces, spans, context propagation, sampling | +| [01-architecture-analysis.md](./01-architecture-analysis.md) | RPC request flow (§1.5), key trace points (§1.6), instrumentation priority (§1.7) | +| [02-design-decisions.md](./02-design-decisions.md) | SDK selection (§2.1), exporter config (§2.2), span naming (§2.3), attribute schema (§2.4), coexistence with PerfLog/Insight (§2.6) | +| [03-implementation-strategy.md](./03-implementation-strategy.md) | Directory structure (§3.1), key principles (§3.2), performance overhead (§3.3-3.6), conditional compilation (§3.7.3), code intrusiveness (§3.9) | +| [04-code-samples.md](./04-code-samples.md) | Telemetry interface (§4.1), SpanGuard (§4.2), macros (§4.3), RPC instrumentation (§4.5.3) | +| [05-configuration-reference.md](./05-configuration-reference.md) | rippled config (§5.1), config parser (§5.2), Application integration (§5.3), CMake (§5.4), Collector config (§5.5), Docker Compose (§5.6), Grafana (§5.8) | +| [06-implementation-phases.md](./06-implementation-phases.md) | Phase 1 core tasks (§6.2), Phase 2 RPC tasks (§6.3), quick wins (§6.10), definition of done (§6.11) | +| [07-observability-backends.md](./07-observability-backends.md) | Jaeger dev setup (§7.1), Grafana dashboards (§7.6), alert rules (§7.6.3) | + +--- + +## Task 0: Docker Observability Stack Setup + +**Objective**: Stand up the backend infrastructure to receive, store, and display traces. + +**What to do**: +- Create `docker/telemetry/docker-compose.yml` in the repo with three services: + 1. **OpenTelemetry Collector** (`otel/opentelemetry-collector-contrib:latest`) + - Expose ports `4317` (OTLP gRPC) and `4318` (OTLP HTTP) + - Expose port `13133` (health check) + - Mount a config file `docker/telemetry/otel-collector-config.yaml` + 2. 
**Jaeger** (`jaegertracing/all-in-one:latest`) + - Expose port `16686` (UI) and `14250` (gRPC collector) + - Set env `COLLECTOR_OTLP_ENABLED=true` + 3. **Grafana** (`grafana/grafana:latest`) — optional but useful + - Expose port `3000` + - Enable anonymous admin access for local dev (`GF_AUTH_ANONYMOUS_ENABLED=true`, `GF_AUTH_ANONYMOUS_ORG_ROLE=Admin`) + - Provision Jaeger as a data source via `docker/telemetry/grafana/provisioning/datasources/jaeger.yaml` + +- Create `docker/telemetry/otel-collector-config.yaml`: + ```yaml + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + + processors: + batch: + timeout: 1s + send_batch_size: 100 + + exporters: + debug: + verbosity: detailed + otlp/jaeger: + endpoint: jaeger:4317 + tls: + insecure: true + + service: + pipelines: + traces: + receivers: [otlp] + processors: [batch] + exporters: [debug, otlp/jaeger] + ``` + +- Create Grafana Jaeger datasource provisioning file at `docker/telemetry/grafana/provisioning/datasources/jaeger.yaml`: + ```yaml + apiVersion: 1 + datasources: + - name: Jaeger + type: jaeger + access: proxy + url: http://jaeger:16686 + ``` + +**Verification**: Run `docker compose -f docker/telemetry/docker-compose.yml up -d`, then: +- `curl http://localhost:13133` returns healthy (Collector) +- `http://localhost:16686` opens Jaeger UI (no traces yet) +- `http://localhost:3000` opens Grafana (optional) + +**Reference**: +- [05-configuration-reference.md §5.5](./05-configuration-reference.md) — Collector config (dev YAML with Jaeger exporter) +- [05-configuration-reference.md §5.6](./05-configuration-reference.md) — Docker Compose development environment +- [07-observability-backends.md §7.1](./07-observability-backends.md) — Jaeger quick start and backend selection +- [05-configuration-reference.md §5.8](./05-configuration-reference.md) — Grafana datasource provisioning and dashboards + +--- + +## Task 1: Add OpenTelemetry C++ SDK Dependency + 
+**Objective**: Make `opentelemetry-cpp` available to the build system. + +**What to do**: +- Edit `conanfile.py` to add `opentelemetry-cpp` as an **optional** dependency. The gRPC otel plugin flag (`"grpc/*:otel_plugin": False`) in the existing conanfile may need to remain false — we pull the OTel SDK separately. + - Add a Conan option: `telemetry = [True, False]` defaulting to `False` + - When `telemetry` is `True`, add `opentelemetry-cpp` to `self.requires()` + - Required OTel Conan components: `opentelemetry-cpp` (which bundles api, sdk, and exporters). If the package isn't in Conan Center, consider using `FetchContent` in CMake or building from source as a fallback. +- Edit `CMakeLists.txt`: + - Add option: `option(telemetry "Enable OpenTelemetry tracing" OFF)` + - When ON, `find_package(opentelemetry-cpp CONFIG REQUIRED)` and add compile definition `XRPL_ENABLE_TELEMETRY` + - When OFF, do nothing (zero build impact) +- Verify the build succeeds with `-Dtelemetry=OFF` (no regressions) and with `-Dtelemetry=ON` (SDK links successfully). + +**Key files**: +- `conanfile.py` +- `CMakeLists.txt` + +**Reference**: +- [05-configuration-reference.md §5.4](./05-configuration-reference.md) — CMake integration, `FindOpenTelemetry.cmake`, `XRPL_ENABLE_TELEMETRY` option +- [03-implementation-strategy.md §3.2](./03-implementation-strategy.md) — Key principle: zero-cost when disabled via compile-time flags +- [02-design-decisions.md §2.1](./02-design-decisions.md) — SDK selection rationale and required OTel components + +--- + +## Task 2: Create Core Telemetry Interface and NullTelemetry + +**Objective**: Define the `Telemetry` abstract interface and a no-op implementation so the rest of the codebase can reference telemetry without hard-depending on the OTel SDK. 
+ +**What to do**: +- Create `include/xrpl/telemetry/Telemetry.h`: + - Define `namespace xrpl::telemetry` + - Define `struct Telemetry::Setup` holding: `enabled`, `exporterEndpoint`, `samplingRatio`, `serviceName`, `serviceVersion`, `serviceInstanceId`, `traceRpc`, `traceTransactions`, `traceConsensus`, `tracePeer` + - Define abstract `class Telemetry` with: + - `virtual void start() = 0;` + - `virtual void stop() = 0;` + - `virtual bool isEnabled() const = 0;` + - `virtual nostd::shared_ptr getTracer(string_view name = "rippled") = 0;` + - `virtual nostd::shared_ptr startSpan(string_view name, SpanKind kind = kInternal) = 0;` + - `virtual nostd::shared_ptr startSpan(string_view name, Context const& parentContext, SpanKind kind = kInternal) = 0;` + - `virtual bool shouldTraceRpc() const = 0;` + - `virtual bool shouldTraceTransactions() const = 0;` + - `virtual bool shouldTraceConsensus() const = 0;` + - Factory: `std::unique_ptr make_Telemetry(Setup const&, beast::Journal);` + - Config parser: `Telemetry::Setup setup_Telemetry(Section const&, std::string const& nodePublicKey, std::string const& version);` + +- Create `include/xrpl/telemetry/SpanGuard.h`: + - RAII guard that takes an `nostd::shared_ptr`, creates a `Scope`, and calls `span->End()` in destructor. + - Convenience: `setAttribute()`, `setOk()`, `setStatus()`, `addEvent()`, `recordException()`, `context()` + - See [04-code-samples.md](./04-code-samples.md) §4.2 for the full implementation. + +- Create `src/libxrpl/telemetry/NullTelemetry.cpp`: + - Implements `Telemetry` with all no-ops. + - `isEnabled()` returns `false`, `startSpan()` returns a noop span. + - This is used when `XRPL_ENABLE_TELEMETRY` is OFF or `enabled=0` in config. + +- Guard all OTel SDK headers behind `#ifdef XRPL_ENABLE_TELEMETRY`. The `NullTelemetry` implementation should compile without the OTel SDK present. 
+ +**Key new files**: +- `include/xrpl/telemetry/Telemetry.h` +- `include/xrpl/telemetry/SpanGuard.h` +- `src/libxrpl/telemetry/NullTelemetry.cpp` + +**Reference**: +- [04-code-samples.md §4.1](./04-code-samples.md) — Full `Telemetry` interface with `Setup` struct, lifecycle, tracer access, span creation, and component filtering methods +- [04-code-samples.md §4.2](./04-code-samples.md) — Full `SpanGuard` RAII implementation and `NullSpanGuard` no-op class +- [03-implementation-strategy.md §3.1](./03-implementation-strategy.md) — Directory structure: `include/xrpl/telemetry/` for headers, `src/libxrpl/telemetry/` for implementation +- [03-implementation-strategy.md §3.7.3](./03-implementation-strategy.md) — Conditional instrumentation and zero-cost compile-time disabled pattern + +--- + +## Task 3: Implement OTel-Backed Telemetry + +**Objective**: Implement the real `Telemetry` class that initializes the OTel SDK, configures the OTLP exporter and batch processor, and creates tracers/spans. + +**What to do**: +- Create `src/libxrpl/telemetry/Telemetry.cpp` (compiled only when `XRPL_ENABLE_TELEMETRY=ON`): + - `class TelemetryImpl : public Telemetry` that: + - In `start()`: creates a `TracerProvider` with: + - Resource attributes: `service.name`, `service.version`, `service.instance.id` + - An `OtlpGrpcExporter` pointed at `setup.exporterEndpoint` (default `localhost:4317`) + - A `BatchSpanProcessor` with configurable batch size and delay + - A `TraceIdRatioBasedSampler` using `setup.samplingRatio` + - Sets the global `TracerProvider` + - In `stop()`: calls `ForceFlush()` then shuts down the provider + - In `startSpan()`: delegates to `getTracer()->StartSpan(name, ...)` + - `shouldTraceRpc()` etc. 
read from `Setup` fields + +- Create `src/libxrpl/telemetry/TelemetryConfig.cpp`: + - `setup_Telemetry()` parses the `[telemetry]` config section from `xrpld.cfg` + - Maps config keys: `enabled`, `exporter`, `endpoint`, `sampling_ratio`, `trace_rpc`, `trace_transactions`, `trace_consensus`, `trace_peer` + +- Wire `make_Telemetry()` factory: + - If `setup.enabled` is true AND `XRPL_ENABLE_TELEMETRY` is defined: return `TelemetryImpl` + - Otherwise: return `NullTelemetry` + +- Add telemetry source files to CMake. When `XRPL_ENABLE_TELEMETRY=ON`, compile `Telemetry.cpp` and `TelemetryConfig.cpp` and link against `opentelemetry-cpp::api`, `opentelemetry-cpp::sdk`, `opentelemetry-cpp::otlp_grpc_exporter`. When OFF, compile only `NullTelemetry.cpp`. + +**Key new files**: +- `src/libxrpl/telemetry/Telemetry.cpp` +- `src/libxrpl/telemetry/TelemetryConfig.cpp` + +**Key modified files**: +- `CMakeLists.txt` (add telemetry library target) + +**Reference**: +- [04-code-samples.md §4.1](./04-code-samples.md) — `Telemetry` interface that `TelemetryImpl` must implement +- [05-configuration-reference.md §5.2](./05-configuration-reference.md) — `setup_Telemetry()` config parser implementation +- [02-design-decisions.md §2.2](./02-design-decisions.md) — OTLP/gRPC exporter config (endpoint, TLS options) +- [02-design-decisions.md §2.4.1](./02-design-decisions.md) — Resource attributes: `service.name`, `service.version`, `service.instance.id`, `xrpl.network.id` +- [03-implementation-strategy.md §3.4](./03-implementation-strategy.md) — Per-operation CPU costs and overhead budget for span creation +- [03-implementation-strategy.md §3.5](./03-implementation-strategy.md) — Memory overhead: static (~456 KB) and dynamic (~1.2 MB) budgets + +--- + +## Task 4: Integrate Telemetry into Application Lifecycle + +**Objective**: Wire the `Telemetry` object into `Application` so all components can access it. 
+ +**What to do**: +- Edit `src/xrpld/app/main/Application.h`: + - Forward-declare `namespace xrpl::telemetry { class Telemetry; }` + - Add pure virtual method: `virtual telemetry::Telemetry& getTelemetry() = 0;` + +- Edit `src/xrpld/app/main/Application.cpp` (the `ApplicationImp` class): + - Add member: `std::unique_ptr telemetry_;` + - In the constructor, after config is loaded and node identity is known: + ```cpp + auto const telemetrySection = config_->section("telemetry"); + auto telemetrySetup = telemetry::setup_Telemetry( + telemetrySection, + toBase58(TokenType::NodePublic, nodeIdentity_.publicKey()), + BuildInfo::getVersionString()); + telemetry_ = telemetry::make_Telemetry(telemetrySetup, logs_->journal("Telemetry")); + ``` + - In `start()`: call `telemetry_->start()` early + - In `stop()` or destructor: call `telemetry_->stop()` late (to flush pending spans) + - Implement `getTelemetry()` override: return `*telemetry_` + +- Add `[telemetry]` section to the example config `cfg/rippled-example.cfg`: + ```ini + # [telemetry] + # enabled=1 + # endpoint=localhost:4317 + # sampling_ratio=1.0 + # trace_rpc=1 + ``` + +**Key modified files**: +- `src/xrpld/app/main/Application.h` +- `src/xrpld/app/main/Application.cpp` +- `cfg/rippled-example.cfg` (or equivalent example config) + +**Reference**: +- [05-configuration-reference.md §5.3](./05-configuration-reference.md) — `ApplicationImp` changes: member declaration, constructor init, `start()`/`stop()` wiring, `getTelemetry()` override +- [05-configuration-reference.md §5.1](./05-configuration-reference.md) — `[telemetry]` config section format and all option defaults +- [03-implementation-strategy.md §3.9.2](./03-implementation-strategy.md) — File impact assessment: `Application.cpp` ~15 lines added, ~3 changed (Low risk) + +--- + +## Task 5: Create Instrumentation Macros + +**Objective**: Define convenience macros that make instrumenting code one-liners, and that compile to zero-cost no-ops when telemetry is 
disabled. + +**What to do**: +- Create `src/xrpld/telemetry/TracingInstrumentation.h`: + - When `XRPL_ENABLE_TELEMETRY` is defined: + ```cpp + #define XRPL_TRACE_SPAN(telemetry, name) \ + auto _xrpl_span_ = (telemetry).startSpan(name); \ + ::xrpl::telemetry::SpanGuard _xrpl_guard_(_xrpl_span_) + + #define XRPL_TRACE_RPC(telemetry, name) \ + std::optional<::xrpl::telemetry::SpanGuard> _xrpl_guard_; \ + if ((telemetry).shouldTraceRpc()) { \ + _xrpl_guard_.emplace((telemetry).startSpan(name)); \ + } + + #define XRPL_TRACE_SET_ATTR(key, value) \ + if (_xrpl_guard_.has_value()) { \ + _xrpl_guard_->setAttribute(key, value); \ + } + + #define XRPL_TRACE_EXCEPTION(e) \ + if (_xrpl_guard_.has_value()) { \ + _xrpl_guard_->recordException(e); \ + } + ``` + - When `XRPL_ENABLE_TELEMETRY` is NOT defined, all macros expand to `((void)0)` + +**Key new file**: +- `src/xrpld/telemetry/TracingInstrumentation.h` + +**Reference**: +- [04-code-samples.md §4.3](./04-code-samples.md) — Full macro definitions for `XRPL_TRACE_SPAN`, `XRPL_TRACE_RPC`, `XRPL_TRACE_CONSENSUS`, `XRPL_TRACE_SET_ATTR`, `XRPL_TRACE_EXCEPTION` with both enabled and disabled branches +- [03-implementation-strategy.md §3.7.3](./03-implementation-strategy.md) — Conditional instrumentation pattern: compile-time `#ifndef` and runtime `shouldTrace*()` checks +- [03-implementation-strategy.md §3.9.7](./03-implementation-strategy.md) — Before/after code examples showing minimal intrusiveness (~1-3 lines per instrumentation point) + +--- + +## Task 6: Instrument RPC ServerHandler + +**Objective**: Add tracing to the HTTP RPC entry point so every incoming RPC request creates a span. 
+ +**What to do**: +- Edit `src/xrpld/rpc/detail/ServerHandler.cpp`: + - `#include` the `TracingInstrumentation.h` header + - In `ServerHandler::onRequest(Session& session)`: + - At the top of the method, add: `XRPL_TRACE_RPC(app_.getTelemetry(), "rpc.request");` + - After the RPC command name is extracted, set attribute: `XRPL_TRACE_SET_ATTR("xrpl.rpc.command", command);` + - After the response status is known, set: `XRPL_TRACE_SET_ATTR("http.status_code", static_cast(statusCode));` + - Wrap error paths with: `XRPL_TRACE_EXCEPTION(e);` + - In `ServerHandler::processRequest(...)`: + - Add a child span: `XRPL_TRACE_RPC(app_.getTelemetry(), "rpc.process");` + - Set method attribute: `XRPL_TRACE_SET_ATTR("xrpl.rpc.method", request_method);` + - In `ServerHandler::onWSMessage(...)` (WebSocket path): + - Add: `XRPL_TRACE_RPC(app_.getTelemetry(), "rpc.ws.message");` + +- The goal is to see spans like: + ``` + rpc.request + └── rpc.process + ``` + in Jaeger for every HTTP RPC call. + +**Key modified file**: +- `src/xrpld/rpc/detail/ServerHandler.cpp` (~15-25 lines added) + +**Reference**: +- [04-code-samples.md §4.5.3](./04-code-samples.md) — Complete `ServerHandler::onRequest()` instrumented code sample with W3C header extraction, span creation, attribute setting, and error handling +- [01-architecture-analysis.md §1.5](./01-architecture-analysis.md) — RPC request flow diagram: HTTP request -> attributes -> jobqueue.enqueue -> rpc.command -> response +- [01-architecture-analysis.md §1.6](./01-architecture-analysis.md) — Key trace points table: `rpc.request` in `ServerHandler.cpp::onRequest()` (Priority: High) +- [02-design-decisions.md §2.3](./02-design-decisions.md) — Span naming convention: `rpc.request`, `rpc.command.*` +- [02-design-decisions.md §2.4.2](./02-design-decisions.md) — RPC span attributes: `xrpl.rpc.command`, `xrpl.rpc.version`, `xrpl.rpc.role`, `xrpl.rpc.params` +- [03-implementation-strategy.md §3.9.2](./03-implementation-strategy.md) — File impact: 
`ServerHandler.cpp` ~40 lines added, ~10 changed (Low risk) + +--- + +## Task 7: Instrument RPC Command Execution + +**Objective**: Add per-command tracing inside the RPC handler so each command (e.g., `submit`, `account_info`, `server_info`) gets its own child span. + +**What to do**: +- Edit `src/xrpld/rpc/detail/RPCHandler.cpp`: + - `#include` the `TracingInstrumentation.h` header + - In `doCommand(RPC::JsonContext& context, Json::Value& result)`: + - At the top: `XRPL_TRACE_RPC(context.app.getTelemetry(), "rpc.command." + context.method);` + - Set attributes: + - `XRPL_TRACE_SET_ATTR("xrpl.rpc.command", context.method);` + - `XRPL_TRACE_SET_ATTR("xrpl.rpc.version", static_cast(context.apiVersion));` + - `XRPL_TRACE_SET_ATTR("xrpl.rpc.role", (context.role == Role::ADMIN) ? "admin" : "user");` + - On success: `XRPL_TRACE_SET_ATTR("xrpl.rpc.status", "success");` + - On error: `XRPL_TRACE_SET_ATTR("xrpl.rpc.status", "error");` and set the error message + +- After this, traces in Jaeger should look like: + ``` + rpc.request (xrpl.rpc.command=account_info) + └── rpc.process + └── rpc.command.account_info (xrpl.rpc.version=2, xrpl.rpc.role=user, xrpl.rpc.status=success) + ``` + +**Key modified file**: +- `src/xrpld/rpc/detail/RPCHandler.cpp` (~15-20 lines added) + +**Reference**: +- [04-code-samples.md §4.5.3](./04-code-samples.md) — `ServerHandler::onRequest()` code sample (includes child span pattern for `rpc.command.*`) +- [02-design-decisions.md §2.3](./02-design-decisions.md) — Span naming: `rpc.command.*` pattern with dynamic command name (e.g., `rpc.command.server_info`) +- [02-design-decisions.md §2.4.2](./02-design-decisions.md) — RPC attribute schema: `xrpl.rpc.command`, `xrpl.rpc.version`, `xrpl.rpc.role`, `xrpl.rpc.status` +- [01-architecture-analysis.md §1.6](./01-architecture-analysis.md) — Key trace points table: `rpc.command.*` in `RPCHandler.cpp::doCommand()` (Priority: High) +- [02-design-decisions.md §2.6.5](./02-design-decisions.md) — Correlation 
with PerfLog: how `doCommand()` can link trace_id with existing PerfLog entries +- [03-implementation-strategy.md §3.4.4](./03-implementation-strategy.md) — RPC request overhead budget: ~1.75 μs total per request + +--- + +## Task 8: Build, Run, and Verify End-to-End + +**Objective**: Prove the full pipeline works: rippled emits traces -> OTel Collector receives them -> Jaeger displays them. + +**What to do**: + +1. **Start the Docker stack**: + ```bash + docker compose -f docker/telemetry/docker-compose.yml up -d + ``` + Verify Collector health: `curl http://localhost:13133` + +2. **Build rippled with telemetry**: + ```bash + # Adjust for your actual build workflow + conan install . --build=missing -o telemetry=True + cmake --preset default -Dtelemetry=ON + cmake --build --preset default + ``` + +3. **Configure rippled**: + Add to `rippled.cfg` (or your local test config): + ```ini + [telemetry] + enabled=1 + endpoint=localhost:4317 + sampling_ratio=1.0 + trace_rpc=1 + ``` + +4. **Start rippled** in standalone mode: + ```bash + ./rippled --conf rippled.cfg -a --start + ``` + +5. **Generate RPC traffic**: + ```bash + # server_info + curl -s -X POST http://localhost:5005 \ + -H "Content-Type: application/json" \ + -d '{"method":"server_info","params":[{}]}' + + # ledger + curl -s -X POST http://localhost:5005 \ + -H "Content-Type: application/json" \ + -d '{"method":"ledger","params":[{"ledger_index":"current"}]}' + + # account_info (will error in standalone, that's fine — we trace errors too) + curl -s -X POST http://localhost:5005 \ + -H "Content-Type: application/json" \ + -d '{"method":"account_info","params":[{"account":"rHb9CJAWyB4rj91VRWn96DkukG4bwdtyTh"}]}' + ``` + +6. 
**Verify in Jaeger**: + - Open `http://localhost:16686` + - Select service `rippled` from the dropdown + - Click "Find Traces" + - Confirm you see traces with spans: `rpc.request` -> `rpc.process` -> `rpc.command.server_info` + - Click into a trace and verify attributes: `xrpl.rpc.command`, `xrpl.rpc.status`, `xrpl.rpc.version` + +7. **Verify zero-overhead when disabled**: + - Rebuild with `-Dtelemetry=OFF`, or set `enabled=0` in config + - Run the same RPC calls + - Confirm no new traces appear and no errors in rippled logs + +**Verification Checklist**: +- [ ] Docker stack starts without errors +- [ ] rippled builds with `-Dtelemetry=ON` +- [ ] rippled starts and connects to OTel Collector (check rippled logs for telemetry messages) +- [ ] Traces appear in Jaeger UI under service "rippled" +- [ ] Span hierarchy is correct (parent-child relationships) +- [ ] Span attributes are populated (`xrpl.rpc.command`, `xrpl.rpc.status`, etc.) +- [ ] Error spans show error status and message +- [ ] Building with `-Dtelemetry=OFF` produces no regressions +- [ ] Setting `enabled=0` at runtime produces no traces and no errors + +**Reference**: +- [06-implementation-phases.md §6.11.1](./06-implementation-phases.md) — Phase 1 definition of done: SDK compiles, runtime toggle works, span creation verified in Jaeger, config validation passes +- [06-implementation-phases.md §6.11.2](./06-implementation-phases.md) — Phase 2 definition of done: 100% RPC coverage, traceparent propagation, <1ms p99 overhead, dashboard deployed +- [06-implementation-phases.md §6.8](./06-implementation-phases.md) — Success metrics: trace coverage >95%, CPU overhead <3%, memory <5 MB, latency impact <2% +- [03-implementation-strategy.md §3.9.5](./03-implementation-strategy.md) — Backward compatibility: config optional, protocol unchanged, `XRPL_ENABLE_TELEMETRY=OFF` produces identical binary +- [01-architecture-analysis.md §1.8](./01-architecture-analysis.md) — Observable 
outcomes: what traces, metrics, and dashboards to expect + +--- + +## Task 9: Document POC Results and Next Steps + +**Objective**: Capture findings, screenshots, and remaining work for the team. + +**What to do**: +- Take screenshots of Jaeger showing: + - The service list with "rippled" + - A trace with the full span tree + - Span detail view showing attributes +- Document any issues encountered (build issues, SDK quirks, missing attributes) +- Note performance observations (build time impact, any noticeable runtime overhead) +- Write a short summary of what the POC proves and what it doesn't cover yet: + - **Proves**: OTel SDK integrates with rippled, OTLP export works, RPC traces visible + - **Doesn't cover**: Cross-node P2P context propagation, consensus tracing, protobuf trace context, W3C traceparent header extraction, tail-based sampling, production deployment +- Outline next steps (mapping to the full plan phases): + - [Phase 2](./06-implementation-phases.md) completion: [W3C header extraction](./02-design-decisions.md) (§2.5), WebSocket tracing, all [RPC handlers](./01-architecture-analysis.md) (§1.6) + - [Phase 3](./06-implementation-phases.md): [Protobuf `TraceContext` message](./04-code-samples.md) (§4.4), [transaction relay tracing](./04-code-samples.md) (§4.5.1) across nodes + - [Phase 4](./06-implementation-phases.md): [Consensus round and phase tracing](./04-code-samples.md) (§4.5.2) + - [Phase 5](./06-implementation-phases.md): [Production collector config](./05-configuration-reference.md) (§5.5.2), [Grafana dashboards](./07-observability-backends.md) (§7.6), [alerting](./07-observability-backends.md) (§7.6.3) + +**Reference**: +- [06-implementation-phases.md §6.1](./06-implementation-phases.md) — Full 5-phase timeline overview and Gantt chart +- [06-implementation-phases.md §6.10](./06-implementation-phases.md) — Crawl-Walk-Run strategy: POC is the CRAWL phase, next steps are WALK and RUN +- [06-implementation-phases.md 
§6.12](./06-implementation-phases.md) — Recommended implementation order (14 steps across 9 weeks) +- [03-implementation-strategy.md §3.9](./03-implementation-strategy.md) — Code intrusiveness assessment and risk matrix for each remaining component +- [07-observability-backends.md §7.2](./07-observability-backends.md) — Production backend selection (Tempo, Elastic APM, Honeycomb, Datadog) +- [02-design-decisions.md §2.5](./02-design-decisions.md) — Context propagation design: W3C HTTP headers, protobuf P2P, JobQueue internal +- [00-tracing-fundamentals.md](./00-tracing-fundamentals.md) — Reference for team onboarding on distributed tracing concepts + +--- + +## Summary + +| Task | Description | New Files | Modified Files | Depends On | +|------|--------------------------------------|-----------|----------------|------------| +| 0 | Docker observability stack | 4 | 0 | — | +| 1 | OTel C++ SDK dependency | 0 | 2 | — | +| 2 | Core Telemetry interface + NullImpl | 3 | 0 | 1 | +| 3 | OTel-backed Telemetry implementation | 2 | 1 | 1, 2 | +| 4 | Application lifecycle integration | 0 | 3 | 2, 3 | +| 5 | Instrumentation macros | 1 | 0 | 2 | +| 6 | Instrument RPC ServerHandler | 0 | 1 | 4, 5 | +| 7 | Instrument RPC command execution | 0 | 1 | 4, 5 | +| 8 | End-to-end verification | 0 | 0 | 0-7 | +| 9 | Document results and next steps | 1 | 0 | 8 | + +**Parallel work**: Tasks 0 and 1 can run in parallel. Tasks 2 and 5 have no dependency on each other. Tasks 6 and 7 can be done in parallel once Tasks 4 and 5 are complete. diff --git a/cfg/xrpld-example.cfg b/cfg/xrpld-example.cfg index 995d4e65ff..4287e48968 100644 --- a/cfg/xrpld-example.cfg +++ b/cfg/xrpld-example.cfg @@ -1433,14 +1433,17 @@ # To accept connections on well known ports such as 80 (HTTP) or # 443 (HTTPS), most operating systems will require xrpld to # run with administrator privileges, or else xrpld will not start. 
+[network_id] +0 + +[logging] +level=debug [server] port_rpc_admin_local -port_peer port_ws_admin_local -#port_ws_public -#ssl_key = /etc/ssl/private/server.key -#ssl_cert = /etc/ssl/certs/server.crt +port_ws_public +port_peer [port_rpc_admin_local] port = 5005 @@ -1449,13 +1452,8 @@ admin = 127.0.0.1 protocol = http [port_peer] -# Many servers still use the legacy port of 51235, so for backward-compatibility -# we maintain that port number here. However, for new servers we recommend -# changing this to the default port of 2459. port = 51235 -ip = 0.0.0.0 -# alternatively, to accept connections on IPv4 + IPv6, use: -#ip = :: +ip = 127.0.0.1 protocol = peer [port_ws_admin_local] @@ -1465,67 +1463,102 @@ admin = 127.0.0.1 protocol = ws send_queue_limit = 500 +[port_ws_public] +port = 6005 +ip = 127.0.0.1 +protocol = ws +send_queue_limit = 500 + [port_grpc] port = 50051 ip = 127.0.0.1 secure_gateway = 127.0.0.1 -#[port_ws_public] -#port = 6005 -#ip = 127.0.0.1 -#protocol = wss -#send_queue_limit = 500 +[validator_token] +eyJtYW5pZmVzdCI6IkpBQUFBQUZ4SWUxdFQrbVpjK2g0akY3bFpXaWdHRVZMOThmRlJ1VnJz +K1hTa0ZOQ2dSV0daSE1oQW91ZnZrMUFtSjU3ZUQ5dUdIRmZjS3Qrd1FFUXRyUDJzcnBOTDR4 +MnJ3RjBka1l3UkFJZ1JOLzlndUZnUXh0UWRwYjNiSlFvQWk1VHY2WFRhRm5vd1ZKb2lvalJh +eklDSUJ3S1FQRXE4UjB4anFWUnIxOTZXeFlxcm8wZWoyY2JSM0lqdGtnT2ZCMmFjQkpBdGpU +Rkpzb2VRZU05ZlZjcnNBOGMxVVRzVWQxK0RJbXphck5JVFhIdHF2WkRhZWh6UHBqSTF1NjBG +MEpiVnAvWGtiSXovNVFyNWFnek9xTkFOa1RXQWc9PSIsInZhbGlkYXRpb25fc2VjcmV0X2tl +eSI6IjNBNURCMzczMTMxQ0E5QThGMjJDQzkxRTBGOUY2NEM2MjBGQzBCRUE2MUM1OEZFRkRC +NThDRjIyMzc5ODg3RkIifQ== -#------------------------------------------------------------------------------- +[validation_quorum] +quorum = 3 + +[node_size] +medium + +# [node_db] +# type=RocksDB +# path=/var/lib/rippled/db -# This is primary persistent datastore for xrpld. This includes transaction -# metadata, account states, and ledger headers. 
Helpful information can be -# found at https://xrpl.org/capacity-planning.html#node-db-type -# type=NuDB is recommended for non-validators with fast SSDs. Validators or -# slow / spinning disks should use RocksDB. Caution: Spinning disks are -# not recommended. They do not perform well enough to consistently remain -# synced to the network. -# online_delete=512 is recommended to delete old ledgers while maintaining at -# least 512. -# advisory_delete=0 allows the online delete process to run automatically -# when the node has approximately two times the "online_delete" value of -# ledgers. No external administrative command is required to initiate -# deletion. [node_db] type=NuDB -path=/var/lib/xrpld/db/nudb -nudb_block_size=4096 +path=~/data/livenet/db/nudb online_delete=512 advisory_delete=0 [database_path] -/var/lib/xrpld/db +~/data/livenet/db - -# This needs to be an absolute directory reference, not a relative one. -# Modify this value as required. [debug_logfile] -/var/log/xrpld/debug.log +~/data/livenet/debug.log -# To use the XRP test network -# (see https://xrpl.org/connect-your-rippled-to-the-xrp-test-net.html), -# use the following [ips] section: -# [ips] -# r.altnet.rippletest.net 51235 - -# File containing trusted validator keys or validator list publishers. -# Unless an absolute path is specified, it will be considered relative to the -# folder in which the xrpld.cfg file is located. [validators_file] validators.txt -# Turn down default logging to save disk space in the long run. -# Valid values here are trace, debug, info, warning, error, and fatal [rpc_startup] -{ "command": "log_level", "severity": "warning" } +{ "command": "log_level", "severity": "info" } -# If ssl_verify is 1, certificates will be validated. -# To allow the use of self-signed certificates for development or internal use, -# set to ssl_verify to 0. 
[ssl_verify] -1 +0 + +[shard_db] +path=~/data/livenet/db/nudb +max_historical_shards=12 + +#------------------------------------------------------------------------------- +# +# 11. Telemetry (OpenTelemetry Tracing) +# +#------------------------------------------------------------------------------- +# +# Enables distributed tracing via OpenTelemetry. Requires building with +# -Dtelemetry=ON (or the Conan option -o telemetry=True). +# +# [telemetry] +# +# enabled=0 +# +# Enable or disable telemetry at runtime. Default: 0 (disabled). +# +# endpoint=http://localhost:4318/v1/traces +# +# The OpenTelemetry Collector endpoint (OTLP/HTTP). Default: http://localhost:4318/v1/traces. +# +# exporter=otlp_http +# +# Exporter type: otlp_http. Default: otlp_http. +# +# sampling_ratio=1.0 +# +# Fraction of traces to sample (0.0 to 1.0). Default: 1.0 (all traces). +# +# trace_rpc=1 +# +# Enable RPC request tracing. Default: 1. +# +# trace_transactions=1 +# +# Enable transaction lifecycle tracing. Default: 1. +# +# trace_consensus=1 +# +# Enable consensus round tracing. Default: 1. +# +# trace_peer=0 +# +# Enable peer message tracing (high volume). Default: 0. +# diff --git a/cmake/XrplCore.cmake b/cmake/XrplCore.cmake index e54dd09953..9854ea8cb5 100644 --- a/cmake/XrplCore.cmake +++ b/cmake/XrplCore.cmake @@ -119,6 +119,15 @@ target_link_libraries( add_module(xrpl tx) target_link_libraries(xrpl.libxrpl.tx PUBLIC xrpl.libxrpl.ledger) +# Telemetry module +add_module(xrpl telemetry) +target_link_libraries(xrpl.libxrpl.telemetry PUBLIC xrpl.libxrpl.basics xrpl.libxrpl.beast) +if (telemetry) + target_link_libraries( + xrpl.libxrpl.telemetry + PUBLIC opentelemetry-cpp::opentelemetry-cpp) +endif () + add_library(xrpl.libxrpl) set_target_properties(xrpl.libxrpl PROPERTIES OUTPUT_NAME xrpl) @@ -144,6 +153,7 @@ target_link_modules( resource server shamap + telemetry tx) # All headers in libxrpl are in modules. 
diff --git a/cmake/XrplInstall.cmake b/cmake/XrplInstall.cmake index 4cbf381f87..e38c2d7b46 100644 --- a/cmake/XrplInstall.cmake +++ b/cmake/XrplInstall.cmake @@ -32,6 +32,7 @@ install(TARGETS common xrpl.libxrpl.resource xrpl.libxrpl.server xrpl.libxrpl.shamap + xrpl.libxrpl.telemetry xrpl.libxrpl.tx antithesis-sdk-cpp EXPORT XrplExports diff --git a/conanfile.py b/conanfile.py index 7300b537d9..ef4b883a49 100644 --- a/conanfile.py +++ b/conanfile.py @@ -22,6 +22,7 @@ class Xrpl(ConanFile): "rocksdb": [True, False], "shared": [True, False], "static": [True, False], + "telemetry": [True, False], "tests": [True, False], "unity": [True, False], "xrpld": [True, False], @@ -54,6 +55,7 @@ class Xrpl(ConanFile): "rocksdb": True, "shared": False, "static": True, + "telemetry": False, "tests": False, "unity": False, "xrpld": False, @@ -140,6 +142,8 @@ class Xrpl(ConanFile): self.requires("jemalloc/5.3.0") if self.options.rocksdb: self.requires("rocksdb/10.5.1") + if self.options.telemetry: + self.requires("opentelemetry-cpp/1.18.0") self.requires("xxhash/0.8.3", **transitive_headers_opt) exports_sources = ( @@ -168,6 +172,7 @@ class Xrpl(ConanFile): tc.variables["rocksdb"] = self.options.rocksdb tc.variables["BUILD_SHARED_LIBS"] = self.options.shared tc.variables["static"] = self.options.static + tc.variables["telemetry"] = self.options.telemetry tc.variables["unity"] = self.options.unity tc.variables["xrpld"] = self.options.xrpld tc.generate() @@ -220,3 +225,5 @@ class Xrpl(ConanFile): ] if self.options.rocksdb: libxrpl.requires.append("rocksdb::librocksdb") + if self.options.telemetry: + libxrpl.requires.append("opentelemetry-cpp::opentelemetry-cpp") diff --git a/docker/telemetry/docker-compose.yml b/docker/telemetry/docker-compose.yml new file mode 100644 index 0000000000..ccfab652c4 --- /dev/null +++ b/docker/telemetry/docker-compose.yml @@ -0,0 +1,44 @@ +version: "3.8" + +services: + otel-collector: + image: otel/opentelemetry-collector-contrib:latest + command: 
["--config=/etc/otel-collector-config.yaml"] + ports: + - "4317:4317" # OTLP gRPC + - "4318:4318" # OTLP HTTP + - "13133:13133" # Health check + volumes: + - ./otel-collector-config.yaml:/etc/otel-collector-config.yaml:ro + depends_on: + - jaeger + networks: + - rippled-telemetry + + jaeger: + image: jaegertracing/all-in-one:latest + environment: + - COLLECTOR_OTLP_ENABLED=true + ports: + - "16686:16686" # Jaeger UI + - "14250:14250" # gRPC + networks: + - rippled-telemetry + + grafana: + image: grafana/grafana:latest + environment: + - GF_AUTH_ANONYMOUS_ENABLED=true + - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin + ports: + - "3000:3000" + volumes: + - ./grafana/provisioning:/etc/grafana/provisioning:ro + depends_on: + - jaeger + networks: + - rippled-telemetry + +networks: + rippled-telemetry: + driver: bridge diff --git a/docker/telemetry/grafana/provisioning/datasources/jaeger.yaml b/docker/telemetry/grafana/provisioning/datasources/jaeger.yaml new file mode 100644 index 0000000000..ca3a4319bd --- /dev/null +++ b/docker/telemetry/grafana/provisioning/datasources/jaeger.yaml @@ -0,0 +1,7 @@ +apiVersion: 1 + +datasources: + - name: Jaeger + type: jaeger + access: proxy + url: http://jaeger:16686 diff --git a/docker/telemetry/otel-collector-config.yaml b/docker/telemetry/otel-collector-config.yaml new file mode 100644 index 0000000000..4d27ecb913 --- /dev/null +++ b/docker/telemetry/otel-collector-config.yaml @@ -0,0 +1,27 @@ +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +processors: + batch: + timeout: 1s + send_batch_size: 100 + +exporters: + debug: + verbosity: detailed + otlp/jaeger: + endpoint: jaeger:4317 + tls: + insecure: true + +service: + pipelines: + traces: + receivers: [otlp] + processors: [batch] + exporters: [debug, otlp/jaeger] diff --git a/include/xrpl/core/ServiceRegistry.h b/include/xrpl/core/ServiceRegistry.h index 1a2e33d5ee..200b1bf59c 100644 --- a/include/xrpl/core/ServiceRegistry.h +++ 
b/include/xrpl/core/ServiceRegistry.h @@ -19,6 +19,9 @@ class Manager; namespace perf { class PerfLog; } +namespace telemetry { +class Telemetry; +} // This is temporary until we migrate all code to use ServiceRegistry. class Application; @@ -205,6 +208,9 @@ public: virtual perf::PerfLog& getPerfLog() = 0; + virtual telemetry::Telemetry& + getTelemetry() = 0; + // Configuration and state virtual bool isStopping() const = 0; diff --git a/include/xrpl/telemetry/SpanGuard.h b/include/xrpl/telemetry/SpanGuard.h new file mode 100644 index 0000000000..79564881a5 --- /dev/null +++ b/include/xrpl/telemetry/SpanGuard.h @@ -0,0 +1,105 @@ +#pragma once + +#ifdef XRPL_ENABLE_TELEMETRY + +#include +#include +#include +#include + +#include +#include + +namespace xrpl { +namespace telemetry { + +class SpanGuard +{ + opentelemetry::nostd::shared_ptr span_; + opentelemetry::trace::Scope scope_; + +public: + explicit SpanGuard( + opentelemetry::nostd::shared_ptr span) + : span_(std::move(span)), scope_(span_) + { + } + + SpanGuard(SpanGuard const&) = delete; + SpanGuard& operator=(SpanGuard const&) = delete; + SpanGuard(SpanGuard&& other) noexcept + : span_(std::move(other.span_)), scope_(span_) + { + other.span_ = nullptr; + } + SpanGuard& operator=(SpanGuard&&) = delete; + + ~SpanGuard() + { + if (span_) + span_->End(); + } + + opentelemetry::trace::Span& + span() + { + return *span_; + } + + opentelemetry::trace::Span const& + span() const + { + return *span_; + } + + void + setOk() + { + span_->SetStatus(opentelemetry::trace::StatusCode::kOk); + } + + void + setStatus( + opentelemetry::trace::StatusCode code, + std::string_view description = "") + { + span_->SetStatus(code, std::string(description)); + } + + template + void + setAttribute(std::string_view key, T&& value) + { + span_->SetAttribute( + opentelemetry::nostd::string_view(key.data(), key.size()), + std::forward(value)); + } + + void + addEvent(std::string_view name) + { + span_->AddEvent(std::string(name)); + } + + 
void + recordException(std::exception const& e) + { + span_->AddEvent("exception", { + {"exception.type", "std::exception"}, + {"exception.message", std::string(e.what())} + }); + span_->SetStatus( + opentelemetry::trace::StatusCode::kError, e.what()); + } + + opentelemetry::context::Context + context() const + { + return opentelemetry::context::RuntimeContext::GetCurrent(); + } +}; + +} // namespace telemetry +} // namespace xrpl + +#endif // XRPL_ENABLE_TELEMETRY diff --git a/include/xrpl/telemetry/Telemetry.h b/include/xrpl/telemetry/Telemetry.h new file mode 100644 index 0000000000..5710f9a698 --- /dev/null +++ b/include/xrpl/telemetry/Telemetry.h @@ -0,0 +1,92 @@ +#pragma once + +#include +#include + +#include +#include +#include +#include + +#ifdef XRPL_ENABLE_TELEMETRY +#include +#include +#include +#include +#endif + +namespace xrpl { +namespace telemetry { + +class Telemetry +{ +public: + struct Setup + { + bool enabled = false; + std::string serviceName = "rippled"; + std::string serviceVersion; + std::string serviceInstanceId; + + std::string exporterType = "otlp_http"; + std::string exporterEndpoint = "http://localhost:4318/v1/traces"; + bool useTls = false; + std::string tlsCertPath; + + double samplingRatio = 1.0; + + std::uint32_t batchSize = 512; + std::chrono::milliseconds batchDelay{5000}; + std::uint32_t maxQueueSize = 2048; + + std::uint32_t networkId = 0; + std::string networkType = "mainnet"; + + bool traceTransactions = true; + bool traceConsensus = true; + bool traceRpc = true; + bool tracePeer = false; + bool traceLedger = true; + }; + + virtual ~Telemetry() = default; + + virtual void start() = 0; + virtual void stop() = 0; + virtual bool isEnabled() const = 0; + + virtual bool shouldTraceTransactions() const = 0; + virtual bool shouldTraceConsensus() const = 0; + virtual bool shouldTraceRpc() const = 0; + virtual bool shouldTracePeer() const = 0; + +#ifdef XRPL_ENABLE_TELEMETRY + virtual opentelemetry::nostd::shared_ptr + 
getTracer(std::string_view name = "rippled") = 0; + + virtual opentelemetry::nostd::shared_ptr + startSpan( + std::string_view name, + opentelemetry::trace::SpanKind kind = + opentelemetry::trace::SpanKind::kInternal) = 0; + + virtual opentelemetry::nostd::shared_ptr + startSpan( + std::string_view name, + opentelemetry::context::Context const& parentContext, + opentelemetry::trace::SpanKind kind = + opentelemetry::trace::SpanKind::kInternal) = 0; +#endif +}; + +std::unique_ptr +make_Telemetry(Telemetry::Setup const& setup, beast::Journal journal); + +Telemetry::Setup +setup_Telemetry( + Section const& section, + std::string const& nodePublicKey, + std::string const& version); + +} // namespace telemetry +} // namespace xrpl diff --git a/src/libxrpl/telemetry/NullTelemetry.cpp b/src/libxrpl/telemetry/NullTelemetry.cpp new file mode 100644 index 0000000000..5062b9da0f --- /dev/null +++ b/src/libxrpl/telemetry/NullTelemetry.cpp @@ -0,0 +1,103 @@ +#include + +#ifdef XRPL_ENABLE_TELEMETRY +#include +#endif + +namespace xrpl { +namespace telemetry { + +namespace { + +class NullTelemetry : public Telemetry +{ + Setup setup_; + +public: + explicit NullTelemetry(Setup const& setup) : setup_(setup) + { + } + + void + start() override + { + } + + void + stop() override + { + } + + bool + isEnabled() const override + { + return false; + } + + bool + shouldTraceTransactions() const override + { + return false; + } + + bool + shouldTraceConsensus() const override + { + return false; + } + + bool + shouldTraceRpc() const override + { + return false; + } + + bool + shouldTracePeer() const override + { + return false; + } + +#ifdef XRPL_ENABLE_TELEMETRY + opentelemetry::nostd::shared_ptr + getTracer(std::string_view) override + { + static auto noopTracer = + opentelemetry::nostd::shared_ptr( + new opentelemetry::trace::NoopTracer()); + return noopTracer; + } + + opentelemetry::nostd::shared_ptr + startSpan(std::string_view, opentelemetry::trace::SpanKind) override + { + return 
opentelemetry::nostd::shared_ptr( + new opentelemetry::trace::NoopSpan(nullptr)); + } + + opentelemetry::nostd::shared_ptr + startSpan( + std::string_view, + opentelemetry::context::Context const&, + opentelemetry::trace::SpanKind) override + { + return opentelemetry::nostd::shared_ptr( + new opentelemetry::trace::NoopSpan(nullptr)); + } +#endif +}; + +} // namespace + +// When XRPL_ENABLE_TELEMETRY is off OR setup.enabled is false, +// return NullTelemetry +#ifndef XRPL_ENABLE_TELEMETRY +std::unique_ptr +make_Telemetry(Telemetry::Setup const& setup, beast::Journal) +{ + return std::make_unique(setup); +} +#endif + +} // namespace telemetry +} // namespace xrpl diff --git a/src/libxrpl/telemetry/Telemetry.cpp b/src/libxrpl/telemetry/Telemetry.cpp new file mode 100644 index 0000000000..685c7c556d --- /dev/null +++ b/src/libxrpl/telemetry/Telemetry.cpp @@ -0,0 +1,272 @@ +#ifdef XRPL_ENABLE_TELEMETRY + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace xrpl { +namespace telemetry { + +namespace { + +namespace trace_api = opentelemetry::trace; +namespace trace_sdk = opentelemetry::sdk::trace; +namespace otlp_http = opentelemetry::exporter::otlp; +namespace resource = opentelemetry::sdk::resource; + +// A no-op implementation used when XRPL_ENABLE_TELEMETRY is defined +// but setup.enabled is false. This lives in the anonymous namespace +// so there is no ODR conflict with the NullTelemetry in +// NullTelemetry.cpp (which also lives in its own anonymous namespace). 
+class NullTelemetryOtel : public Telemetry +{ + Setup setup_; + +public: + explicit NullTelemetryOtel(Setup const& setup) : setup_(setup) + { + } + + void + start() override + { + } + + void + stop() override + { + } + + bool + isEnabled() const override + { + return false; + } + + bool + shouldTraceTransactions() const override + { + return false; + } + + bool + shouldTraceConsensus() const override + { + return false; + } + + bool + shouldTraceRpc() const override + { + return false; + } + + bool + shouldTracePeer() const override + { + return false; + } + + opentelemetry::nostd::shared_ptr + getTracer(std::string_view) override + { + static auto noopTracer = + opentelemetry::nostd::shared_ptr( + new trace_api::NoopTracer()); + return noopTracer; + } + + opentelemetry::nostd::shared_ptr + startSpan(std::string_view, trace_api::SpanKind) override + { + return opentelemetry::nostd::shared_ptr( + new trace_api::NoopSpan(nullptr)); + } + + opentelemetry::nostd::shared_ptr + startSpan( + std::string_view, + opentelemetry::context::Context const&, + trace_api::SpanKind) override + { + return opentelemetry::nostd::shared_ptr( + new trace_api::NoopSpan(nullptr)); + } +}; + +class TelemetryImpl : public Telemetry +{ + Setup setup_; + beast::Journal journal_; + std::shared_ptr sdkProvider_; + +public: + TelemetryImpl(Setup const& setup, beast::Journal journal) + : setup_(setup), journal_(journal) + { + } + + void + start() override + { + JLOG(journal_.info()) + << "Telemetry starting: endpoint=" << setup_.exporterEndpoint + << " sampling=" << setup_.samplingRatio; + + // Configure OTLP HTTP exporter + otlp_http::OtlpHttpExporterOptions exporterOpts; + exporterOpts.url = setup_.exporterEndpoint; + if (setup_.useTls) + exporterOpts.ssl_ca_cert_path = setup_.tlsCertPath; + + auto exporter = + otlp_http::OtlpHttpExporterFactory::Create(exporterOpts); + + // Configure batch processor + trace_sdk::BatchSpanProcessorOptions processorOpts; + processorOpts.max_queue_size = 
setup_.maxQueueSize; + processorOpts.schedule_delay_millis = + std::chrono::milliseconds(setup_.batchDelay); + processorOpts.max_export_batch_size = setup_.batchSize; + + auto processor = + trace_sdk::BatchSpanProcessorFactory::Create( + std::move(exporter), processorOpts); + + // Configure resource attributes + auto resourceAttrs = resource::Resource::Create({ + {resource::SemanticConventions::kServiceName, + setup_.serviceName}, + {resource::SemanticConventions::kServiceVersion, + setup_.serviceVersion}, + {resource::SemanticConventions::kServiceInstanceId, + setup_.serviceInstanceId}, + {"xrpl.network.id", + static_cast(setup_.networkId)}, + {"xrpl.network.type", setup_.networkType}, + }); + + // Configure sampler + auto sampler = + std::make_unique( + setup_.samplingRatio); + + // Create TracerProvider + sdkProvider_ = + trace_sdk::TracerProviderFactory::Create( + std::move(processor), + resourceAttrs, + std::move(sampler)); + + // Set as global provider + trace_api::Provider::SetTracerProvider( + opentelemetry::nostd::shared_ptr( + sdkProvider_)); + + JLOG(journal_.info()) << "Telemetry started successfully"; + } + + void + stop() override + { + JLOG(journal_.info()) << "Telemetry stopping"; + if (sdkProvider_) + { + // Force flush before shutdown + sdkProvider_->ForceFlush(); + sdkProvider_.reset(); + trace_api::Provider::SetTracerProvider( + opentelemetry::nostd::shared_ptr( + new trace_api::NoopTracerProvider())); + } + JLOG(journal_.info()) << "Telemetry stopped"; + } + + bool + isEnabled() const override + { + return true; + } + + bool + shouldTraceTransactions() const override + { + return setup_.traceTransactions; + } + + bool + shouldTraceConsensus() const override + { + return setup_.traceConsensus; + } + + bool + shouldTraceRpc() const override + { + return setup_.traceRpc; + } + + bool + shouldTracePeer() const override + { + return setup_.tracePeer; + } + + opentelemetry::nostd::shared_ptr + getTracer(std::string_view name) override + { + if 
(!sdkProvider_) + return trace_api::Provider::GetTracerProvider()->GetTracer( + std::string(name)); + return sdkProvider_->GetTracer(std::string(name)); + } + + opentelemetry::nostd::shared_ptr + startSpan(std::string_view name, trace_api::SpanKind kind) override + { + auto tracer = getTracer("rippled"); + trace_api::StartSpanOptions opts; + opts.kind = kind; + return tracer->StartSpan(std::string(name), opts); + } + + opentelemetry::nostd::shared_ptr + startSpan( + std::string_view name, + opentelemetry::context::Context const& parentContext, + trace_api::SpanKind kind) override + { + auto tracer = getTracer("rippled"); + trace_api::StartSpanOptions opts; + opts.kind = kind; + opts.parent = parentContext; + return tracer->StartSpan(std::string(name), opts); + } +}; + +} // namespace + +std::unique_ptr +make_Telemetry(Telemetry::Setup const& setup, beast::Journal journal) +{ + if (setup.enabled) + return std::make_unique(setup, journal); + return std::make_unique(setup); +} + +} // namespace telemetry +} // namespace xrpl + +#endif // XRPL_ENABLE_TELEMETRY diff --git a/src/libxrpl/telemetry/TelemetryConfig.cpp b/src/libxrpl/telemetry/TelemetryConfig.cpp new file mode 100644 index 0000000000..bb834bef52 --- /dev/null +++ b/src/libxrpl/telemetry/TelemetryConfig.cpp @@ -0,0 +1,50 @@ +#include + +namespace xrpl { +namespace telemetry { + +Telemetry::Setup +setup_Telemetry( + Section const& section, + std::string const& nodePublicKey, + std::string const& version) +{ + Telemetry::Setup setup; + + setup.enabled = section.value_or("enabled", 0) != 0; + setup.serviceName = section.value_or("service_name", "rippled"); + setup.serviceVersion = version; + setup.serviceInstanceId = + section.value_or("service_instance_id", nodePublicKey); + + setup.exporterType = + section.value_or("exporter", "otlp_http"); + setup.exporterEndpoint = section.value_or( + "endpoint", "http://localhost:4318/v1/traces"); + + setup.useTls = section.value_or("use_tls", 0) != 0; + setup.tlsCertPath = 
+ section.value_or("tls_ca_cert", ""); + + setup.samplingRatio = section.value_or("sampling_ratio", 1.0); + + setup.batchSize = + section.value_or("batch_size", 512u); + setup.batchDelay = std::chrono::milliseconds{ + section.value_or("batch_delay_ms", 5000u)}; + setup.maxQueueSize = + section.value_or("max_queue_size", 2048u); + + setup.traceTransactions = + section.value_or("trace_transactions", 1) != 0; + setup.traceConsensus = + section.value_or("trace_consensus", 1) != 0; + setup.traceRpc = section.value_or("trace_rpc", 1) != 0; + setup.tracePeer = section.value_or("trace_peer", 0) != 0; + setup.traceLedger = section.value_or("trace_ledger", 1) != 0; + + return setup; +} + +} // namespace telemetry +} // namespace xrpl diff --git a/src/xrpld/app/main/Application.cpp b/src/xrpld/app/main/Application.cpp index 91cc387d54..e2721739e0 100644 --- a/src/xrpld/app/main/Application.cpp +++ b/src/xrpld/app/main/Application.cpp @@ -51,6 +51,7 @@ #include #include #include +#include #include #include @@ -146,6 +147,7 @@ public: beast::Journal m_journal; std::unique_ptr perfLog_; + std::unique_ptr telemetry_; Application::MutexType m_masterMutex; // Required by the SHAMapStore @@ -250,45 +252,50 @@ public: , m_journal(logs_->journal("Application")) // PerfLog must be started before any other threads are launched. 
- , perfLog_( - perf::make_PerfLog( - perf::setup_PerfLog(config_->section("perf"), config_->CONFIG_DIR), - *this, - logs_->journal("PerfLog"), - [this] { signalStop("PerfLog"); })) + , perfLog_(perf::make_PerfLog( + perf::setup_PerfLog(config_->section("perf"), config_->CONFIG_DIR), + *this, + logs_->journal("PerfLog"), + [this] { signalStop("PerfLog"); })) + + , telemetry_(telemetry::make_Telemetry( + telemetry::setup_Telemetry( + config_->section("telemetry"), + "", // nodePublicKey not yet available at this point + BuildInfo::getVersionString()), + logs_->journal("Telemetry"))) , m_txMaster(*this) , m_collectorManager( make_CollectorManager(config_->section(SECTION_INSIGHT), logs_->journal("Collector"))) - , m_jobQueue( - std::make_unique( - [](std::unique_ptr const& config) { - if (config->standalone() && !config->FORCE_MULTI_THREAD) - return 1; + , m_jobQueue(std::make_unique( + [](std::unique_ptr const& config) { + if (config->standalone() && !config->FORCE_MULTI_THREAD) + return 1; - if (config->WORKERS) - return config->WORKERS; + if (config->WORKERS) + return config->WORKERS; - auto count = static_cast(std::thread::hardware_concurrency()); + auto count = static_cast(std::thread::hardware_concurrency()); - // Be more aggressive about the number of threads to use - // for the job queue if the server is configured as - // "large" or "huge" if there are enough cores. - if (config->NODE_SIZE >= 4 && count >= 16) - count = 6 + std::min(count, 8); - else if (config->NODE_SIZE >= 3 && count >= 8) - count = 4 + std::min(count, 6); - else - count = 2 + std::min(count, 4); + // Be more aggressive about the number of threads to use + // for the job queue if the server is configured as + // "large" or "huge" if there are enough cores. 
+ if (config->NODE_SIZE >= 4 && count >= 16) + count = 6 + std::min(count, 8); + else if (config->NODE_SIZE >= 3 && count >= 8) + count = 4 + std::min(count, 6); + else + count = 2 + std::min(count, 4); - return count; - }(config_), - m_collectorManager->group("jobq"), - logs_->journal("JobQueue"), - *logs_, - *perfLog_)) + return count; + }(config_), + m_collectorManager->group("jobq"), + logs_->journal("JobQueue"), + *logs_, + *perfLog_)) , m_nodeStoreScheduler(*m_jobQueue) @@ -323,18 +330,16 @@ public: , m_orderBookDB(make_OrderBookDB(*this, {config_->PATH_SEARCH_MAX, config_->standalone()})) - , m_pathRequests( - std::make_unique( - *this, - logs_->journal("PathRequest"), - m_collectorManager->collector())) + , m_pathRequests(std::make_unique( + *this, + logs_->journal("PathRequest"), + m_collectorManager->collector())) - , m_ledgerMaster( - std::make_unique( - *this, - stopwatch(), - m_collectorManager->collector(), - logs_->journal("LedgerMaster"))) + , m_ledgerMaster(std::make_unique( + *this, + stopwatch(), + m_collectorManager->collector(), + logs_->journal("LedgerMaster"))) , ledgerCleaner_(make_LedgerCleaner(*this, logs_->journal("LedgerCleaner"))) @@ -350,11 +355,10 @@ public: gotTXSet(set, fromAcquire); })) - , m_ledgerReplayer( - std::make_unique( - *this, - *m_inboundLedgers, - make_PeerSetBuilder(*this))) + , m_ledgerReplayer(std::make_unique( + *this, + *m_inboundLedgers, + make_PeerSetBuilder(*this))) , m_acceptedLedgerCache( "AcceptedLedger", @@ -385,14 +389,13 @@ public: , publisherManifests_(std::make_unique(logs_->journal("ManifestCache"))) - , validators_( - std::make_unique( - *validatorManifests_, - *publisherManifests_, - *timeKeeper_, - config_->legacy("database_path"), - logs_->journal("ValidatorList"), - config_->VALIDATION_QUORUM)) + , validators_(std::make_unique( + *validatorManifests_, + *publisherManifests_, + *timeKeeper_, + config_->legacy("database_path"), + logs_->journal("ValidatorList"), + config_->VALIDATION_QUORUM)) , 
validatorSites_(std::make_unique(*this)) @@ -617,6 +620,12 @@ public: return *perfLog_; } + telemetry::Telemetry& + getTelemetry() override + { + return *telemetry_; + } + NodeCache& getTempNodeCache() override { @@ -886,9 +895,8 @@ public: })) { using namespace std::chrono; - sweepTimer_.expires_after( - seconds{config_->SWEEP_INTERVAL.value_or( - config_->getValueFor(SizedItem::sweepInterval))}); + sweepTimer_.expires_after(seconds{ + config_->SWEEP_INTERVAL.value_or(config_->getValueFor(SizedItem::sweepInterval))}); sweepTimer_.async_wait(std::move(*optionalCountedHandler)); } } @@ -1463,6 +1471,7 @@ ApplicationImp::start(bool withTimers) ledgerCleaner_->start(); perfLog_->start(); + telemetry_->start(); } void @@ -1553,6 +1562,7 @@ ApplicationImp::run() ledgerCleaner_->stop(); m_nodeStore->stop(); perfLog_->stop(); + telemetry_->stop(); JLOG(m_journal.info()) << "Done."; } diff --git a/src/xrpld/rpc/detail/RPCHandler.cpp b/src/xrpld/rpc/detail/RPCHandler.cpp index 2066c8a5e9..6538d83d37 100644 --- a/src/xrpld/rpc/detail/RPCHandler.cpp +++ b/src/xrpld/rpc/detail/RPCHandler.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -157,6 +158,11 @@ template Status callMethod(JsonContext& context, Method method, std::string const& name, Object& result) { + XRPL_TRACE_RPC(context.app.getTelemetry(), "rpc.command." + name); + XRPL_TRACE_SET_ATTR("xrpl.rpc.command", name.c_str()); + XRPL_TRACE_SET_ATTR("xrpl.rpc.version", static_cast(context.apiVersion)); + XRPL_TRACE_SET_ATTR("xrpl.rpc.role", (context.role == Role::ADMIN ? 
"admin" : "user")); + static std::atomic requestId{0}; auto& perfLog = context.app.getPerfLog(); std::uint64_t const curId = ++requestId; @@ -172,12 +178,15 @@ callMethod(JsonContext& context, Method method, std::string const& name, Object& JLOG(context.j.debug()) << "RPC call " << name << " completed in " << ((end - start).count() / 1000000000.0) << "seconds"; perfLog.rpcFinish(name, curId); + XRPL_TRACE_SET_ATTR("xrpl.rpc.status", "success"); return ret; } catch (std::exception& e) { perfLog.rpcError(name, curId); JLOG(context.j.info()) << "Caught throw: " << e.what(); + XRPL_TRACE_EXCEPTION(e); + XRPL_TRACE_SET_ATTR("xrpl.rpc.status", "error"); if (context.loadType == Resource::feeReferenceRPC) context.loadType = Resource::feeExceptionRPC; diff --git a/src/xrpld/rpc/detail/ServerHandler.cpp b/src/xrpld/rpc/detail/ServerHandler.cpp index 5be41a5a23..4785df3b0f 100644 --- a/src/xrpld/rpc/detail/ServerHandler.cpp +++ b/src/xrpld/rpc/detail/ServerHandler.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -267,6 +268,8 @@ buffers_to_string(ConstBufferSequence const& bs) void ServerHandler::onRequest(Session& session) { + XRPL_TRACE_RPC(app_.getTelemetry(), "rpc.request"); + // Make sure RPC is enabled on the port if (session.port().protocol.count("http") == 0 && session.port().protocol.count("https") == 0) { @@ -378,6 +381,7 @@ ServerHandler::processSession( std::shared_ptr const& coro, Json::Value const& jv) { + XRPL_TRACE_RPC(app_.getTelemetry(), "rpc.ws_message"); auto is = std::static_pointer_cast(session->appDefined); if (is->getConsumer().disconnect(m_journal)) { @@ -566,6 +570,7 @@ ServerHandler::processRequest( std::string_view forwardedFor, std::string_view user) { + XRPL_TRACE_RPC(app_.getTelemetry(), "rpc.process"); auto rpcJ = app_.journal("RPC"); Json::Value jsonOrig; diff --git a/src/xrpld/telemetry/TracingInstrumentation.h b/src/xrpld/telemetry/TracingInstrumentation.h new file mode 100644 index 0000000000..45838bba0a --- 
/dev/null +++ b/src/xrpld/telemetry/TracingInstrumentation.h @@ -0,0 +1,69 @@ +#pragma once + +#ifdef XRPL_ENABLE_TELEMETRY + +#include +#include + +#include + +namespace xrpl { +namespace telemetry { + +// Start a span that is automatically ended when guard goes out of scope +#define XRPL_TRACE_SPAN(_tel_obj_, _span_name_) \ + auto _xrpl_span_ = (_tel_obj_).startSpan(_span_name_); \ + ::xrpl::telemetry::SpanGuard _xrpl_guard_(_xrpl_span_) + +// Start a span with specific kind +#define XRPL_TRACE_SPAN_KIND(_tel_obj_, _span_name_, _span_kind_) \ + auto _xrpl_span_ = (_tel_obj_).startSpan(_span_name_, _span_kind_); \ + ::xrpl::telemetry::SpanGuard _xrpl_guard_(_xrpl_span_) + +// Conditional span for RPC tracing +#define XRPL_TRACE_RPC(_tel_obj_, _span_name_) \ + std::optional<::xrpl::telemetry::SpanGuard> _xrpl_guard_; \ + if ((_tel_obj_).shouldTraceRpc()) { \ + _xrpl_guard_.emplace((_tel_obj_).startSpan(_span_name_)); \ + } + +// Conditional span for transaction tracing +#define XRPL_TRACE_TX(_tel_obj_, _span_name_) \ + std::optional<::xrpl::telemetry::SpanGuard> _xrpl_guard_; \ + if ((_tel_obj_).shouldTraceTransactions()) { \ + _xrpl_guard_.emplace((_tel_obj_).startSpan(_span_name_)); \ + } + +// Conditional span for consensus tracing +#define XRPL_TRACE_CONSENSUS(_tel_obj_, _span_name_) \ + std::optional<::xrpl::telemetry::SpanGuard> _xrpl_guard_; \ + if ((_tel_obj_).shouldTraceConsensus()) { \ + _xrpl_guard_.emplace((_tel_obj_).startSpan(_span_name_)); \ + } + +// Set attribute on current span (if exists) +#define XRPL_TRACE_SET_ATTR(key, value) \ + if (_xrpl_guard_.has_value()) { \ + _xrpl_guard_->setAttribute(key, value); \ + } + +// Record exception on current span +#define XRPL_TRACE_EXCEPTION(e) \ + if (_xrpl_guard_.has_value()) { \ + _xrpl_guard_->recordException(e); \ + } + +} // namespace telemetry +} // namespace xrpl + +#else // XRPL_ENABLE_TELEMETRY not defined + +#define XRPL_TRACE_SPAN(_tel_obj_, _span_name_) ((void)0) +#define 
XRPL_TRACE_SPAN_KIND(_tel_obj_, _span_name_, _span_kind_) ((void)0) +#define XRPL_TRACE_RPC(_tel_obj_, _span_name_) ((void)0) +#define XRPL_TRACE_TX(_tel_obj_, _span_name_) ((void)0) +#define XRPL_TRACE_CONSENSUS(_tel_obj_, _span_name_) ((void)0) +#define XRPL_TRACE_SET_ATTR(key, value) ((void)0) +#define XRPL_TRACE_EXCEPTION(e) ((void)0) + +#endif // XRPL_ENABLE_TELEMETRY