From 18bb528bc5b3805b8e4e795a7fe45fda23a0b749 Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Fri, 20 Mar 2026 17:22:12 +0000 Subject: [PATCH] Phase 1c: RPC integration - ServerHandler tracing, telemetry config wiring Co-Authored-By: Claude Opus 4.6 --- .../scripts/levelization/results/ordering.txt | 2 + presentation.md | 280 ++++++++++++++++++ src/xrpld/rpc/detail/RPCHandler.cpp | 9 + src/xrpld/rpc/detail/ServerHandler.cpp | 5 + src/xrpld/telemetry/TracingInstrumentation.h | 115 +++++++ 5 files changed, 411 insertions(+) create mode 100644 presentation.md create mode 100644 src/xrpld/telemetry/TracingInstrumentation.h diff --git a/.github/scripts/levelization/results/ordering.txt b/.github/scripts/levelization/results/ordering.txt index cb35a7e6cc..4187e99657 100644 --- a/.github/scripts/levelization/results/ordering.txt +++ b/.github/scripts/levelization/results/ordering.txt @@ -268,6 +268,7 @@ xrpld.perflog > xrpl.json xrpld.rpc > xrpl.basics xrpld.rpc > xrpl.core xrpld.rpc > xrpld.core +xrpld.rpc > xrpld.telemetry xrpld.rpc > xrpl.json xrpld.rpc > xrpl.ledger xrpld.rpc > xrpl.net @@ -278,3 +279,4 @@ xrpld.rpc > xrpl.resource xrpld.rpc > xrpl.server xrpld.rpc > xrpl.tx xrpld.shamap > xrpl.shamap +xrpld.telemetry > xrpl.telemetry diff --git a/presentation.md b/presentation.md new file mode 100644 index 0000000000..7a443a635c --- /dev/null +++ b/presentation.md @@ -0,0 +1,280 @@ +# OpenTelemetry Distributed Tracing for rippled + +--- + +## Slide 1: Introduction + +### What is OpenTelemetry? + +OpenTelemetry is an open-source, CNCF-backed observability framework for distributed tracing, metrics, and logs. + +### Why OpenTelemetry for rippled? + +- **End-to-End Transaction Visibility**: Track transactions from submission → consensus → ledger inclusion +- **Cross-Node Correlation**: Follow requests across multiple independent nodes using a unique `trace_id` +- **Consensus Round Analysis**: Understand timing and behavior across validators +- **Incident Debugging**: Correlate events across distributed nodes during issues + +```mermaid +flowchart LR + A["Node A
tx.receive
trace_id: abc123"] --> B["Node B
tx.relay
trace_id: abc123"] --> C["Node C
tx.validate
trace_id: abc123"] --> D["Node D
ledger.apply
trace_id: abc123"] + + style A fill:#1565c0,stroke:#0d47a1,color:#fff + style B fill:#2e7d32,stroke:#1b5e20,color:#fff + style C fill:#2e7d32,stroke:#1b5e20,color:#fff + style D fill:#e65100,stroke:#bf360c,color:#fff +``` + +> **Trace ID: abc123** — All nodes share the same trace, enabling cross-node correlation. + +--- + +## Slide 2: OpenTelemetry vs Open Source Alternatives + +| Feature | OpenTelemetry | Jaeger | Zipkin | SkyWalking | Pinpoint | Prometheus | +| ------------------- | ---------------- | ---------------- | ------------------ | ---------- | ---------- | ---------- | +| **Tracing** | YES | YES | YES | YES | YES | NO | +| **Metrics** | YES | NO | NO | YES | YES | YES | +| **Logs** | YES | NO | NO | YES | NO | NO | +| **C++ SDK** | YES Official | YES (Deprecated) | YES (Unmaintained) | NO | NO | YES | +| **Vendor Neutral** | YES Primary goal | NO | NO | NO | NO | NO | +| **Instrumentation** | Manual + Auto | Manual | Manual | Auto-first | Auto-first | Manual | +| **Backend** | Any (exporters) | Self | Self | Self | Self | Self | +| **CNCF Status** | Incubating | Graduated | NO | Incubating | NO | Graduated | + +> **Why OpenTelemetry?** It's the only actively maintained, full-featured C++ option with vendor neutrality — allowing export to Jaeger, Prometheus, Grafana, or any commercial backend without changing instrumentation. + +--- + +## Slide 3: Comparison with rippled's Existing Solutions + +### Current Observability Stack + +| Aspect | PerfLog (JSON) | StatsD (Metrics) | OpenTelemetry (NEW) | +| --------------------- | --------------------- | --------------------- | --------------------------- | +| **Type** | Logging | Metrics | Distributed Tracing | +| **Scope** | Single node | Single node | **Cross-node** | +| **Data** | JSON log entries | Counters, gauges | Spans with context | +| **Correlation** | By timestamp | By metric name | By `trace_id` | +| **Overhead** | Low (file I/O) | Low (UDP) | Low-Medium (configurable) | +| **Question Answered** | "What happened here?" | "How many? How fast?" | **"What was the journey?"** | + +### Use Case Matrix + +| Scenario | PerfLog | StatsD | OpenTelemetry | +| -------------------------------- | ------- | ------ | ------------- | +| "How many TXs per second?" | ❌ | ✅ | ❌ | +| "Why was this specific TX slow?" | ⚠️ | ❌ | ✅ | +| "Which node delayed consensus?" | ❌ | ❌ | ✅ | +| "Show TX journey across 5 nodes" | ❌ | ❌ | ✅ | + +> **Key Insight**: OpenTelemetry **complements** (not replaces) existing systems. + +--- + +## Slide 4: Architecture + +### High-Level Integration Architecture + +```mermaid +flowchart TB + subgraph rippled["rippled Node"] + subgraph services["Core Services"] + direction LR + RPC["RPC Server
(HTTP/WS)"] ~~~ Overlay["Overlay
(P2P Network)"] ~~~ Consensus["Consensus
(RCLConsensus)"] + end + + Telemetry["Telemetry Module
(OpenTelemetry SDK)"] + + services --> Telemetry + end + + Telemetry -->|OTLP/gRPC| Collector["OTel Collector"] + + Collector --> Tempo["Grafana Tempo"] + Collector --> Jaeger["Jaeger"] + Collector --> Elastic["Elastic APM"] + + style rippled fill:#424242,stroke:#212121,color:#fff + style services fill:#1565c0,stroke:#0d47a1,color:#fff + style Telemetry fill:#2e7d32,stroke:#1b5e20,color:#fff + style Collector fill:#e65100,stroke:#bf360c,color:#fff +``` + +### Context Propagation + +```mermaid +sequenceDiagram + participant Client + participant NodeA as Node A + participant NodeB as Node B + + Client->>NodeA: Submit TX (no context) + Note over NodeA: Creates trace_id: abc123
span: tx.receive + NodeA->>NodeB: Relay TX
(traceparent: abc123) + Note over NodeB: Links to trace_id: abc123
span: tx.relay +``` + +- **HTTP/RPC**: W3C Trace Context headers (`traceparent`) +- **P2P Messages**: Protocol Buffer extension fields + +--- + +## Slide 5: Implementation Plan + +### 5-Phase Rollout (9 Weeks) + +```mermaid +gantt + title Implementation Timeline + dateFormat YYYY-MM-DD + axisFormat Week %W + + section Phase 1 + Core Infrastructure :p1, 2024-01-01, 2w + + section Phase 2 + RPC Tracing :p2, after p1, 2w + + section Phase 3 + Transaction Tracing :p3, after p2, 2w + + section Phase 4 + Consensus Tracing :p4, after p3, 2w + + section Phase 5 + Documentation :p5, after p4, 1w +``` + +### Phase Details + +| Phase | Focus | Key Deliverables | Effort | +| ----- | ------------------- | -------------------------------------------- | ------- | +| 1 | Core Infrastructure | SDK integration, Telemetry interface, Config | 10 days | +| 2 | RPC Tracing | HTTP context extraction, Handler spans | 10 days | +| 3 | Transaction Tracing | Protobuf context, P2P relay propagation | 10 days | +| 4 | Consensus Tracing | Round spans, Proposal/validation tracing | 10 days | +| 5 | Documentation | Runbook, Dashboards, Training | 7 days | + +**Total Effort**: ~47 developer-days (2 developers) + +--- + +## Slide 6: Performance Overhead + +### Estimated System Impact + +| Metric | Overhead | Notes | +| ----------------- | ---------- | ----------------------------------- | +| **CPU** | 1-3% | Span creation and attribute setting | +| **Memory** | 2-5 MB | Batch buffer for pending spans | +| **Network** | 10-50 KB/s | Compressed OTLP export to collector | +| **Latency (p99)** | <2% | With proper sampling configuration | + +### Per-Message Overhead (Context Propagation) + +Each P2P message carries trace context with the following overhead: + +| Field | Size | Description | +| ------------- | ------------- | ----------------------------------------- | +| `trace_id` | 16 bytes | Unique identifier for the entire trace | +| `span_id` | 8 bytes | Current span (becomes parent on receiver) | +| `trace_flags` | 4 bytes | Sampling decision flags | +| `trace_state` | 0-4 bytes | Optional vendor-specific data | +| **Total** | **~32 bytes** | **Added per traced P2P message** | + +```mermaid +flowchart LR + subgraph msg["P2P Message with Trace Context"] + A["Original Message
(variable size)"] --> B["+ TraceContext
(~32 bytes)"] + end + + subgraph breakdown["Context Breakdown"] + C["trace_id
16 bytes"] + D["span_id
8 bytes"] + E["flags
4 bytes"] + F["state
0-4 bytes"] + end + + B --> breakdown + + style A fill:#424242,stroke:#212121,color:#fff + style B fill:#2e7d32,stroke:#1b5e20,color:#fff + style C fill:#1565c0,stroke:#0d47a1,color:#fff + style D fill:#1565c0,stroke:#0d47a1,color:#fff + style E fill:#e65100,stroke:#bf360c,color:#fff + style F fill:#4a148c,stroke:#2e0d57,color:#fff +``` + +> **Note**: 32 bytes is negligible compared to typical transaction messages (hundreds to thousands of bytes) + +### Mitigation Strategies + +```mermaid +flowchart LR + A["Head Sampling
10% default"] --> B["Tail Sampling
Keep errors/slow"] --> C["Batch Export
Reduce I/O"] --> D["Conditional Compile
XRPL_ENABLE_TELEMETRY"] + + style A fill:#1565c0,stroke:#0d47a1,color:#fff + style B fill:#2e7d32,stroke:#1b5e20,color:#fff + style C fill:#e65100,stroke:#bf360c,color:#fff + style D fill:#4a148c,stroke:#2e0d57,color:#fff +``` + +### Kill Switches (Rollback Options) + +1. **Config Disable**: Set `enabled=0` in config → instant disable, no restart needed for sampling +2. **Rebuild**: Compile with `XRPL_ENABLE_TELEMETRY=OFF` → zero overhead (no-op) +3. **Full Revert**: Clean separation allows easy commit reversion + +--- + +## Slide 7: Data Collection & Privacy + +### What Data is Collected + +| Category | Attributes Collected | Purpose | +| --------------- | ---------------------------------------------------------------------------------- | --------------------------- | +| **Transaction** | `tx.hash`, `tx.type`, `tx.result`, `tx.fee`, `ledger_index` | Trace transaction lifecycle | +| **Consensus** | `round`, `phase`, `mode`, `proposers`(public key or public node id), `duration_ms` | Analyze consensus timing | +| **RPC** | `command`, `version`, `status`, `duration_ms` | Monitor RPC performance | +| **Peer** | `peer.id`(public key), `latency_ms`, `message.type`, `message.size` | Network topology analysis | +| **Ledger** | `ledger.hash`, `ledger.index`, `close_time`, `tx_count` | Ledger progression tracking | +| **Job** | `job.type`, `queue_ms`, `worker` | JobQueue performance | + +### What is NOT Collected (Privacy Guarantees) + +```mermaid +flowchart LR + subgraph notCollected["❌ NOT Collected"] + direction LR + A["Private Keys"] ~~~ B["Account Balances"] ~~~ C["Transaction Amounts"] + end + + subgraph alsoNot["❌ Also Excluded"] + direction LR + D["IP Addresses
(configurable)"] ~~~ E["Personal Data"] ~~~ F["Raw TX Payloads"] + end + + style A fill:#c62828,stroke:#8c2809,color:#fff + style B fill:#c62828,stroke:#8c2809,color:#fff + style C fill:#c62828,stroke:#8c2809,color:#fff + style D fill:#c62828,stroke:#8c2809,color:#fff + style E fill:#c62828,stroke:#8c2809,color:#fff + style F fill:#c62828,stroke:#8c2809,color:#fff +``` + +### Privacy Protection Mechanisms + +| Mechanism | Description | +| -------------------------- | ------------------------------------------------------------- | +| **Account Hashing** | `xrpl.tx.account` is hashed at collector level before storage | +| **Configurable Redaction** | Sensitive fields can be excluded via config | +| **Sampling** | Only 10% of traces recorded by default (reduces exposure) | +| **Local Control** | Node operators control what gets exported | +| **No Raw Payloads** | Transaction content is never recorded, only metadata | + +> **Key Principle**: Telemetry collects **operational metadata** (timing, counts, hashes) — never **sensitive content** (keys, balances, amounts). + +--- + +_End of Presentation_ diff --git a/src/xrpld/rpc/detail/RPCHandler.cpp b/src/xrpld/rpc/detail/RPCHandler.cpp index 1d8e1168b4..b19a2ba184 100644 --- a/src/xrpld/rpc/detail/RPCHandler.cpp +++ b/src/xrpld/rpc/detail/RPCHandler.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -157,6 +158,11 @@ template Status callMethod(JsonContext& context, Method method, std::string const& name, Object& result) { + XRPL_TRACE_RPC(context.app.getTelemetry(), "rpc.command." + name); + XRPL_TRACE_SET_ATTR("xrpl.rpc.command", name.c_str()); + XRPL_TRACE_SET_ATTR("xrpl.rpc.version", static_cast(context.apiVersion)); + XRPL_TRACE_SET_ATTR("xrpl.rpc.role", (context.role == Role::ADMIN ? "admin" : "user")); + static std::atomic requestId{0}; auto& perfLog = context.app.getPerfLog(); std::uint64_t const curId = ++requestId; @@ -172,12 +178,15 @@ callMethod(JsonContext& context, Method method, std::string const& name, Object& JLOG(context.j.debug()) << "RPC call " << name << " completed in " << ((end - start).count() / 1000000000.0) << "seconds"; perfLog.rpcFinish(name, curId); + XRPL_TRACE_SET_ATTR("xrpl.rpc.status", "success"); return ret; } catch (std::exception& e) { perfLog.rpcError(name, curId); JLOG(context.j.info()) << "Caught throw: " << e.what(); + XRPL_TRACE_EXCEPTION(e); + XRPL_TRACE_SET_ATTR("xrpl.rpc.status", "error"); if (context.loadType == Resource::feeReferenceRPC) context.loadType = Resource::feeExceptionRPC; diff --git a/src/xrpld/rpc/detail/ServerHandler.cpp b/src/xrpld/rpc/detail/ServerHandler.cpp index e5cc7a83bf..c3460333f1 100644 --- a/src/xrpld/rpc/detail/ServerHandler.cpp +++ b/src/xrpld/rpc/detail/ServerHandler.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -266,6 +267,8 @@ buffers_to_string(ConstBufferSequence const& bs) void ServerHandler::onRequest(Session& session) { + XRPL_TRACE_RPC(app_.getTelemetry(), "rpc.request"); + // Make sure RPC is enabled on the port if (session.port().protocol.count("http") == 0 && session.port().protocol.count("https") == 0) { @@ -381,6 +384,7 @@ ServerHandler::processSession( std::shared_ptr const& coro, Json::Value const& jv) { + XRPL_TRACE_RPC(app_.getTelemetry(), "rpc.ws_message"); auto is = std::static_pointer_cast(session->appDefined); if (is->getConsumer().disconnect(m_journal)) { @@ -573,6 +577,7 @@ ServerHandler::processRequest( std::string_view forwardedFor, std::string_view user) { + XRPL_TRACE_RPC(app_.getTelemetry(), "rpc.process"); auto rpcJ = app_.getJournal("RPC"); Json::Value jsonOrig; diff --git a/src/xrpld/telemetry/TracingInstrumentation.h b/src/xrpld/telemetry/TracingInstrumentation.h new file mode 100644 index 0000000000..d7d2ecf912 --- /dev/null +++ b/src/xrpld/telemetry/TracingInstrumentation.h @@ -0,0 +1,115 @@ +#pragma once + +/** Convenience macros for instrumenting code with OpenTelemetry trace spans. + + When XRPL_ENABLE_TELEMETRY is defined, the macros create SpanGuard objects + that manage span lifetime via RAII. When not defined, all macros expand to + ((void)0) with zero overhead. + + Usage in instrumented code: + @code + XRPL_TRACE_RPC(app.getTelemetry(), "rpc.command." + name); + XRPL_TRACE_SET_ATTR("xrpl.rpc.command", name); + XRPL_TRACE_SET_ATTR("xrpl.rpc.status", "success"); + @endcode + + @note Macro parameter names use leading/trailing underscores + (e.g. _tel_obj_) to avoid colliding with identifiers in the macro body, + specifically the ::xrpl::telemetry:: namespace qualifier. +*/ + +#ifdef XRPL_ENABLE_TELEMETRY + +#include +#include + +#include + +namespace xrpl { +namespace telemetry { + +/** Start an unconditional span, ended when the guard goes out of scope. + @param _tel_obj_ Telemetry instance reference. + @param _span_name_ Span name string. +*/ +#define XRPL_TRACE_SPAN(_tel_obj_, _span_name_) \ + auto _xrpl_span_ = (_tel_obj_).startSpan(_span_name_); \ + ::xrpl::telemetry::SpanGuard _xrpl_guard_(_xrpl_span_) + +/** Start an unconditional span with a specific SpanKind. + @param _tel_obj_ Telemetry instance reference. + @param _span_name_ Span name string. + @param _span_kind_ opentelemetry::trace::SpanKind value. +*/ +#define XRPL_TRACE_SPAN_KIND(_tel_obj_, _span_name_, _span_kind_) \ + auto _xrpl_span_ = (_tel_obj_).startSpan(_span_name_, _span_kind_); \ + ::xrpl::telemetry::SpanGuard _xrpl_guard_(_xrpl_span_) + +/** Conditionally start a span for RPC tracing. + The span is only created if shouldTraceRpc() returns true. + @param _tel_obj_ Telemetry instance reference. + @param _span_name_ Span name string. +*/ +#define XRPL_TRACE_RPC(_tel_obj_, _span_name_) \ + std::optional<::xrpl::telemetry::SpanGuard> _xrpl_guard_; \ + if ((_tel_obj_).shouldTraceRpc()) \ + { \ + _xrpl_guard_.emplace((_tel_obj_).startSpan(_span_name_)); \ + } + +/** Conditionally start a span for transaction tracing. + The span is only created if shouldTraceTransactions() returns true. + @param _tel_obj_ Telemetry instance reference. + @param _span_name_ Span name string. +*/ +#define XRPL_TRACE_TX(_tel_obj_, _span_name_) \ + std::optional<::xrpl::telemetry::SpanGuard> _xrpl_guard_; \ + if ((_tel_obj_).shouldTraceTransactions()) \ + { \ + _xrpl_guard_.emplace((_tel_obj_).startSpan(_span_name_)); \ + } + +/** Conditionally start a span for consensus tracing. + The span is only created if shouldTraceConsensus() returns true. + @param _tel_obj_ Telemetry instance reference. + @param _span_name_ Span name string. +*/ +#define XRPL_TRACE_CONSENSUS(_tel_obj_, _span_name_) \ + std::optional<::xrpl::telemetry::SpanGuard> _xrpl_guard_; \ + if ((_tel_obj_).shouldTraceConsensus()) \ + { \ + _xrpl_guard_.emplace((_tel_obj_).startSpan(_span_name_)); \ + } + +/** Set a key-value attribute on the current span (if it exists). + Must be used after one of the XRPL_TRACE_* span macros. +*/ +#define XRPL_TRACE_SET_ATTR(key, value) \ + if (_xrpl_guard_.has_value()) \ + { \ + _xrpl_guard_->setAttribute(key, value); \ + } + +/** Record an exception on the current span and mark it as error. + Must be used after one of the XRPL_TRACE_* span macros. +*/ +#define XRPL_TRACE_EXCEPTION(e) \ + if (_xrpl_guard_.has_value()) \ + { \ + _xrpl_guard_->recordException(e); \ + } + +} // namespace telemetry +} // namespace xrpl + +#else // XRPL_ENABLE_TELEMETRY not defined + +#define XRPL_TRACE_SPAN(_tel_obj_, _span_name_) ((void)0) +#define XRPL_TRACE_SPAN_KIND(_tel_obj_, _span_name_, _span_kind_) ((void)0) +#define XRPL_TRACE_RPC(_tel_obj_, _span_name_) ((void)0) +#define XRPL_TRACE_TX(_tel_obj_, _span_name_) ((void)0) +#define XRPL_TRACE_CONSENSUS(_tel_obj_, _span_name_) ((void)0) +#define XRPL_TRACE_SET_ATTR(key, value) ((void)0) +#define XRPL_TRACE_EXCEPTION(e) ((void)0) + +#endif // XRPL_ENABLE_TELEMETRY