diff --git a/OpenTelemetryPlan/01-architecture-analysis.md b/OpenTelemetryPlan/01-architecture-analysis.md index c62ac3454c..fde1833349 100644 --- a/OpenTelemetryPlan/01-architecture-analysis.md +++ b/OpenTelemetryPlan/01-architecture-analysis.md @@ -247,14 +247,14 @@ flowchart TB subgraph request["rpc.request (root span)"] http["HTTP Request — POST /
traceparent:
00-abc123...-def456...-01"] - attrs["Attributes:
http.method = POST
net.peer.ip = 192.168.1.100
xrpl.rpc.command = submit"] + attrs["Attributes:
http.method = POST
net.peer.ip = 192.168.1.100
command = submit"] subgraph enqueue["jobqueue.enqueue"] job_attr["xrpl.job.type = jtCLIENT_RPC"] end subgraph command["rpc.command.submit"] - cmd_attrs["xrpl.rpc.version = 2
xrpl.rpc.role = user"] + cmd_attrs["version = 2
rpc_role = user"] cmd_children["├── tx.deserialize
├── tx.validate_local
└── tx.submit_to_network"] end @@ -359,7 +359,7 @@ After implementing OpenTelemetry, operators and developers will gain visibility | **Transaction Lifecycle** | Full journey from RPC submission through validation, relay, consensus, and ledger inclusion | `{service.name="xrpld" && xrpl.tx.hash="ABC123..."}` | | **Cross-Node Propagation** | Transaction path across multiple xrpld nodes with timing | `{xrpl.tx.relay_count > 0}` | | **Consensus Rounds** | Complete round with all phases (open, establish, accept) | `{span.name=~"consensus.round.*"}` | -| **RPC Request Processing** | Individual command execution with timing breakdown | `{xrpl.rpc.command="account_info"}` | +| **RPC Request Processing** | Individual command execution with timing breakdown | `{command="account_info"}` | | **Ledger Acquisition** | Peer-to-peer ledger data requests and responses | `{span.name="ledger.acquire"}` | | **PathFinding Latency** | Path computation time and cache effectiveness for payment RPCs | `{span.name="pathfind.compute"}` | | **TxQ Behavior** | Queue depth, eviction patterns, fee escalation during congestion | `{span.name=~"txq.*"}` | @@ -458,7 +458,7 @@ xychart-beta 1. **Find Transaction**: Query by `xrpl.tx.hash` to get full trace 2. **Identify Bottleneck**: Look at span durations to find slowest component -3. **Check Attributes**: Review `xrpl.tx.validity`, `xrpl.rpc.status` for errors +3. **Check Attributes**: Review `xrpl.tx.validity`, `rpc_status` for errors 4. **Correlate Logs**: Use `trace_id` to find related PerfLog entries 5. 
**Compare Nodes**: Filter by `service.instance.id` to compare behavior across nodes diff --git a/OpenTelemetryPlan/02-design-decisions.md b/OpenTelemetryPlan/02-design-decisions.md index fe87fc78db..7b9f4dd140 100644 --- a/OpenTelemetryPlan/02-design-decisions.md +++ b/OpenTelemetryPlan/02-design-decisions.md @@ -244,10 +244,10 @@ resource::SemanticConventions::SERVICE_INSTANCE_ID = #### RPC Attributes ```cpp -"xrpl.rpc.command" = string // Command name -"xrpl.rpc.version" = int64 // API version -"xrpl.rpc.role" = string // "admin" or "user" -"xrpl.rpc.params" = string // Sanitized parameters (optional) +"command" = string // Command name +"version" = int64 // API version +"rpc_role" = string // "admin" or "user" +"xrpl.rpc.params" = string // Sanitized parameters (optional, planned) ``` #### Peer & Message Attributes diff --git a/OpenTelemetryPlan/03-implementation-strategy.md b/OpenTelemetryPlan/03-implementation-strategy.md index 9a4baf7131..61e522719b 100644 --- a/OpenTelemetryPlan/03-implementation-strategy.md +++ b/OpenTelemetryPlan/03-implementation-strategy.md @@ -490,11 +490,11 @@ void ServerHandler::onRequest(...) { // After (only ~4 lines added) void ServerHandler::onRequest(...) { auto span = telemetry::SpanGuard::rpcSpan("rpc.request"); // +1 line - span.setAttribute("xrpl.rpc.command", command); // +1 line + span.setAttribute("command", command); // +1 line auto result = processRequest(req); - span.setAttribute("xrpl.rpc.status", status); // +1 line + span.setAttribute("rpc_status", status); // +1 line send(result); } ``` diff --git a/OpenTelemetryPlan/04-code-samples.md b/OpenTelemetryPlan/04-code-samples.md index 9a637c0c05..d4d5c0bdc0 100644 --- a/OpenTelemetryPlan/04-code-samples.md +++ b/OpenTelemetryPlan/04-code-samples.md @@ -346,11 +346,11 @@ void ServerHandler::onRequest(...) // Factory creates a span if RPC tracing is enabled, no-op otherwise. // No Telemetry& reference needed -- accessed via global singleton. 
auto span = telemetry::SpanGuard::rpcSpan("rpc.request"); - span.setAttribute("xrpl.rpc.command", command); + span.setAttribute("command", command); auto result = processRequest(req); - span.setAttribute("xrpl.rpc.status", result.status()); + span.setAttribute("rpc_status", result.status()); span.setOk(); // span ended automatically when it goes out of scope } @@ -841,7 +841,7 @@ ServerHandler::onRequest( ? jv["method"].asString() : "unknown"; - span.setAttribute("xrpl.rpc.command", command); + span.setAttribute("command", command); // Create child span for command execution { @@ -854,7 +854,7 @@ ServerHandler::onRequest( // Record result attributes if (result.isMember("status")) { - cmdSpan.setAttribute("xrpl.rpc.status", + cmdSpan.setAttribute("rpc_status", result["status"].asString()); } diff --git a/OpenTelemetryPlan/05-configuration-reference.md b/OpenTelemetryPlan/05-configuration-reference.md index 1f56a7abf0..6c94e16513 100644 --- a/OpenTelemetryPlan/05-configuration-reference.md +++ b/OpenTelemetryPlan/05-configuration-reference.md @@ -480,7 +480,7 @@ processors: - name: rpc-spans type: string_attribute string_attribute: - key: xrpl.rpc.command + key: command values: [".*"] enabled_regex_matching: true - name: latency @@ -738,7 +738,7 @@ providers: "targets": [ { "queryType": "traceql", - "query": "{resource.service.name=\"xrpld\" && span.xrpl.rpc.command != \"\"} | histogram_over_time(duration) by (span.xrpl.rpc.command)" + "query": "{resource.service.name=\"xrpld\" && span.command != \"\"} | histogram_over_time(duration) by (span.command)" } ], "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 } @@ -750,7 +750,7 @@ providers: "targets": [ { "queryType": "traceql", - "query": "{resource.service.name=\"xrpld\" && status.code=error} | rate() by (span.xrpl.rpc.command)" + "query": "{resource.service.name=\"xrpld\" && status.code=error} | rate() by (span.command)" } ], "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 } @@ -762,7 +762,7 @@ providers: "targets": [ { 
"queryType": "traceql", - "query": "{resource.service.name=\"xrpld\" && span.xrpl.rpc.command != \"\"} | avg(duration) by (span.xrpl.rpc.command) | topk(10)" + "query": "{resource.service.name=\"xrpld\" && span.command != \"\"} | avg(duration) by (span.command) | topk(10)" } ], "gridPos": { "h": 8, "w": 24, "x": 0, "y": 8 } diff --git a/OpenTelemetryPlan/POC_taskList.md b/OpenTelemetryPlan/POC_taskList.md index 8cd390ef5b..5f93886200 100644 --- a/OpenTelemetryPlan/POC_taskList.md +++ b/OpenTelemetryPlan/POC_taskList.md @@ -302,8 +302,8 @@ // Each factory checks the global Telemetry instance internally. // No Telemetry& reference needed at the call site. auto span = telemetry::SpanGuard::rpcSpan("rpc.request"); - span.setAttribute("xrpl.rpc.command", command); - span.setAttribute("xrpl.rpc.status", status); + span.setAttribute("command", command); + span.setAttribute("rpc_status", status); ``` - Factory methods: `rpcSpan()`, `txSpan()`, `consensusSpan()`, `peerSpan()`, `ledgerSpan()`, `span()` @@ -336,12 +336,12 @@ - `#include ` - In `ServerHandler::onRequest(Session& session)`: - At the top of the method, add: `auto span = telemetry::SpanGuard::rpcSpan("rpc.request");` - - After the RPC command name is extracted, set attribute: `span.setAttribute("xrpl.rpc.command", command);` + - After the RPC command name is extracted, set attribute: `span.setAttribute("command", command);` - After the response status is known, set: `span.setAttribute("http.status_code", static_cast(statusCode));` - Wrap error paths with: `span.recordException(e);` - In `ServerHandler::processRequest(...)`: - Add a child span: `auto span = telemetry::SpanGuard::rpcSpan("rpc.process");` - - Set method attribute: `span.setAttribute("xrpl.rpc.method", request_method);` + - Set method attribute: `span.setAttribute("method", request_method);` - In `ServerHandler::onWSMessage(...)` (WebSocket path): - Add: `auto span = telemetry::SpanGuard::rpcSpan("rpc.ws.message");` @@ -362,7 +362,7 @@ - 
[01-architecture-analysis.md §1.5](./01-architecture-analysis.md) — RPC request flow diagram: HTTP request -> attributes -> jobqueue.enqueue -> rpc.command -> response - [01-architecture-analysis.md §1.6](./01-architecture-analysis.md) — Key trace points table: `rpc.request` in `ServerHandler.cpp::onRequest()` (Priority: High) - [02-design-decisions.md §2.3](./02-design-decisions.md) — Span naming convention: `rpc.request`, `rpc.command.*` -- [02-design-decisions.md §2.4.2](./02-design-decisions.md) — RPC span attributes: `xrpl.rpc.command`, `xrpl.rpc.version`, `xrpl.rpc.role`, `xrpl.rpc.params` +- [02-design-decisions.md §2.4.2](./02-design-decisions.md) — RPC span attributes: `command`, `version`, `rpc_role`, `xrpl.rpc.params` - [03-implementation-strategy.md §3.9.2](./03-implementation-strategy.md) — File impact: `ServerHandler.cpp` ~40 lines added, ~10 changed (Low risk) --- @@ -378,17 +378,17 @@ - In `doCommand(RPC::JsonContext& context, Json::Value& result)`: - At the top: `auto span = telemetry::SpanGuard::rpcSpan("rpc.command." + context.method);` - Set attributes: - - `span.setAttribute("xrpl.rpc.command", context.method);` - - `span.setAttribute("xrpl.rpc.version", static_cast<int64_t>(context.apiVersion));` - - `span.setAttribute("xrpl.rpc.role", (context.role == Role::ADMIN) ? "admin" : "user");` - - On success: `span.setAttribute("xrpl.rpc.status", "success");` - - On error: `span.setAttribute("xrpl.rpc.status", "error");` and set the error message + - `span.setAttribute("command", context.method);` + - `span.setAttribute("version", static_cast<int64_t>(context.apiVersion));` + - `span.setAttribute("rpc_role", (context.role == Role::ADMIN) ? 
"admin" : "user");` + - On success: `span.setAttribute("rpc_status", "success");` + - On error: `span.setAttribute("rpc_status", "error");` and set the error message - After this, traces in Tempo/Grafana should look like: ``` - rpc.request (xrpl.rpc.command=account_info) + rpc.request (command=account_info) └── rpc.process - └── rpc.command.account_info (xrpl.rpc.version=2, xrpl.rpc.role=user, xrpl.rpc.status=success) + └── rpc.command.account_info (version=2, rpc_role=user, rpc_status=success) ``` **Key modified file**: @@ -399,7 +399,7 @@ - [04-code-samples.md §4.5.3](./04-code-samples.md) — `ServerHandler::onRequest()` code sample (includes child span pattern for `rpc.command.*`) - [02-design-decisions.md §2.3](./02-design-decisions.md) — Span naming: `rpc.command.*` pattern with dynamic command name (e.g., `rpc.command.server_info`) -- [02-design-decisions.md §2.4.2](./02-design-decisions.md) — RPC attribute schema: `xrpl.rpc.command`, `xrpl.rpc.version`, `xrpl.rpc.role`, `xrpl.rpc.status` +- [02-design-decisions.md §2.4.2](./02-design-decisions.md) — RPC attribute schema: `command`, `version`, `rpc_role`, `rpc_status` - [01-architecture-analysis.md §1.6](./01-architecture-analysis.md) — Key trace points table: `rpc.command.*` in `RPCHandler.cpp::doCommand()` (Priority: High) - [02-design-decisions.md §2.6.5](./02-design-decisions.md) — Correlation with PerfLog: how `doCommand()` can link trace_id with existing PerfLog entries - [03-implementation-strategy.md §3.4.4](./03-implementation-strategy.md) — RPC request overhead budget: ~1.75 μs total per request @@ -472,7 +472,7 @@ - Navigate to Explore → select Tempo datasource - Search for service `xrpld` - Confirm you see traces with spans: `rpc.request` -> `rpc.process` -> `rpc.command.server_info` - - Click into a trace and verify attributes: `xrpl.rpc.command`, `xrpl.rpc.status`, `xrpl.rpc.version` + - Click into a trace and verify attributes: `command`, `rpc_status`, `version` 7. 
**Verify zero-overhead when disabled**: - Rebuild with `XRPL_ENABLE_TELEMETRY=OFF`, or set `enabled=0` in config @@ -486,7 +486,7 @@ - [ ] xrpld starts and connects to OTel Collector (check xrpld logs for telemetry messages) - [ ] Traces appear in Grafana/Tempo under service "xrpld" - [ ] Span hierarchy is correct (parent-child relationships) -- [ ] Span attributes are populated (`xrpl.rpc.command`, `xrpl.rpc.status`, etc.) +- [ ] Span attributes are populated (`command`, `rpc_status`, etc.) - [ ] Error spans show error status and message - [ ] Building with `XRPL_ENABLE_TELEMETRY=OFF` produces no regressions - [ ] Setting `enabled=0` at runtime produces no traces and no errors @@ -572,8 +572,8 @@ The current POC exports **traces only**. Grafana's Explore view can query Tempo explicit: buckets: [1ms, 5ms, 10ms, 25ms, 50ms, 100ms, 250ms, 500ms, 1s, 5s] dimensions: - - name: xrpl.rpc.command - - name: xrpl.rpc.status + - name: command + - name: rpc_status exporters: prometheus: diff --git a/include/xrpl/telemetry/Telemetry.h b/include/xrpl/telemetry/Telemetry.h index 1d69e01a43..090ba602ed 100644 --- a/include/xrpl/telemetry/Telemetry.h +++ b/include/xrpl/telemetry/Telemetry.h @@ -50,7 +50,7 @@ if (telemetry.isEnabled() && telemetry.shouldTraceRpc()) { SpanGuard guard(telemetry.startSpan("rpc.command.submit")); - guard.setAttribute("xrpl.rpc.command", "submit"); + guard.setAttribute("command", "submit"); // ... guard ends span automatically on scope exit } @endcode