mirror of
https://github.com/XRPLF/rippled.git
synced 2026-06-02 08:17:13 +00:00
docs(telemetry): update plan docs for simplified RPC/gRPC attr naming
Update OpenTelemetryPlan docs and Telemetry.h doc example to reflect the renamed per-span attributes: xrpl.rpc.command -> command, xrpl.rpc.status -> rpc_status, xrpl.grpc.method -> method, etc.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -247,14 +247,14 @@ flowchart TB
|
||||
subgraph request["rpc.request (root span)"]
|
||||
http["HTTP Request — POST /<br/>traceparent:<br/>00-abc123...-def456...-01"]
|
||||
|
||||
attrs["Attributes:<br/>http.method = POST<br/>net.peer.ip = 192.168.1.100<br/>xrpl.rpc.command = submit"]
|
||||
attrs["Attributes:<br/>http.method = POST<br/>net.peer.ip = 192.168.1.100<br/>command = submit"]
|
||||
|
||||
subgraph enqueue["jobqueue.enqueue"]
|
||||
job_attr["xrpl.job.type = jtCLIENT_RPC"]
|
||||
end
|
||||
|
||||
subgraph command["rpc.command.submit"]
|
||||
cmd_attrs["xrpl.rpc.version = 2<br/>xrpl.rpc.role = user"]
|
||||
cmd_attrs["version = 2<br/>rpc_role = user"]
|
||||
cmd_children["├── tx.deserialize<br/>├── tx.validate_local<br/>└── tx.submit_to_network"]
|
||||
end
|
||||
|
||||
@@ -359,7 +359,7 @@ After implementing OpenTelemetry, operators and developers will gain visibility
|
||||
| **Transaction Lifecycle** | Full journey from RPC submission through validation, relay, consensus, and ledger inclusion | `{service.name="xrpld" && xrpl.tx.hash="ABC123..."}` |
|
||||
| **Cross-Node Propagation** | Transaction path across multiple xrpld nodes with timing | `{xrpl.tx.relay_count > 0}` |
|
||||
| **Consensus Rounds** | Complete round with all phases (open, establish, accept) | `{span.name=~"consensus.round.*"}` |
|
||||
| **RPC Request Processing** | Individual command execution with timing breakdown | `{xrpl.rpc.command="account_info"}` |
|
||||
| **RPC Request Processing** | Individual command execution with timing breakdown | `{command="account_info"}` |
|
||||
| **Ledger Acquisition** | Peer-to-peer ledger data requests and responses | `{span.name="ledger.acquire"}` |
|
||||
| **PathFinding Latency** | Path computation time and cache effectiveness for payment RPCs | `{span.name="pathfind.compute"}` |
|
||||
| **TxQ Behavior** | Queue depth, eviction patterns, fee escalation during congestion | `{span.name=~"txq.*"}` |
|
||||
@@ -458,7 +458,7 @@ xychart-beta
|
||||
|
||||
1. **Find Transaction**: Query by `xrpl.tx.hash` to get full trace
|
||||
2. **Identify Bottleneck**: Look at span durations to find slowest component
|
||||
3. **Check Attributes**: Review `xrpl.tx.validity`, `xrpl.rpc.status` for errors
|
||||
3. **Check Attributes**: Review `xrpl.tx.validity`, `rpc_status` for errors
|
||||
4. **Correlate Logs**: Use `trace_id` to find related PerfLog entries
|
||||
5. **Compare Nodes**: Filter by `service.instance.id` to compare behavior across nodes
|
||||
|
||||
|
||||
@@ -244,10 +244,10 @@ resource::SemanticConventions::SERVICE_INSTANCE_ID = <node_public_key_base58>
|
||||
#### RPC Attributes
|
||||
|
||||
```cpp
|
||||
"xrpl.rpc.command" = string // Command name
|
||||
"xrpl.rpc.version" = int64 // API version
|
||||
"xrpl.rpc.role" = string // "admin" or "user"
|
||||
"xrpl.rpc.params" = string // Sanitized parameters (optional)
|
||||
"command" = string // Command name
|
||||
"version" = int64 // API version
|
||||
"rpc_role" = string // "admin" or "user"
|
||||
"xrpl.rpc.params" = string // Sanitized parameters (optional, planned)
|
||||
```
|
||||
|
||||
#### Peer & Message Attributes
|
||||
|
||||
@@ -490,11 +490,11 @@ void ServerHandler::onRequest(...) {
|
||||
// After (only ~4 lines added)
|
||||
void ServerHandler::onRequest(...) {
|
||||
auto span = telemetry::SpanGuard::rpcSpan("rpc.request"); // +1 line
|
||||
span.setAttribute("xrpl.rpc.command", command); // +1 line
|
||||
span.setAttribute("command", command); // +1 line
|
||||
|
||||
auto result = processRequest(req);
|
||||
|
||||
span.setAttribute("xrpl.rpc.status", status); // +1 line
|
||||
span.setAttribute("rpc_status", status); // +1 line
|
||||
send(result);
|
||||
}
|
||||
```
|
||||
|
||||
@@ -346,11 +346,11 @@ void ServerHandler::onRequest(...)
|
||||
// Factory creates a span if RPC tracing is enabled, no-op otherwise.
|
||||
// No Telemetry& reference needed -- accessed via global singleton.
|
||||
auto span = telemetry::SpanGuard::rpcSpan("rpc.request");
|
||||
span.setAttribute("xrpl.rpc.command", command);
|
||||
span.setAttribute("command", command);
|
||||
|
||||
auto result = processRequest(req);
|
||||
|
||||
span.setAttribute("xrpl.rpc.status", result.status());
|
||||
span.setAttribute("rpc_status", result.status());
|
||||
span.setOk();
|
||||
// span ends automatically when it goes out of scope
|
||||
}
|
||||
@@ -841,7 +841,7 @@ ServerHandler::onRequest(
|
||||
? jv["method"].asString()
|
||||
: "unknown";
|
||||
|
||||
span.setAttribute("xrpl.rpc.command", command);
|
||||
span.setAttribute("command", command);
|
||||
|
||||
// Create child span for command execution
|
||||
{
|
||||
@@ -854,7 +854,7 @@ ServerHandler::onRequest(
|
||||
// Record result attributes
|
||||
if (result.isMember("status"))
|
||||
{
|
||||
cmdSpan.setAttribute("xrpl.rpc.status",
|
||||
cmdSpan.setAttribute("rpc_status",
|
||||
result["status"].asString());
|
||||
}
|
||||
|
||||
|
||||
@@ -480,7 +480,7 @@ processors:
|
||||
- name: rpc-spans
|
||||
type: string_attribute
|
||||
string_attribute:
|
||||
key: xrpl.rpc.command
|
||||
key: command
|
||||
values: [".*"]
|
||||
enabled_regex_matching: true
|
||||
- name: latency
|
||||
@@ -738,7 +738,7 @@ providers:
|
||||
"targets": [
|
||||
{
|
||||
"queryType": "traceql",
|
||||
"query": "{resource.service.name=\"xrpld\" && span.xrpl.rpc.command != \"\"} | histogram_over_time(duration) by (span.xrpl.rpc.command)"
|
||||
"query": "{resource.service.name=\"xrpld\" && span.command != \"\"} | histogram_over_time(duration) by (span.command)"
|
||||
}
|
||||
],
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }
|
||||
@@ -750,7 +750,7 @@ providers:
|
||||
"targets": [
|
||||
{
|
||||
"queryType": "traceql",
|
||||
"query": "{resource.service.name=\"xrpld\" && status.code=error} | rate() by (span.xrpl.rpc.command)"
|
||||
"query": "{resource.service.name=\"xrpld\" && status.code=error} | rate() by (span.command)"
|
||||
}
|
||||
],
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }
|
||||
@@ -762,7 +762,7 @@ providers:
|
||||
"targets": [
|
||||
{
|
||||
"queryType": "traceql",
|
||||
"query": "{resource.service.name=\"xrpld\" && span.xrpl.rpc.command != \"\"} | avg(duration) by (span.xrpl.rpc.command) | topk(10)"
|
||||
"query": "{resource.service.name=\"xrpld\" && span.command != \"\"} | avg(duration) by (span.command) | topk(10)"
|
||||
}
|
||||
],
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 8 }
|
||||
|
||||
@@ -302,8 +302,8 @@
|
||||
// Each factory checks the global Telemetry instance internally.
|
||||
// No Telemetry& reference needed at the call site.
|
||||
auto span = telemetry::SpanGuard::rpcSpan("rpc.request");
|
||||
span.setAttribute("xrpl.rpc.command", command);
|
||||
span.setAttribute("xrpl.rpc.status", status);
|
||||
span.setAttribute("command", command);
|
||||
span.setAttribute("rpc_status", status);
|
||||
```
|
||||
|
||||
- Factory methods: `rpcSpan()`, `txSpan()`, `consensusSpan()`, `peerSpan()`, `ledgerSpan()`, `span()`
|
||||
@@ -336,12 +336,12 @@
|
||||
- `#include <xrpl/telemetry/SpanGuard.h>`
|
||||
- In `ServerHandler::onRequest(Session& session)`:
|
||||
- At the top of the method, add: `auto span = telemetry::SpanGuard::rpcSpan("rpc.request");`
|
||||
- After the RPC command name is extracted, set attribute: `span.setAttribute("xrpl.rpc.command", command);`
|
||||
- After the RPC command name is extracted, set attribute: `span.setAttribute("command", command);`
|
||||
- After the response status is known, set: `span.setAttribute("http.status_code", static_cast<int64_t>(statusCode));`
|
||||
- Wrap error paths with: `span.recordException(e);`
|
||||
- In `ServerHandler::processRequest(...)`:
|
||||
- Add a child span: `auto span = telemetry::SpanGuard::rpcSpan("rpc.process");`
|
||||
- Set method attribute: `span.setAttribute("xrpl.rpc.method", request_method);`
|
||||
- Set method attribute: `span.setAttribute("method", request_method);`
|
||||
- In `ServerHandler::onWSMessage(...)` (WebSocket path):
|
||||
- Add: `auto span = telemetry::SpanGuard::rpcSpan("rpc.ws.message");`
|
||||
|
||||
@@ -362,7 +362,7 @@
|
||||
- [01-architecture-analysis.md §1.5](./01-architecture-analysis.md) — RPC request flow diagram: HTTP request -> attributes -> jobqueue.enqueue -> rpc.command -> response
|
||||
- [01-architecture-analysis.md §1.6](./01-architecture-analysis.md) — Key trace points table: `rpc.request` in `ServerHandler.cpp::onRequest()` (Priority: High)
|
||||
- [02-design-decisions.md §2.3](./02-design-decisions.md) — Span naming convention: `rpc.request`, `rpc.command.*`
|
||||
- [02-design-decisions.md §2.4.2](./02-design-decisions.md) — RPC span attributes: `xrpl.rpc.command`, `xrpl.rpc.version`, `xrpl.rpc.role`, `xrpl.rpc.params`
|
||||
- [02-design-decisions.md §2.4.2](./02-design-decisions.md) — RPC span attributes: `command`, `version`, `rpc_role`, `xrpl.rpc.params`
|
||||
- [03-implementation-strategy.md §3.9.2](./03-implementation-strategy.md) — File impact: `ServerHandler.cpp` ~40 lines added, ~10 changed (Low risk)
|
||||
|
||||
---
|
||||
@@ -378,17 +378,17 @@
|
||||
- In `doCommand(RPC::JsonContext& context, Json::Value& result)`:
|
||||
- At the top: `auto span = telemetry::SpanGuard::rpcSpan("rpc.command." + context.method);`
|
||||
- Set attributes:
|
||||
- `span.setAttribute("xrpl.rpc.command", context.method);`
|
||||
- `span.setAttribute("xrpl.rpc.version", static_cast<int64_t>(context.apiVersion));`
|
||||
- `span.setAttribute("xrpl.rpc.role", (context.role == Role::ADMIN) ? "admin" : "user");`
|
||||
- On success: `span.setAttribute("xrpl.rpc.status", "success");`
|
||||
- On error: `span.setAttribute("xrpl.rpc.status", "error");` and set the error message
|
||||
- `span.setAttribute("command", context.method);`
|
||||
- `span.setAttribute("version", static_cast<int64_t>(context.apiVersion));`
|
||||
- `span.setAttribute("rpc_role", (context.role == Role::ADMIN) ? "admin" : "user");`
|
||||
- On success: `span.setAttribute("rpc_status", "success");`
|
||||
- On error: `span.setAttribute("rpc_status", "error");` and set the error message
|
||||
|
||||
- After this, traces in Tempo/Grafana should look like:
|
||||
```
|
||||
rpc.request (xrpl.rpc.command=account_info)
|
||||
rpc.request (command=account_info)
|
||||
└── rpc.process
|
||||
└── rpc.command.account_info (xrpl.rpc.version=2, xrpl.rpc.role=user, xrpl.rpc.status=success)
|
||||
└── rpc.command.account_info (version=2, rpc_role=user, rpc_status=success)
|
||||
```
|
||||
|
||||
**Key modified file**:
|
||||
@@ -399,7 +399,7 @@
|
||||
|
||||
- [04-code-samples.md §4.5.3](./04-code-samples.md) — `ServerHandler::onRequest()` code sample (includes child span pattern for `rpc.command.*`)
|
||||
- [02-design-decisions.md §2.3](./02-design-decisions.md) — Span naming: `rpc.command.*` pattern with dynamic command name (e.g., `rpc.command.server_info`)
|
||||
- [02-design-decisions.md §2.4.2](./02-design-decisions.md) — RPC attribute schema: `xrpl.rpc.command`, `xrpl.rpc.version`, `xrpl.rpc.role`, `xrpl.rpc.status`
|
||||
- [02-design-decisions.md §2.4.2](./02-design-decisions.md) — RPC attribute schema: `command`, `version`, `rpc_role`, `rpc_status`
|
||||
- [01-architecture-analysis.md §1.6](./01-architecture-analysis.md) — Key trace points table: `rpc.command.*` in `RPCHandler.cpp::doCommand()` (Priority: High)
|
||||
- [02-design-decisions.md §2.6.5](./02-design-decisions.md) — Correlation with PerfLog: how `doCommand()` can link trace_id with existing PerfLog entries
|
||||
- [03-implementation-strategy.md §3.4.4](./03-implementation-strategy.md) — RPC request overhead budget: ~1.75 μs total per request
|
||||
@@ -472,7 +472,7 @@
|
||||
- Navigate to Explore → select Tempo datasource
|
||||
- Search for service `xrpld`
|
||||
- Confirm you see traces with spans: `rpc.request` -> `rpc.process` -> `rpc.command.server_info`
|
||||
- Click into a trace and verify attributes: `xrpl.rpc.command`, `xrpl.rpc.status`, `xrpl.rpc.version`
|
||||
- Click into a trace and verify attributes: `command`, `rpc_status`, `version`
|
||||
|
||||
7. **Verify zero-overhead when disabled**:
|
||||
- Rebuild with `XRPL_ENABLE_TELEMETRY=OFF`, or set `enabled=0` in config
|
||||
@@ -486,7 +486,7 @@
|
||||
- [ ] xrpld starts and connects to OTel Collector (check xrpld logs for telemetry messages)
|
||||
- [ ] Traces appear in Grafana/Tempo under service "xrpld"
|
||||
- [ ] Span hierarchy is correct (parent-child relationships)
|
||||
- [ ] Span attributes are populated (`xrpl.rpc.command`, `xrpl.rpc.status`, etc.)
|
||||
- [ ] Span attributes are populated (`command`, `rpc_status`, etc.)
|
||||
- [ ] Error spans show error status and message
|
||||
- [ ] Building with `XRPL_ENABLE_TELEMETRY=OFF` produces no regressions
|
||||
- [ ] Setting `enabled=0` at runtime produces no traces and no errors
|
||||
@@ -572,8 +572,8 @@ The current POC exports **traces only**. Grafana's Explore view can query Tempo
|
||||
explicit:
|
||||
buckets: [1ms, 5ms, 10ms, 25ms, 50ms, 100ms, 250ms, 500ms, 1s, 5s]
|
||||
dimensions:
|
||||
- name: xrpl.rpc.command
|
||||
- name: xrpl.rpc.status
|
||||
- name: command
|
||||
- name: rpc_status
|
||||
|
||||
exporters:
|
||||
prometheus:
|
||||
|
||||
@@ -50,7 +50,7 @@
|
||||
if (telemetry.isEnabled() && telemetry.shouldTraceRpc())
|
||||
{
|
||||
SpanGuard guard(telemetry.startSpan("rpc.command.submit"));
|
||||
guard.setAttribute("xrpl.rpc.command", "submit");
|
||||
guard.setAttribute("command", "submit");
|
||||
// ... guard ends span automatically on scope exit
|
||||
}
|
||||
@endcode
|
||||
|
||||
Reference in New Issue
Block a user