diff --git a/OpenTelemetryPlan/01-architecture-analysis.md b/OpenTelemetryPlan/01-architecture-analysis.md index c62ac3454c..fde1833349 100644 --- a/OpenTelemetryPlan/01-architecture-analysis.md +++ b/OpenTelemetryPlan/01-architecture-analysis.md @@ -247,14 +247,14 @@ flowchart TB subgraph request["rpc.request (root span)"] http["HTTP Request — POST /
traceparent:
00-abc123...-def456...-01"] - attrs["Attributes:
http.method = POST
net.peer.ip = 192.168.1.100
xrpl.rpc.command = submit"] + attrs["Attributes:
http.method = POST
net.peer.ip = 192.168.1.100
command = submit"] subgraph enqueue["jobqueue.enqueue"] job_attr["xrpl.job.type = jtCLIENT_RPC"] end subgraph command["rpc.command.submit"] - cmd_attrs["xrpl.rpc.version = 2
xrpl.rpc.role = user"] + cmd_attrs["version = 2
rpc_role = user"] cmd_children["├── tx.deserialize
├── tx.validate_local
└── tx.submit_to_network"] end @@ -359,7 +359,7 @@ After implementing OpenTelemetry, operators and developers will gain visibility | **Transaction Lifecycle** | Full journey from RPC submission through validation, relay, consensus, and ledger inclusion | `{service.name="xrpld" && xrpl.tx.hash="ABC123..."}` | | **Cross-Node Propagation** | Transaction path across multiple xrpld nodes with timing | `{xrpl.tx.relay_count > 0}` | | **Consensus Rounds** | Complete round with all phases (open, establish, accept) | `{span.name=~"consensus.round.*"}` | -| **RPC Request Processing** | Individual command execution with timing breakdown | `{xrpl.rpc.command="account_info"}` | +| **RPC Request Processing** | Individual command execution with timing breakdown | `{command="account_info"}` | | **Ledger Acquisition** | Peer-to-peer ledger data requests and responses | `{span.name="ledger.acquire"}` | | **PathFinding Latency** | Path computation time and cache effectiveness for payment RPCs | `{span.name="pathfind.compute"}` | | **TxQ Behavior** | Queue depth, eviction patterns, fee escalation during congestion | `{span.name=~"txq.*"}` | @@ -458,7 +458,7 @@ xychart-beta 1. **Find Transaction**: Query by `xrpl.tx.hash` to get full trace 2. **Identify Bottleneck**: Look at span durations to find slowest component -3. **Check Attributes**: Review `xrpl.tx.validity`, `xrpl.rpc.status` for errors +3. **Check Attributes**: Review `xrpl.tx.validity`, `rpc_status` for errors 4. **Correlate Logs**: Use `trace_id` to find related PerfLog entries 5. 
**Compare Nodes**: Filter by `service.instance.id` to compare behavior across nodes diff --git a/OpenTelemetryPlan/02-design-decisions.md b/OpenTelemetryPlan/02-design-decisions.md index 5d68278629..681381ace5 100644 --- a/OpenTelemetryPlan/02-design-decisions.md +++ b/OpenTelemetryPlan/02-design-decisions.md @@ -260,10 +260,10 @@ resource::SemanticConventions::SERVICE_INSTANCE_ID = #### RPC Attributes ```cpp -"xrpl.rpc.command" = string // Command name -"xrpl.rpc.version" = int64 // API version -"xrpl.rpc.role" = string // "admin" or "user" -"xrpl.rpc.params" = string // Sanitized parameters (optional) +"command" = string // Command name +"version" = int64 // API version +"rpc_role" = string // "admin" or "user" +"xrpl.rpc.params" = string // Sanitized parameters (optional, planned) ``` #### Peer & Message Attributes @@ -293,10 +293,10 @@ resource::SemanticConventions::SERVICE_INSTANCE_ID = #### PathFinding Attributes ```cpp -"xrpl.pathfind.source_currency" = string // Source currency code -"xrpl.pathfind.dest_currency" = string // Destination currency code -"xrpl.pathfind.path_count" = int64 // Number of paths found -"xrpl.pathfind.cache_hit" = bool // RippleLineCache hit +"source_currency" = string // Source currency code (planned, not yet implemented) +"dest_currency" = string // Destination currency code (planned, not yet implemented) +"path_count" = int64 // Number of paths found (planned, not yet implemented) +"cache_hit" = bool // RippleLineCache hit (planned, not yet implemented) ``` #### TxQ Attributes diff --git a/OpenTelemetryPlan/03-implementation-strategy.md b/OpenTelemetryPlan/03-implementation-strategy.md index 9a4baf7131..61e522719b 100644 --- a/OpenTelemetryPlan/03-implementation-strategy.md +++ b/OpenTelemetryPlan/03-implementation-strategy.md @@ -490,11 +490,11 @@ void ServerHandler::onRequest(...) { // After (only ~4 lines added) void ServerHandler::onRequest(...) 
{ auto span = telemetry::SpanGuard::rpcSpan("rpc.request"); // +1 line - span.setAttribute("xrpl.rpc.command", command); // +1 line + span.setAttribute("command", command); // +1 line auto result = processRequest(req); - span.setAttribute("xrpl.rpc.status", status); // +1 line + span.setAttribute("rpc_status", status); // +1 line send(result); } ``` diff --git a/OpenTelemetryPlan/04-code-samples.md b/OpenTelemetryPlan/04-code-samples.md index 9a637c0c05..d4d5c0bdc0 100644 --- a/OpenTelemetryPlan/04-code-samples.md +++ b/OpenTelemetryPlan/04-code-samples.md @@ -346,11 +346,11 @@ void ServerHandler::onRequest(...) // Factory creates a span if RPC tracing is enabled, no-op otherwise. // No Telemetry& reference needed -- accessed via global singleton. auto span = telemetry::SpanGuard::rpcSpan("rpc.request"); - span.setAttribute("xrpl.rpc.command", command); + span.setAttribute("command", command); auto result = processRequest(req); - span.setAttribute("xrpl.rpc.status", result.status()); + span.setAttribute("rpc_status", result.status()); span.setOk(); // span ended automatically when it goes out of scope } @@ -841,7 +841,7 @@ ServerHandler::onRequest( ? 
jv["method"].asString() : "unknown"; - span.setAttribute("xrpl.rpc.command", command); + span.setAttribute("command", command); // Create child span for command execution { @@ -854,7 +854,7 @@ ServerHandler::onRequest( // Record result attributes if (result.isMember("status")) { - cmdSpan.setAttribute("xrpl.rpc.status", + cmdSpan.setAttribute("rpc_status", result["status"].asString()); } diff --git a/OpenTelemetryPlan/05-configuration-reference.md b/OpenTelemetryPlan/05-configuration-reference.md index bdb0b0bb22..70df0f5b95 100644 --- a/OpenTelemetryPlan/05-configuration-reference.md +++ b/OpenTelemetryPlan/05-configuration-reference.md @@ -490,7 +490,7 @@ processors: - name: rpc-spans type: string_attribute string_attribute: - key: xrpl.rpc.command + key: command values: [".*"] enabled_regex_matching: true - name: latency @@ -748,7 +748,7 @@ providers: "targets": [ { "queryType": "traceql", - "query": "{resource.service.name=\"xrpld\" && span.xrpl.rpc.command != \"\"} | histogram_over_time(duration) by (span.xrpl.rpc.command)" + "query": "{resource.service.name=\"xrpld\" && span.command != \"\"} | histogram_over_time(duration) by (span.command)" } ], "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 } @@ -760,7 +760,7 @@ providers: "targets": [ { "queryType": "traceql", - "query": "{resource.service.name=\"xrpld\" && status.code=error} | rate() by (span.xrpl.rpc.command)" + "query": "{resource.service.name=\"xrpld\" && status.code=error} | rate() by (span.command)" } ], "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 } @@ -772,7 +772,7 @@ providers: "targets": [ { "queryType": "traceql", - "query": "{resource.service.name=\"xrpld\" && span.xrpl.rpc.command != \"\"} | avg(duration) by (span.xrpl.rpc.command) | topk(10)" + "query": "{resource.service.name=\"xrpld\" && span.command != \"\"} | avg(duration) by (span.command) | topk(10)" } ], "gridPos": { "h": 8, "w": 24, "x": 0, "y": 8 } diff --git a/OpenTelemetryPlan/POC_taskList.md b/OpenTelemetryPlan/POC_taskList.md 
index 8cd390ef5b..5f93886200 100644 --- a/OpenTelemetryPlan/POC_taskList.md +++ b/OpenTelemetryPlan/POC_taskList.md @@ -302,8 +302,8 @@ // Each factory checks the global Telemetry instance internally. // No Telemetry& reference needed at the call site. auto span = telemetry::SpanGuard::rpcSpan("rpc.request"); - span.setAttribute("xrpl.rpc.command", command); - span.setAttribute("xrpl.rpc.status", status); + span.setAttribute("command", command); + span.setAttribute("rpc_status", status); ``` - Factory methods: `rpcSpan()`, `txSpan()`, `consensusSpan()`, `peerSpan()`, `ledgerSpan()`, `span()` @@ -336,12 +336,12 @@ - `#include ` - In `ServerHandler::onRequest(Session& session)`: - At the top of the method, add: `auto span = telemetry::SpanGuard::rpcSpan("rpc.request");` - - After the RPC command name is extracted, set attribute: `span.setAttribute("xrpl.rpc.command", command);` + - After the RPC command name is extracted, set attribute: `span.setAttribute("command", command);` - After the response status is known, set: `span.setAttribute("http.status_code", static_cast(statusCode));` - Wrap error paths with: `span.recordException(e);` - In `ServerHandler::processRequest(...)`: - Add a child span: `auto span = telemetry::SpanGuard::rpcSpan("rpc.process");` - - Set method attribute: `span.setAttribute("xrpl.rpc.method", request_method);` + - Set method attribute: `span.setAttribute("method", request_method);` - In `ServerHandler::onWSMessage(...)` (WebSocket path): - Add: `auto span = telemetry::SpanGuard::rpcSpan("rpc.ws.message");` @@ -362,7 +362,7 @@ - [01-architecture-analysis.md §1.5](./01-architecture-analysis.md) — RPC request flow diagram: HTTP request -> attributes -> jobqueue.enqueue -> rpc.command -> response - [01-architecture-analysis.md §1.6](./01-architecture-analysis.md) — Key trace points table: `rpc.request` in `ServerHandler.cpp::onRequest()` (Priority: High) - [02-design-decisions.md §2.3](./02-design-decisions.md) — Span naming convention: 
`rpc.request`, `rpc.command.*` -- [02-design-decisions.md §2.4.2](./02-design-decisions.md) — RPC span attributes: `xrpl.rpc.command`, `xrpl.rpc.version`, `xrpl.rpc.role`, `xrpl.rpc.params` +- [02-design-decisions.md §2.4.2](./02-design-decisions.md) — RPC span attributes: `command`, `version`, `rpc_role`, `xrpl.rpc.params` - [03-implementation-strategy.md §3.9.2](./03-implementation-strategy.md) — File impact: `ServerHandler.cpp` ~40 lines added, ~10 changed (Low risk) --- @@ -378,17 +378,17 @@ - In `doCommand(RPC::JsonContext& context, Json::Value& result)`: - At the top: `auto span = telemetry::SpanGuard::rpcSpan("rpc.command." + context.method);` - Set attributes: - - `span.setAttribute("xrpl.rpc.command", context.method);` - - `span.setAttribute("xrpl.rpc.version", static_cast<int64_t>(context.apiVersion));` - - `span.setAttribute("xrpl.rpc.role", (context.role == Role::ADMIN) ? "admin" : "user");` - - On success: `span.setAttribute("xrpl.rpc.status", "success");` - - On error: `span.setAttribute("xrpl.rpc.status", "error");` and set the error message + - `span.setAttribute("command", context.method);` + - `span.setAttribute("version", static_cast<int64_t>(context.apiVersion));` + - `span.setAttribute("rpc_role", (context.role == Role::ADMIN) ? 
"admin" : "user");` + - On success: `span.setAttribute("rpc_status", "success");` + - On error: `span.setAttribute("rpc_status", "error");` and set the error message - After this, traces in Tempo/Grafana should look like: ``` - rpc.request (xrpl.rpc.command=account_info) + rpc.request (command=account_info) └── rpc.process - └── rpc.command.account_info (xrpl.rpc.version=2, xrpl.rpc.role=user, xrpl.rpc.status=success) + └── rpc.command.account_info (version=2, rpc_role=user, rpc_status=success) ``` **Key modified file**: @@ -399,7 +399,7 @@ - [04-code-samples.md §4.5.3](./04-code-samples.md) — `ServerHandler::onRequest()` code sample (includes child span pattern for `rpc.command.*`) - [02-design-decisions.md §2.3](./02-design-decisions.md) — Span naming: `rpc.command.*` pattern with dynamic command name (e.g., `rpc.command.server_info`) -- [02-design-decisions.md §2.4.2](./02-design-decisions.md) — RPC attribute schema: `xrpl.rpc.command`, `xrpl.rpc.version`, `xrpl.rpc.role`, `xrpl.rpc.status` +- [02-design-decisions.md §2.4.2](./02-design-decisions.md) — RPC attribute schema: `command`, `version`, `rpc_role`, `rpc_status` - [01-architecture-analysis.md §1.6](./01-architecture-analysis.md) — Key trace points table: `rpc.command.*` in `RPCHandler.cpp::doCommand()` (Priority: High) - [02-design-decisions.md §2.6.5](./02-design-decisions.md) — Correlation with PerfLog: how `doCommand()` can link trace_id with existing PerfLog entries - [03-implementation-strategy.md §3.4.4](./03-implementation-strategy.md) — RPC request overhead budget: ~1.75 μs total per request @@ -472,7 +472,7 @@ - Navigate to Explore → select Tempo datasource - Search for service `xrpld` - Confirm you see traces with spans: `rpc.request` -> `rpc.process` -> `rpc.command.server_info` - - Click into a trace and verify attributes: `xrpl.rpc.command`, `xrpl.rpc.status`, `xrpl.rpc.version` + - Click into a trace and verify attributes: `command`, `rpc_status`, `version` 7. 
**Verify zero-overhead when disabled**: - Rebuild with `XRPL_ENABLE_TELEMETRY=OFF`, or set `enabled=0` in config @@ -486,7 +486,7 @@ - [ ] xrpld starts and connects to OTel Collector (check xrpld logs for telemetry messages) - [ ] Traces appear in Grafana/Tempo under service "xrpld" - [ ] Span hierarchy is correct (parent-child relationships) -- [ ] Span attributes are populated (`xrpl.rpc.command`, `xrpl.rpc.status`, etc.) +- [ ] Span attributes are populated (`command`, `rpc_status`, etc.) - [ ] Error spans show error status and message - [ ] Building with `XRPL_ENABLE_TELEMETRY=OFF` produces no regressions - [ ] Setting `enabled=0` at runtime produces no traces and no errors @@ -572,8 +572,8 @@ The current POC exports **traces only**. Grafana's Explore view can query Tempo explicit: buckets: [1ms, 5ms, 10ms, 25ms, 50ms, 100ms, 250ms, 500ms, 1s, 5s] dimensions: - - name: xrpl.rpc.command - - name: xrpl.rpc.status + - name: command + - name: rpc_status exporters: prometheus: diff --git a/OpenTelemetryPlan/Phase2_taskList.md b/OpenTelemetryPlan/Phase2_taskList.md index 249be880ff..6979f78869 100644 --- a/OpenTelemetryPlan/Phase2_taskList.md +++ b/OpenTelemetryPlan/Phase2_taskList.md @@ -91,7 +91,7 @@ - `http.method` is always POST for JSON-RPC - `net.peer.ip` is debug-level info available in logs -- `xrpl.rpc.duration_ms` is redundant with span duration (OTel captures start/end time natively) +- `duration_ms` is redundant with span duration (OTel captures start/end time natively) These can be added later if dashboard queries specifically need them. The node health attributes (Task 2.8) provide far more operational value and were prioritized instead. @@ -130,9 +130,8 @@ These can be added later if dashboard queries specifically need them. 
The node h **What to do**: - Edit `src/xrpld/rpc/detail/RPCHandler.cpp`: - - In the `rpc.command.*` span creation block (after existing `setAttribute` calls for `xrpl.rpc.command`, `xrpl.rpc.version`, etc.): - - Add `xrpl.node.amendment_blocked` (bool) — from `context.app.getOPs().isAmendmentBlocked()` - - Add `xrpl.node.server_state` (string) — from `context.app.getOPs().strOperatingMode()` + - In the `rpc.command.*` span creation block (after existing `setAttribute` calls for `command`, `version`, etc.): + - Node health attributes (`xrpl.node.amendment_blocked`, `xrpl.node.server_state`) are now resource-level attributes, not per-span attributes. They are set at Tracer initialization. **New span attributes**: diff --git a/OpenTelemetryPlan/Phase3_taskList.md b/OpenTelemetryPlan/Phase3_taskList.md index 18146dff02..9c5127120f 100644 --- a/OpenTelemetryPlan/Phase3_taskList.md +++ b/OpenTelemetryPlan/Phase3_taskList.md @@ -89,13 +89,13 @@ - In `onMessage(TMTransaction)` / `handleTransaction()`: - Extract parent trace context from incoming `TMTransaction::trace_context` field (if present) - Create `tx.receive` span as child of extracted context (or new root if none) - - Set attributes: `xrpl.tx.hash`, `xrpl.peer.id`, `xrpl.tx.status` - - On HashRouter suppression (duplicate): set `xrpl.tx.suppressed=true`, add `tx.duplicate` event + - Set attributes: `xrpl.tx.hash`, `xrpl.peer.id`, `tx_status` + - On HashRouter suppression (duplicate): set `suppressed=true`, add `tx.duplicate` event - Wrap validation call with child span `tx.validate` - Wrap relay with `tx.relay` span - When relaying to peers: - Inject current trace context into outgoing `TMTransaction::trace_context` - - Set `xrpl.tx.relay_count` attribute + - Set `relay_count` attribute - Use `SpanGuard::span(TraceCategory::Transactions, "tx", "receive")` factory (Phase 1c replaced macros with the SpanGuard factory pattern) @@ -121,7 +121,7 @@ - Edit `src/xrpld/app/misc/NetworkOPs.cpp`: - In `processTransaction()`: - Create `tx.process` span - - 
Set attributes: `xrpl.tx.hash`, `xrpl.tx.type`, `xrpl.tx.local` (whether from RPC or peer) + - Set attributes: `xrpl.tx.hash`, `tx_type`, `local` (whether from RPC or peer) - Record whether sync or async path is taken - In `doTransactionAsync()`: @@ -152,8 +152,8 @@ - Edit `src/xrpld/overlay/detail/PeerImp.cpp` (in handleTransaction): - After calling `HashRouter::shouldProcess()` or `addSuppressionPeer()`: - - Record `xrpl.tx.suppressed` attribute (true/false) - - Record `xrpl.tx.flags` showing current HashRouter state (SAVED, TRUSTED, etc.) + - Record `suppressed` attribute (true/false) + - Record `tx_flags` showing current HashRouter state (SAVED, TRUSTED, etc.) - Add `tx.first_seen` or `tx.duplicate` event - This is NOT a modification to HashRouter itself — just recording its decisions as span attributes in the existing PeerImp instrumentation from Task 3.3. @@ -257,14 +257,14 @@ - Edit `src/xrpld/overlay/detail/PeerImp.cpp`: - In the `tx.receive` span block (after existing `xrpl.peer.id` setAttribute call): - - Add `xrpl.peer.version` (string) — from `this->getVersion()` + - Add `peer_version` (string) — from `this->getVersion()` - Only set if `getVersion()` returns a non-empty string (avoid empty-string attributes) **New span attribute**: -| Attribute | Type | Source | Example | -| ------------------- | ------ | -------------------- | --------------- | -| `xrpl.peer.version` | string | `peer->getVersion()` | `"xrpld-2.4.0"` | +| Attribute | Type | Source | Example | +| -------------- | ------ | -------------------- | --------------- | +| `peer_version` | string | `peer->getVersion()` | `"xrpld-2.4.0"` | **Rationale**: Transaction relay is where version mismatches cause subtle serialization or validation bugs. Tracing "this tx came from a v2.3.0 peer" helps diagnose compatibility issues. The community dashboard tracks peer versions externally; this brings version awareness into the trace itself. 
@@ -274,7 +274,7 @@ **Exit Criteria**: -- [ ] `tx.receive` spans carry `xrpl.peer.version` attribute with a non-empty version string +- [ ] `tx.receive` spans carry `peer_version` attribute with a non-empty version string - [ ] Attribute is omitted (not set to empty string) when `getVersion()` returns empty - [ ] Attribute visible in Jaeger span detail view @@ -387,8 +387,8 @@ This gives the best of both worlds: guaranteed cross-node correlation via determ - No protobuf context to extract here (NetworkOPs is intra-node), so deterministic context alone is sufficient. -- Add `tx_trace_strategy` attribute to spans: - - Add `inline constexpr auto traceStrategy = join(xrplTx, makeStr("trace_strategy"));` +- Add `trace_strategy` attribute to spans: + - Add `inline constexpr auto traceStrategy = "trace_strategy";` to `TxSpanNames.h`. - Set on each tx span: `span.setAttribute(tx_span::attr::traceStrategy, "deterministic")`. @@ -419,7 +419,7 @@ This gives the best of both worlds: guaranteed cross-node correlation via determ - [ ] All nodes handling the same transaction produce spans under the same trace_id - [x] Protobuf `span_id` propagation still works when available (parent-child ordering) - [ ] Missing protobuf context (old peer) degrades gracefully to sibling spans, not lost traces -- [ ] `xrpl.tx.trace_strategy` attribute set to `"deterministic"` on all tx spans +- [ ] `trace_strategy` attribute set to `"deterministic"` on all tx spans - [ ] Trace queryable by tx hash (truncate hash → trace_id → direct lookup in Tempo) **Deliverables implemented (not in original plan)**: diff --git a/OpenTelemetryPlan/Phase4_taskList.md b/OpenTelemetryPlan/Phase4_taskList.md index 1670e9b57e..6d084c5934 100644 --- a/OpenTelemetryPlan/Phase4_taskList.md +++ b/OpenTelemetryPlan/Phase4_taskList.md @@ -27,8 +27,8 @@ - `RCLConsensus::Adaptor::startRoundTracing()` creates `consensus.round` span via `SpanGuard::hashSpan()` (deterministic) or `SpanGuard::span()` (attribute strategy) -- 
Attributes set: `xrpl.consensus.ledger_id`, `xrpl.consensus.ledger.seq`, - `xrpl.consensus.mode`, `xrpl.consensus.trace_strategy`, `xrpl.consensus.round_id` +- Attributes set: `xrpl.consensus.ledger_id`, `xrpl.ledger.seq`, + `xrpl.consensus.mode`, `trace_strategy`, `xrpl.consensus.round_id` - Round span stored as `roundSpan_` member in `RCLConsensus::Adaptor` - `roundSpanContext_` snapshot captured for cross-thread span linking @@ -57,9 +57,9 @@ **Design notes**: -- `xrpl.consensus.phase` attribute — phases are distinguished by span names instead +- `phase` attribute — phases are distinguished by span names instead - `phase.enter` / `phase.exit` events — not added (span start/end serves this purpose) -- `xrpl.consensus.phase_duration_ms` attribute — not set (span duration captures this) +- `phase_duration_ms` attribute — not set (span duration captures this) **Key modified files**: @@ -82,11 +82,11 @@ - In `Adaptor::propose()`: - Creates `consensus.proposal.send` span via `SpanGuard::span()` - - Sets `xrpl.consensus.round` attribute + - Sets `xrpl.consensus.round` attribute (kept — rule 5) - In `PeerImp::onMessage(TMProposeSet)`: - Creates `consensus.proposal.receive` span - - Sets `xrpl.consensus.proposal.trusted` attribute (bool) + - Sets `trusted` attribute (bool) **Not implemented** (deferred to Phase 4b — cross-node propagation): @@ -117,12 +117,12 @@ - Uses `SpanGuard::linkedSpan()` to create a follows-from link to the round span - Thread-safe: uses `roundSpanContext_` snapshot (captured on consensus thread, read on jtACCEPT thread) - - Sets `xrpl.consensus.ledger.seq` and `xrpl.consensus.proposing` attributes + - Sets `xrpl.ledger.seq` and `proposing` attributes - In `PeerImp::onMessage(TMValidation)`: - Creates `consensus.validation.receive` span - - Sets `xrpl.consensus.validation.trusted` attribute (bool) - - Sets `xrpl.consensus.validation.ledger_seq` attribute + - Sets `trusted` attribute (bool) + - Sets `xrpl.ledger.seq` attribute **Not implemented** 
(deferred to Phase 4b — cross-node propagation): @@ -142,18 +142,18 @@ **Implemented attributes** (across various spans): -- `xrpl.consensus.ledger.seq` — on `consensus.round`, `consensus.accept.apply` +- `xrpl.ledger.seq` — on `consensus.round`, `consensus.accept.apply` - `xrpl.consensus.round` — on `consensus.proposal.send` - `xrpl.consensus.mode` — on `consensus.round`, `consensus.ledger_close` -- `xrpl.consensus.proposers` — on `consensus.accept`, `consensus.establish`, `consensus.update_positions` -- `xrpl.consensus.converge_percent` — on `consensus.establish`, `consensus.update_positions`, `consensus.check` -- `xrpl.consensus.tx_count` — on `consensus.accept.apply` span (in `doAccept()`) -- `xrpl.consensus.disputes_count` — on `consensus.update_positions` span (in `updateOurPositions()`) +- `proposers` — on `consensus.accept`, `consensus.establish`, `consensus.update_positions` +- `converge_percent` — on `consensus.establish`, `consensus.update_positions`, `consensus.check` +- `tx_count` — on `consensus.accept.apply` span (in `doAccept()`) +- `disputes_count` — on `consensus.update_positions` span (in `updateOurPositions()`) **Design notes**: -- `xrpl.consensus.phase` — phases distinguished by span names instead -- `xrpl.consensus.phase_duration_ms` — span duration captures this +- `phase` — phases distinguished by span names instead +- `phase_duration_ms` — span duration captures this **Key modified files**: @@ -221,8 +221,8 @@ - Add `xrpl.validation.ledger_hash` (string) — the ledger hash being validated - Add `xrpl.validation.full` (bool) — whether this is a full validation (not partial) - On the `consensus.accept` span (in `onAccept()`): - - Add `xrpl.consensus.validation_quorum` (int64) — from `app_.validators().quorum()` - - Add `xrpl.consensus.proposers_validated` (int64) — from `result.proposers` + - Add `validation_quorum` (int64) — from `app_.validators().quorum()` + - Add `proposers_validated` (int64) — from `result.proposers` - Edit 
`src/xrpld/overlay/detail/PeerImp.cpp`: - On the `peer.validation.receive` span: @@ -231,14 +231,14 @@ **New span attributes**: -| Span | Attribute | Type | Source | -| --------------------------- | ------------------------------------ | ------ | --------------------------------- | -| `consensus.validation.send` | `xrpl.validation.ledger_hash` | string | Ledger hash from validate() args | -| `consensus.validation.send` | `xrpl.validation.full` | bool | Full vs partial validation | -| `peer.validation.receive` | `xrpl.peer.validation.ledger_hash` | string | From STValidation deserialization | -| `peer.validation.receive` | `xrpl.peer.validation.full` | bool | From STValidation flags | -| `consensus.accept` | `xrpl.consensus.validation_quorum` | int64 | `app_.validators().quorum()` | -| `consensus.accept` | `xrpl.consensus.proposers_validated` | int64 | `result.proposers` | +| Span | Attribute | Type | Source | +| --------------------------- | ---------------------------------- | ------ | --------------------------------- | +| `consensus.validation.send` | `xrpl.validation.ledger_hash` | string | Ledger hash from validate() args | +| `consensus.validation.send` | `xrpl.validation.full` | bool | Full vs partial validation | +| `peer.validation.receive` | `xrpl.peer.validation.ledger_hash` | string | From STValidation deserialization | +| `peer.validation.receive` | `xrpl.peer.validation.full` | bool | From STValidation flags | +| `consensus.accept` | `validation_quorum` | int64 | `app_.validators().quorum()` | +| `consensus.accept` | `proposers_validated` | int64 | `result.proposers` | **Rationale**: The external dashboard's most valuable feature is validation agreement tracking. By recording the ledger hash on both outgoing and incoming validation spans, we create the raw data for agreement analysis at the trace level. 
Example Tempo query: @@ -257,7 +257,7 @@ Phase 7's `ValidationTracker` builds metric-level aggregation (1h/24h agreement - [ ] `consensus.validation.send` spans carry `xrpl.validation.ledger_hash` and `xrpl.validation.full` - [ ] `peer.validation.receive` spans carry `xrpl.peer.validation.ledger_hash` and `xrpl.peer.validation.full` -- [ ] `consensus.accept` spans carry `xrpl.consensus.validation_quorum` and `xrpl.consensus.proposers_validated` +- [ ] `consensus.accept` spans carry `validation_quorum` and `proposers_validated` - [ ] Ledger hash attributes match between send and receive for the same ledger - [ ] No impact on consensus performance @@ -283,26 +283,26 @@ Phase 7's `ValidationTracker` builds metric-level aggregation (1h/24h agreement | Span Name | Method | Key Attributes | | --------------------------- | ---------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `consensus.proposal.send` | `Adaptor::propose` | `xrpl.consensus.round` | -| `consensus.ledger_close` | `Adaptor::onClose` | `xrpl.consensus.ledger.seq`, `xrpl.consensus.mode` | -| `consensus.accept` | `Adaptor::onAccept` | `xrpl.consensus.proposers`, `xrpl.consensus.round_time_ms` | -| `consensus.accept.apply` | `Adaptor::doAccept` | `xrpl.consensus.close_time`, `close_time_correct`, `close_resolution_ms`, `state`, `proposing`, `round_time_ms`, `ledger.seq`, `parent_close_time`, `close_time_self`, `close_time_vote_bins`, `resolution_direction` | -| `consensus.validation.send` | `Adaptor::onAccept` (via validate) | `xrpl.consensus.proposing` | +| `consensus.ledger_close` | `Adaptor::onClose` | `xrpl.ledger.seq`, `xrpl.consensus.mode` | +| `consensus.accept` | `Adaptor::onAccept` | `proposers`, `round_time_ms` | +| `consensus.accept.apply` | `Adaptor::doAccept` | `close_time`, `close_time_correct`, 
`close_resolution_ms`, `consensus_state`, `proposing`, `round_time_ms`, `xrpl.ledger.seq`, `parent_close_time`, `close_time_self`, `close_time_vote_bins`, `resolution_direction` | +| `consensus.validation.send` | `Adaptor::onAccept` (via validate) | `proposing` | #### Close Time Attributes (consensus.accept.apply) The `consensus.accept.apply` span captures ledger close time agreement details driven by `avCT_CONSENSUS_PCT` (75% validator agreement threshold): -- **`xrpl.consensus.close_time`** — Agreed-upon ledger close time (epoch seconds). When validators disagree (`consensusCloseTime == epoch`), this is synthetically set to `prevCloseTime + 1s`. -- **`xrpl.consensus.close_time_correct`** — `true` if validators reached agreement, `false` if they "agreed to disagree" (close time forced to prev+1s). -- **`xrpl.consensus.close_resolution_ms`** — Rounding granularity for close time (starts at 30s, decreases as ledger interval stabilizes). -- **`xrpl.consensus.state`** — `"finished"` (normal) or `"moved_on"` (consensus failed, adopted best available). -- **`xrpl.consensus.proposing`** — Whether this node was proposing. -- **`xrpl.consensus.round_time_ms`** — Total consensus round duration. -- **`xrpl.consensus.parent_close_time`** — Previous ledger's close time (epoch seconds). Enables computing close-time deltas across consecutive rounds without correlating separate spans. -- **`xrpl.consensus.close_time_self`** — This node's own proposed close time before consensus voting. -- **`xrpl.consensus.close_time_vote_bins`** — Number of distinct close-time vote bins from peer proposals. Higher values indicate less agreement among validators. -- **`xrpl.consensus.resolution_direction`** — Whether close-time resolution `"increased"` (coarser), `"decreased"` (finer), or stayed `"unchanged"` relative to the previous ledger. +- **`close_time`** — Agreed-upon ledger close time (epoch seconds). 
When validators disagree (`consensusCloseTime == epoch`), this is synthetically set to `prevCloseTime + 1s`. +- **`close_time_correct`** — `true` if validators reached agreement, `false` if they "agreed to disagree" (close time forced to prev+1s). +- **`close_resolution_ms`** — Rounding granularity for close time (starts at 30s, decreases as ledger interval stabilizes). +- **`consensus_state`** — `"finished"` (normal) or `"moved_on"` (consensus failed, adopted best available). +- **`proposing`** — Whether this node was proposing. +- **`round_time_ms`** — Total consensus round duration. +- **`parent_close_time`** — Previous ledger's close time (epoch seconds). Enables computing close-time deltas across consecutive rounds without correlating separate spans. +- **`close_time_self`** — This node's own proposed close time before consensus voting. +- **`close_time_vote_bins`** — Number of distinct close-time vote bins from peer proposals. Higher values indicate less agreement among validators. +- **`resolution_direction`** — Whether close-time resolution `"increased"` (coarser), `"decreased"` (finer), or stayed `"unchanged"` relative to the previous ledger. **Exit Criteria** (from [06-implementation-phases.md §6.11.4](./06-implementation-phases.md)): @@ -504,7 +504,7 @@ spans in `Consensus.h`. - Reads `consensus_trace_strategy` via `app_.getTelemetry().getConsensusTraceStrategy()` - **Deterministic**: uses `SpanGuard::hashSpan()` with `prevLgr.id()` data - **Attribute**: uses `SpanGuard::span(TraceCategory::Consensus, seg::consensus, "round")` - - Sets attributes: `ledger_id`, `ledger.seq`, `mode`, `trace_strategy`, `round_id` + - Sets attributes: `xrpl.consensus.ledger_id`, `xrpl.ledger.seq`, `xrpl.consensus.mode`, `trace_strategy`, `xrpl.consensus.round_id` - Captures `roundSpanContext_` snapshot for cross-thread span linking - Saves `prevRoundContext_` from previous round for follows-from links @@ -585,9 +585,9 @@ with attributes for convergence progress. 
`SpanGuard::span()` returns a no-op guard when telemetry is disabled. - `updateEstablishTracing()` — sets attributes on each `phaseEstablish()` call: - - `xrpl.consensus.converge_percent` — `convergePercent_` - - `xrpl.consensus.establish_count` — `establishCounter_` - - `xrpl.consensus.proposers` — `currPeerPositions_.size()` + - `converge_percent` — `convergePercent_` + - `establish_count` — `establishCounter_` + - `proposers` — `currPeerPositions_.size()` - `endEstablishTracing()` — calls `establishSpan_.reset()` on phase exit. @@ -614,11 +614,11 @@ details. ``` - Attributes set: - - `xrpl.consensus.converge_percent` — current convergence - - `xrpl.consensus.proposers` — `currPeerPositions_.size()` - - `xrpl.consensus.have_close_time_consensus` — close time consensus state - - `xrpl.consensus.close_time_threshold` — `avCT_CONSENSUS_PCT` - - `xrpl.consensus.disputes_count` — number of active disputes + - `converge_percent` — current convergence + - `proposers` — `currPeerPositions_.size()` + - `have_close_time_consensus` — close time consensus state + - `close_time_threshold` — `avCT_CONSENSUS_PCT` + - `disputes_count` — number of active disputes - Dispute events recorded via direct `span.addEvent()` call with yays/nays: ```cpp @@ -632,7 +632,7 @@ details. **Not implemented**: -- `xrpl.consensus.proposers_agreed` / `xrpl.consensus.proposers_total` attributes — not set +- `proposers_agreed` / `proposers_total` attributes — not set **Key modified files**: @@ -658,13 +658,13 @@ including the avalanche threshold. 
``` - Attributes set: - - `xrpl.consensus.agree_count` — peers that agree with our position - - `xrpl.consensus.disagree_count` — peers that disagree - - `xrpl.consensus.converge_percent` — convergence percentage - - `xrpl.consensus.have_close_time_consensus` — close time consensus state - - `xrpl.consensus.threshold_percent` — set to `avCT_CONSENSUS_PCT` (75%) - - `xrpl.consensus.result` — "yes", "no", or "moved_on" - - `xrpl.consensus.avalanche_threshold` — the escalated weight from `getNeededWeight()` on the `consensus.update_positions` span + - `agree_count` — peers that agree with our position + - `disagree_count` — peers that disagree + - `converge_percent` — convergence percentage + - `have_close_time_consensus` — close time consensus state + - `threshold_percent` — set to `avCT_CONSENSUS_PCT` (75%) + - `consensus_result` — "yes", "no", or "moved_on" + - `avalanche_threshold` — the escalated weight from `getNeededWeight()` on the `consensus.update_positions` span **Key modified files**: @@ -687,8 +687,8 @@ wrongLedger, switchedLedger). ```cpp auto span = telemetry::SpanGuard::span( telemetry::TraceCategory::Consensus, telemetry::seg::consensus, "mode_change"); - span.setAttribute(cons_span::attr::modeOld, to_string(before).c_str()); - span.setAttribute(cons_span::attr::modeNew, to_string(after).c_str()); + span.setAttribute(cons_span::attr::modeOld, to_string(before).c_str()); // "mode_old" + span.setAttribute(cons_span::attr::modeNew, to_string(after).c_str()); // "mode_new" ``` - `MonitoredMode::set()` in `Consensus.h` calls `adaptor_.onModeChange(before, after)`. @@ -773,48 +773,48 @@ and OFF, and don't affect consensus timing. 
| Span Name | Location | Key Attributes (actually set) | | ---------------------------- | ------------------ | ----------------------------------------------------------------------------------------------------------------------------- | -| `consensus.round` | `RCLConsensus.cpp` | `round_id`, `ledger_id`, `ledger.seq`, `mode`, `trace_strategy` | +| `consensus.round` | `RCLConsensus.cpp` | `xrpl.consensus.round_id`, `xrpl.consensus.ledger_id`, `xrpl.ledger.seq`, `xrpl.consensus.mode`, `trace_strategy` | | `consensus.establish` | `Consensus.h` | `converge_percent`, `establish_count`, `proposers` | | `consensus.update_positions` | `Consensus.h` | `converge_percent`, `proposers`, `have_close_time_consensus`, `close_time_threshold`, `disputes_count`, `avalanche_threshold` | -| `consensus.check` | `Consensus.h` | `agree_count`, `disagree_count`, `converge_percent`, `have_close_time_consensus`, `threshold_percent`, `result` | -| `consensus.mode_change` | `RCLConsensus.cpp` | `mode.old`, `mode.new` | +| `consensus.check` | `Consensus.h` | `agree_count`, `disagree_count`, `converge_percent`, `have_close_time_consensus`, `threshold_percent`, `consensus_result` | +| `consensus.mode_change` | `RCLConsensus.cpp` | `mode_old`, `mode_new` | ### New Events (Phase 4a) -| Event Name | Parent Span | Attributes (actually set) | -| ----------------- | ---------------------------- | ----------------------------------- | -| `dispute.resolve` | `consensus.update_positions` | `tx_id`, `our_vote`, `yays`, `nays` | -| `tx.included` | `consensus.accept.apply` | `tx_id` | +| Event Name | Parent Span | Attributes (actually set) | +| ----------------- | ---------------------------- | ---------------------------------------------------------------- | +| `dispute.resolve` | `consensus.update_positions` | `xrpl.tx.id`, `dispute_our_vote`, `dispute_yays`, `dispute_nays` | +| `tx.included` | `consensus.accept.apply` | `xrpl.tx.id` | ### New Attributes (Phase 4a) ```cpp // Round-level (on 
consensus.round) — ALL IMPLEMENTED -"xrpl.consensus.round_id" = int64 // Consensus round number -"xrpl.consensus.ledger_id" = string // previousLedger.id() hash -"xrpl.consensus.trace_strategy" = string // "deterministic" or "attribute" +"xrpl.consensus.round_id" = int64 // Consensus round number (kept — rule 5) +"xrpl.consensus.ledger_id" = string // previousLedger.id() hash (kept — rule 5) +"trace_strategy" = string // "deterministic" or "attribute" // Establish-level — IMPLEMENTED -"xrpl.consensus.converge_percent" = int64 // Convergence % (0-100+) -"xrpl.consensus.establish_count" = int64 // Number of establish iterations -"xrpl.consensus.agree_count" = int64 // Peers that agree (haveConsensus) -"xrpl.consensus.disagree_count" = int64 // Peers that disagree -"xrpl.consensus.threshold_percent" = int64 // Current threshold (avCT_CONSENSUS_PCT = 75%) -"xrpl.consensus.result" = string // "yes", "no", "moved_on" -"xrpl.consensus.have_close_time_consensus" = bool // Close time consensus reached -"xrpl.consensus.close_time_threshold" = int64 // Close time voting threshold +"converge_percent" = int64 // Convergence % (0-100+) +"establish_count" = int64 // Number of establish iterations +"agree_count" = int64 // Peers that agree (haveConsensus) +"disagree_count" = int64 // Peers that disagree +"threshold_percent" = int64 // Current threshold (avCT_CONSENSUS_PCT = 75%) +"consensus_result" = string // "yes", "no", "moved_on" +"have_close_time_consensus" = bool // Close time consensus reached +"close_time_threshold" = int64 // Close time voting threshold // Establish-level — IMPLEMENTED -"xrpl.consensus.disputes_count" = int64 // Active disputes (on update_positions) -"xrpl.consensus.avalanche_threshold" = int64 // Escalated weight (on update_positions) +"disputes_count" = int64 // Active disputes (on update_positions) +"avalanche_threshold" = int64 // Escalated weight (on update_positions) // Establish-level — NOT IMPLEMENTED -// "xrpl.consensus.proposers_agreed" = int64 
// Peers agreeing with us — not set -// "xrpl.consensus.proposers_total" = int64 // Total peer positions — not set (not defined) +// "proposers_agreed" = int64 // Peers agreeing with us — not set +// "proposers_total" = int64 // Total peer positions — not set (not defined) // Mode change — ALL IMPLEMENTED -"xrpl.consensus.mode.old" = string // Previous mode -"xrpl.consensus.mode.new" = string // New mode +"mode_old" = string // Previous mode +"mode_new" = string // New mode ``` ### Implementation Notes diff --git a/include/xrpl/telemetry/Telemetry.h b/include/xrpl/telemetry/Telemetry.h index 1b965345a9..46e3895fa1 100644 --- a/include/xrpl/telemetry/Telemetry.h +++ b/include/xrpl/telemetry/Telemetry.h @@ -50,7 +50,7 @@ if (telemetry.isEnabled() && telemetry.shouldTraceRpc()) { SpanGuard guard(telemetry.startSpan("rpc.command.submit")); - guard.setAttribute("xrpl.rpc.command", "submit"); + guard.setAttribute("command", "submit"); // ... guard ends span automatically on scope exit } @endcode