mirror of
https://github.com/XRPLF/rippled.git
synced 2026-06-03 08:46:46 +00:00
Merge branch 'pratik/otel-phase8-log-correlation' into pratik/otel-phase9-metric-gap-fill
This commit is contained in:
@@ -247,14 +247,14 @@ flowchart TB
|
|||||||
subgraph request["rpc.request (root span)"]
|
subgraph request["rpc.request (root span)"]
|
||||||
http["HTTP Request — POST /<br/>traceparent:<br/>00-abc123...-def456...-01"]
|
http["HTTP Request — POST /<br/>traceparent:<br/>00-abc123...-def456...-01"]
|
||||||
|
|
||||||
attrs["Attributes:<br/>http.method = POST<br/>net.peer.ip = 192.168.1.100<br/>xrpl.rpc.command = submit"]
|
attrs["Attributes:<br/>http.method = POST<br/>net.peer.ip = 192.168.1.100<br/>command = submit"]
|
||||||
|
|
||||||
subgraph enqueue["jobqueue.enqueue"]
|
subgraph enqueue["jobqueue.enqueue"]
|
||||||
job_attr["xrpl.job.type = jtCLIENT_RPC"]
|
job_attr["xrpl.job.type = jtCLIENT_RPC"]
|
||||||
end
|
end
|
||||||
|
|
||||||
subgraph command["rpc.command.submit"]
|
subgraph command["rpc.command.submit"]
|
||||||
cmd_attrs["xrpl.rpc.version = 2<br/>xrpl.rpc.role = user"]
|
cmd_attrs["version = 2<br/>rpc_role = user"]
|
||||||
cmd_children["├── tx.deserialize<br/>├── tx.validate_local<br/>└── tx.submit_to_network"]
|
cmd_children["├── tx.deserialize<br/>├── tx.validate_local<br/>└── tx.submit_to_network"]
|
||||||
end
|
end
|
||||||
|
|
||||||
@@ -359,7 +359,7 @@ After implementing OpenTelemetry, operators and developers will gain visibility
|
|||||||
| **Transaction Lifecycle** | Full journey from RPC submission through validation, relay, consensus, and ledger inclusion | `{service.name="xrpld" && xrpl.tx.hash="ABC123..."}` |
|
| **Transaction Lifecycle** | Full journey from RPC submission through validation, relay, consensus, and ledger inclusion | `{service.name="xrpld" && xrpl.tx.hash="ABC123..."}` |
|
||||||
| **Cross-Node Propagation** | Transaction path across multiple xrpld nodes with timing | `{xrpl.tx.relay_count > 0}` |
|
| **Cross-Node Propagation** | Transaction path across multiple xrpld nodes with timing | `{xrpl.tx.relay_count > 0}` |
|
||||||
| **Consensus Rounds** | Complete round with all phases (open, establish, accept) | `{span.name=~"consensus.round.*"}` |
|
| **Consensus Rounds** | Complete round with all phases (open, establish, accept) | `{span.name=~"consensus.round.*"}` |
|
||||||
| **RPC Request Processing** | Individual command execution with timing breakdown | `{xrpl.rpc.command="account_info"}` |
|
| **RPC Request Processing** | Individual command execution with timing breakdown | `{command="account_info"}` |
|
||||||
| **Ledger Acquisition** | Peer-to-peer ledger data requests and responses | `{span.name="ledger.acquire"}` |
|
| **Ledger Acquisition** | Peer-to-peer ledger data requests and responses | `{span.name="ledger.acquire"}` |
|
||||||
| **PathFinding Latency** | Path computation time and cache effectiveness for payment RPCs | `{span.name="pathfind.compute"}` |
|
| **PathFinding Latency** | Path computation time and cache effectiveness for payment RPCs | `{span.name="pathfind.compute"}` |
|
||||||
| **TxQ Behavior** | Queue depth, eviction patterns, fee escalation during congestion | `{span.name=~"txq.*"}` |
|
| **TxQ Behavior** | Queue depth, eviction patterns, fee escalation during congestion | `{span.name=~"txq.*"}` |
|
||||||
@@ -458,7 +458,7 @@ xychart-beta
|
|||||||
|
|
||||||
1. **Find Transaction**: Query by `xrpl.tx.hash` to get full trace
|
1. **Find Transaction**: Query by `xrpl.tx.hash` to get full trace
|
||||||
2. **Identify Bottleneck**: Look at span durations to find slowest component
|
2. **Identify Bottleneck**: Look at span durations to find slowest component
|
||||||
3. **Check Attributes**: Review `xrpl.tx.validity`, `xrpl.rpc.status` for errors
|
3. **Check Attributes**: Review `xrpl.tx.validity`, `rpc_status` for errors
|
||||||
4. **Correlate Logs**: Use `trace_id` to find related PerfLog entries
|
4. **Correlate Logs**: Use `trace_id` to find related PerfLog entries
|
||||||
5. **Compare Nodes**: Filter by `service.instance.id` to compare behavior across nodes
|
5. **Compare Nodes**: Filter by `service.instance.id` to compare behavior across nodes
|
||||||
|
|
||||||
|
|||||||
@@ -260,10 +260,10 @@ resource::SemanticConventions::SERVICE_INSTANCE_ID = <node_public_key_base58>
|
|||||||
#### RPC Attributes
|
#### RPC Attributes
|
||||||
|
|
||||||
```cpp
|
```cpp
|
||||||
"xrpl.rpc.command" = string // Command name
|
"command" = string // Command name
|
||||||
"xrpl.rpc.version" = int64 // API version
|
"version" = int64 // API version
|
||||||
"xrpl.rpc.role" = string // "admin" or "user"
|
"rpc_role" = string // "admin" or "user"
|
||||||
"xrpl.rpc.params" = string // Sanitized parameters (optional)
|
"xrpl.rpc.params" = string // Sanitized parameters (optional, planned)
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Peer & Message Attributes
|
#### Peer & Message Attributes
|
||||||
@@ -293,10 +293,10 @@ resource::SemanticConventions::SERVICE_INSTANCE_ID = <node_public_key_base58>
|
|||||||
#### PathFinding Attributes
|
#### PathFinding Attributes
|
||||||
|
|
||||||
```cpp
|
```cpp
|
||||||
"xrpl.pathfind.source_currency" = string // Source currency code
|
"source_currency" = string // Source currency code (planned, not yet implemented)
|
||||||
"xrpl.pathfind.dest_currency" = string // Destination currency code
|
"dest_currency" = string // Destination currency code (planned, not yet implemented)
|
||||||
"xrpl.pathfind.path_count" = int64 // Number of paths found
|
"path_count" = int64 // Number of paths found (planned, not yet implemented)
|
||||||
"xrpl.pathfind.cache_hit" = bool // RippleLineCache hit
|
"cache_hit" = bool // RippleLineCache hit (planned, not yet implemented)
|
||||||
```
|
```
|
||||||
|
|
||||||
#### TxQ Attributes
|
#### TxQ Attributes
|
||||||
|
|||||||
@@ -490,11 +490,11 @@ void ServerHandler::onRequest(...) {
|
|||||||
// After (only ~4 lines added)
|
// After (only ~4 lines added)
|
||||||
void ServerHandler::onRequest(...) {
|
void ServerHandler::onRequest(...) {
|
||||||
auto span = telemetry::SpanGuard::rpcSpan("rpc.request"); // +1 line
|
auto span = telemetry::SpanGuard::rpcSpan("rpc.request"); // +1 line
|
||||||
span.setAttribute("xrpl.rpc.command", command); // +1 line
|
span.setAttribute("command", command); // +1 line
|
||||||
|
|
||||||
auto result = processRequest(req);
|
auto result = processRequest(req);
|
||||||
|
|
||||||
span.setAttribute("xrpl.rpc.status", status); // +1 line
|
span.setAttribute("rpc_status", status); // +1 line
|
||||||
send(result);
|
send(result);
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -346,11 +346,11 @@ void ServerHandler::onRequest(...)
|
|||||||
// Factory creates a span if RPC tracing is enabled, no-op otherwise.
|
// Factory creates a span if RPC tracing is enabled, no-op otherwise.
|
||||||
// No Telemetry& reference needed -- accessed via global singleton.
|
// No Telemetry& reference needed -- accessed via global singleton.
|
||||||
auto span = telemetry::SpanGuard::rpcSpan("rpc.request");
|
auto span = telemetry::SpanGuard::rpcSpan("rpc.request");
|
||||||
span.setAttribute("xrpl.rpc.command", command);
|
span.setAttribute("command", command);
|
||||||
|
|
||||||
auto result = processRequest(req);
|
auto result = processRequest(req);
|
||||||
|
|
||||||
span.setAttribute("xrpl.rpc.status", result.status());
|
span.setAttribute("rpc_status", result.status());
|
||||||
span.setOk();
|
span.setOk();
|
||||||
// span ended automatically when it goes out of scope
|
// span ended automatically when it goes out of scope
|
||||||
}
|
}
|
||||||
@@ -841,7 +841,7 @@ ServerHandler::onRequest(
|
|||||||
? jv["method"].asString()
|
? jv["method"].asString()
|
||||||
: "unknown";
|
: "unknown";
|
||||||
|
|
||||||
span.setAttribute("xrpl.rpc.command", command);
|
span.setAttribute("command", command);
|
||||||
|
|
||||||
// Create child span for command execution
|
// Create child span for command execution
|
||||||
{
|
{
|
||||||
@@ -854,7 +854,7 @@ ServerHandler::onRequest(
|
|||||||
// Record result attributes
|
// Record result attributes
|
||||||
if (result.isMember("status"))
|
if (result.isMember("status"))
|
||||||
{
|
{
|
||||||
cmdSpan.setAttribute("xrpl.rpc.status",
|
cmdSpan.setAttribute("rpc_status",
|
||||||
result["status"].asString());
|
result["status"].asString());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -490,7 +490,7 @@ processors:
|
|||||||
- name: rpc-spans
|
- name: rpc-spans
|
||||||
type: string_attribute
|
type: string_attribute
|
||||||
string_attribute:
|
string_attribute:
|
||||||
key: xrpl.rpc.command
|
key: command
|
||||||
values: [".*"]
|
values: [".*"]
|
||||||
enabled_regex_matching: true
|
enabled_regex_matching: true
|
||||||
- name: latency
|
- name: latency
|
||||||
@@ -748,7 +748,7 @@ providers:
|
|||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"queryType": "traceql",
|
"queryType": "traceql",
|
||||||
"query": "{resource.service.name=\"xrpld\" && span.xrpl.rpc.command != \"\"} | histogram_over_time(duration) by (span.xrpl.rpc.command)"
|
"query": "{resource.service.name=\"xrpld\" && span.command != \"\"} | histogram_over_time(duration) by (span.command)"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }
|
||||||
@@ -760,7 +760,7 @@ providers:
|
|||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"queryType": "traceql",
|
"queryType": "traceql",
|
||||||
"query": "{resource.service.name=\"xrpld\" && status.code=error} | rate() by (span.xrpl.rpc.command)"
|
"query": "{resource.service.name=\"xrpld\" && status.code=error} | rate() by (span.command)"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }
|
||||||
@@ -772,7 +772,7 @@ providers:
|
|||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"queryType": "traceql",
|
"queryType": "traceql",
|
||||||
"query": "{resource.service.name=\"xrpld\" && span.xrpl.rpc.command != \"\"} | avg(duration) by (span.xrpl.rpc.command) | topk(10)"
|
"query": "{resource.service.name=\"xrpld\" && span.command != \"\"} | avg(duration) by (span.command) | topk(10)"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 8 }
|
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 8 }
|
||||||
|
|||||||
@@ -165,15 +165,15 @@ Every span can carry key-value attributes that provide context for filtering and
|
|||||||
#### RPC Attributes
|
#### RPC Attributes
|
||||||
|
|
||||||
| Attribute | Type | Set On | Description |
|
| Attribute | Type | Set On | Description |
|
||||||
| ------------------------ | ------ | --------------- | ------------------------------------------------ |
|
| --------------- | ------ | --------------- | ------------------------------------------------ |
|
||||||
| `xrpl.rpc.command` | string | `rpc.command.*` | RPC command name (e.g., `server_info`, `ledger`) |
|
| `command` | string | `rpc.command.*` | RPC command name (e.g., `server_info`, `ledger`) |
|
||||||
| `xrpl.rpc.version` | int64 | `rpc.command.*` | API version number |
|
| `version` | int64 | `rpc.command.*` | API version number |
|
||||||
| `xrpl.rpc.role` | string | `rpc.command.*` | Caller role: `"admin"` or `"user"` |
|
| `rpc_role` | string | `rpc.command.*` | Caller role: `"admin"` or `"user"` |
|
||||||
| `xrpl.rpc.status` | string | `rpc.command.*` | Result: `"success"` or `"error"` |
|
| `rpc_status` | string | `rpc.command.*` | Result: `"success"` or `"error"` |
|
||||||
| `xrpl.rpc.duration_ms` | int64 | `rpc.command.*` | Command execution time in milliseconds |
|
| `duration_ms` | int64 | `rpc.command.*` | Command execution time in milliseconds |
|
||||||
| `xrpl.rpc.error_message` | string | `rpc.command.*` | Error details (only set on failure) |
|
| `error_message` | string | `rpc.command.*` | Error details (only set on failure) |
|
||||||
|
|
||||||
**Tempo query**: `{span.xrpl.rpc.command="server_info"}` to find all `server_info` calls.
|
**Tempo query**: `{span.command="server_info"}` to find all `server_info` calls.
|
||||||
|
|
||||||
**Prometheus label**: `xrpl_rpc_command` (dots converted to underscores by SpanMetrics).
|
**Prometheus label**: `command` (SpanMetrics converts dots in attribute names to underscores; `command` contains none, so the label is unchanged).
|
||||||
|
|
||||||
@@ -252,8 +252,8 @@ The OTel Collector's SpanMetrics connector automatically generates RED (Rate, Er
|
|||||||
|
|
||||||
| Span Attribute | Prometheus Label | Applies To |
|
| Span Attribute | Prometheus Label | Applies To |
|
||||||
| ------------------------------ | ------------------------------ | ------------------------- |
|
| ------------------------------ | ------------------------------ | ------------------------- |
|
||||||
| `xrpl.rpc.command` | `xrpl_rpc_command` | `rpc.command.*` |
|
| `command` | `command` | `rpc.command.*` |
|
||||||
| `xrpl.rpc.status` | `xrpl_rpc_status` | `rpc.command.*` |
|
| `rpc_status` | `rpc_status` | `rpc.command.*` |
|
||||||
| `xrpl.consensus.mode` | `xrpl_consensus_mode` | `consensus.ledger_close` |
|
| `xrpl.consensus.mode` | `xrpl_consensus_mode` | `consensus.ledger_close` |
|
||||||
| `xrpl.tx.local` | `xrpl_tx_local` | `tx.process` |
|
| `xrpl.tx.local` | `xrpl_tx_local` | `tx.process` |
|
||||||
| `xrpl.peer.proposal.trusted` | `xrpl_peer_proposal_trusted` | `peer.proposal.receive` |
|
| `xrpl.peer.proposal.trusted` | `xrpl_peer_proposal_trusted` | `peer.proposal.receive` |
|
||||||
@@ -415,7 +415,7 @@ For each of the 45+ overlay traffic categories (defined in `TrafficCount.h`), fo
|
|||||||
| All RPC calls | `{resource.service.name="xrpld" && name="rpc.request"}` |
|
| All RPC calls | `{resource.service.name="xrpld" && name="rpc.request"}` |
|
||||||
| Specific RPC command | `{resource.service.name="xrpld" && name="rpc.command.server_info"}` |
|
| Specific RPC command | `{resource.service.name="xrpld" && name="rpc.command.server_info"}` |
|
||||||
| Slow RPC calls | `{resource.service.name="xrpld" && name=~"rpc.command.*"} \| duration > 100ms` |
|
| Slow RPC calls | `{resource.service.name="xrpld" && name=~"rpc.command.*"} \| duration > 100ms` |
|
||||||
| Failed RPC calls | `{span.xrpl.rpc.status="error"}` |
|
| Failed RPC calls | `{span.rpc_status="error"}` |
|
||||||
| Specific transaction | `{span.xrpl.tx.hash="<hex_hash>"}` |
|
| Specific transaction | `{span.xrpl.tx.hash="<hex_hash>"}` |
|
||||||
| Local transactions only | `{span.xrpl.tx.local=true}` |
|
| Local transactions only | `{span.xrpl.tx.local=true}` |
|
||||||
| Consensus rounds | `{resource.service.name="xrpld" && name="consensus.accept"}` |
|
| Consensus rounds | `{resource.service.name="xrpld" && name="consensus.accept"}` |
|
||||||
|
|||||||
@@ -302,8 +302,8 @@
|
|||||||
// Each factory checks the global Telemetry instance internally.
|
// Each factory checks the global Telemetry instance internally.
|
||||||
// No Telemetry& reference needed at the call site.
|
// No Telemetry& reference needed at the call site.
|
||||||
auto span = telemetry::SpanGuard::rpcSpan("rpc.request");
|
auto span = telemetry::SpanGuard::rpcSpan("rpc.request");
|
||||||
span.setAttribute("xrpl.rpc.command", command);
|
span.setAttribute("command", command);
|
||||||
span.setAttribute("xrpl.rpc.status", status);
|
span.setAttribute("rpc_status", status);
|
||||||
```
|
```
|
||||||
|
|
||||||
- Factory methods: `rpcSpan()`, `txSpan()`, `consensusSpan()`, `peerSpan()`, `ledgerSpan()`, `span()`
|
- Factory methods: `rpcSpan()`, `txSpan()`, `consensusSpan()`, `peerSpan()`, `ledgerSpan()`, `span()`
|
||||||
@@ -336,12 +336,12 @@
|
|||||||
- `#include <xrpl/telemetry/SpanGuard.h>`
|
- `#include <xrpl/telemetry/SpanGuard.h>`
|
||||||
- In `ServerHandler::onRequest(Session& session)`:
|
- In `ServerHandler::onRequest(Session& session)`:
|
||||||
- At the top of the method, add: `auto span = telemetry::SpanGuard::rpcSpan("rpc.request");`
|
- At the top of the method, add: `auto span = telemetry::SpanGuard::rpcSpan("rpc.request");`
|
||||||
- After the RPC command name is extracted, set attribute: `span.setAttribute("xrpl.rpc.command", command);`
|
- After the RPC command name is extracted, set attribute: `span.setAttribute("command", command);`
|
||||||
- After the response status is known, set: `span.setAttribute("http.status_code", static_cast<int64_t>(statusCode));`
|
- After the response status is known, set: `span.setAttribute("http.status_code", static_cast<int64_t>(statusCode));`
|
||||||
- Wrap error paths with: `span.recordException(e);`
|
- Wrap error paths with: `span.recordException(e);`
|
||||||
- In `ServerHandler::processRequest(...)`:
|
- In `ServerHandler::processRequest(...)`:
|
||||||
- Add a child span: `auto span = telemetry::SpanGuard::rpcSpan("rpc.process");`
|
- Add a child span: `auto span = telemetry::SpanGuard::rpcSpan("rpc.process");`
|
||||||
- Set method attribute: `span.setAttribute("xrpl.rpc.method", request_method);`
|
- Set method attribute: `span.setAttribute("method", request_method);`
|
||||||
- In `ServerHandler::onWSMessage(...)` (WebSocket path):
|
- In `ServerHandler::onWSMessage(...)` (WebSocket path):
|
||||||
- Add: `auto span = telemetry::SpanGuard::rpcSpan("rpc.ws.message");`
|
- Add: `auto span = telemetry::SpanGuard::rpcSpan("rpc.ws.message");`
|
||||||
|
|
||||||
@@ -362,7 +362,7 @@
|
|||||||
- [01-architecture-analysis.md §1.5](./01-architecture-analysis.md) — RPC request flow diagram: HTTP request -> attributes -> jobqueue.enqueue -> rpc.command -> response
|
- [01-architecture-analysis.md §1.5](./01-architecture-analysis.md) — RPC request flow diagram: HTTP request -> attributes -> jobqueue.enqueue -> rpc.command -> response
|
||||||
- [01-architecture-analysis.md §1.6](./01-architecture-analysis.md) — Key trace points table: `rpc.request` in `ServerHandler.cpp::onRequest()` (Priority: High)
|
- [01-architecture-analysis.md §1.6](./01-architecture-analysis.md) — Key trace points table: `rpc.request` in `ServerHandler.cpp::onRequest()` (Priority: High)
|
||||||
- [02-design-decisions.md §2.3](./02-design-decisions.md) — Span naming convention: `rpc.request`, `rpc.command.*`
|
- [02-design-decisions.md §2.3](./02-design-decisions.md) — Span naming convention: `rpc.request`, `rpc.command.*`
|
||||||
- [02-design-decisions.md §2.4.2](./02-design-decisions.md) — RPC span attributes: `xrpl.rpc.command`, `xrpl.rpc.version`, `xrpl.rpc.role`, `xrpl.rpc.params`
|
- [02-design-decisions.md §2.4.2](./02-design-decisions.md) — RPC span attributes: `command`, `version`, `rpc_role`, `xrpl.rpc.params`
|
||||||
- [03-implementation-strategy.md §3.9.2](./03-implementation-strategy.md) — File impact: `ServerHandler.cpp` ~40 lines added, ~10 changed (Low risk)
|
- [03-implementation-strategy.md §3.9.2](./03-implementation-strategy.md) — File impact: `ServerHandler.cpp` ~40 lines added, ~10 changed (Low risk)
|
||||||
|
|
||||||
---
|
---
|
||||||
@@ -378,17 +378,17 @@
|
|||||||
- In `doCommand(RPC::JsonContext& context, Json::Value& result)`:
|
- In `doCommand(RPC::JsonContext& context, Json::Value& result)`:
|
||||||
- At the top: `auto span = telemetry::SpanGuard::rpcSpan("rpc.command." + context.method);`
|
- At the top: `auto span = telemetry::SpanGuard::rpcSpan("rpc.command." + context.method);`
|
||||||
- Set attributes:
|
- Set attributes:
|
||||||
- `span.setAttribute("xrpl.rpc.command", context.method);`
|
- `span.setAttribute("command", context.method);`
|
||||||
- `span.setAttribute("xrpl.rpc.version", static_cast<int64_t>(context.apiVersion));`
|
- `span.setAttribute("version", static_cast<int64_t>(context.apiVersion));`
|
||||||
- `span.setAttribute("xrpl.rpc.role", (context.role == Role::ADMIN) ? "admin" : "user");`
|
- `span.setAttribute("rpc_role", (context.role == Role::ADMIN) ? "admin" : "user");`
|
||||||
- On success: `span.setAttribute("xrpl.rpc.status", "success");`
|
- On success: `span.setAttribute("rpc_status", "success");`
|
||||||
- On error: `span.setAttribute("xrpl.rpc.status", "error");` and set the error message
|
- On error: `span.setAttribute("rpc_status", "error");` and set the error message
|
||||||
|
|
||||||
- After this, traces in Tempo/Grafana should look like:
|
- After this, traces in Tempo/Grafana should look like:
|
||||||
```
|
```
|
||||||
rpc.request (xrpl.rpc.command=account_info)
|
rpc.request (command=account_info)
|
||||||
└── rpc.process
|
└── rpc.process
|
||||||
└── rpc.command.account_info (xrpl.rpc.version=2, xrpl.rpc.role=user, xrpl.rpc.status=success)
|
└── rpc.command.account_info (version=2, rpc_role=user, rpc_status=success)
|
||||||
```
|
```
|
||||||
|
|
||||||
**Key modified file**:
|
**Key modified file**:
|
||||||
@@ -399,7 +399,7 @@
|
|||||||
|
|
||||||
- [04-code-samples.md §4.5.3](./04-code-samples.md) — `ServerHandler::onRequest()` code sample (includes child span pattern for `rpc.command.*`)
|
- [04-code-samples.md §4.5.3](./04-code-samples.md) — `ServerHandler::onRequest()` code sample (includes child span pattern for `rpc.command.*`)
|
||||||
- [02-design-decisions.md §2.3](./02-design-decisions.md) — Span naming: `rpc.command.*` pattern with dynamic command name (e.g., `rpc.command.server_info`)
|
- [02-design-decisions.md §2.3](./02-design-decisions.md) — Span naming: `rpc.command.*` pattern with dynamic command name (e.g., `rpc.command.server_info`)
|
||||||
- [02-design-decisions.md §2.4.2](./02-design-decisions.md) — RPC attribute schema: `xrpl.rpc.command`, `xrpl.rpc.version`, `xrpl.rpc.role`, `xrpl.rpc.status`
|
- [02-design-decisions.md §2.4.2](./02-design-decisions.md) — RPC attribute schema: `command`, `version`, `rpc_role`, `rpc_status`
|
||||||
- [01-architecture-analysis.md §1.6](./01-architecture-analysis.md) — Key trace points table: `rpc.command.*` in `RPCHandler.cpp::doCommand()` (Priority: High)
|
- [01-architecture-analysis.md §1.6](./01-architecture-analysis.md) — Key trace points table: `rpc.command.*` in `RPCHandler.cpp::doCommand()` (Priority: High)
|
||||||
- [02-design-decisions.md §2.6.5](./02-design-decisions.md) — Correlation with PerfLog: how `doCommand()` can link trace_id with existing PerfLog entries
|
- [02-design-decisions.md §2.6.5](./02-design-decisions.md) — Correlation with PerfLog: how `doCommand()` can link trace_id with existing PerfLog entries
|
||||||
- [03-implementation-strategy.md §3.4.4](./03-implementation-strategy.md) — RPC request overhead budget: ~1.75 μs total per request
|
- [03-implementation-strategy.md §3.4.4](./03-implementation-strategy.md) — RPC request overhead budget: ~1.75 μs total per request
|
||||||
@@ -472,7 +472,7 @@
|
|||||||
- Navigate to Explore → select Tempo datasource
|
- Navigate to Explore → select Tempo datasource
|
||||||
- Search for service `xrpld`
|
- Search for service `xrpld`
|
||||||
- Confirm you see traces with spans: `rpc.request` -> `rpc.process` -> `rpc.command.server_info`
|
- Confirm you see traces with spans: `rpc.request` -> `rpc.process` -> `rpc.command.server_info`
|
||||||
- Click into a trace and verify attributes: `xrpl.rpc.command`, `xrpl.rpc.status`, `xrpl.rpc.version`
|
- Click into a trace and verify attributes: `command`, `rpc_status`, `version`
|
||||||
|
|
||||||
7. **Verify zero-overhead when disabled**:
|
7. **Verify zero-overhead when disabled**:
|
||||||
- Rebuild with `XRPL_ENABLE_TELEMETRY=OFF`, or set `enabled=0` in config
|
- Rebuild with `XRPL_ENABLE_TELEMETRY=OFF`, or set `enabled=0` in config
|
||||||
@@ -486,7 +486,7 @@
|
|||||||
- [ ] xrpld starts and connects to OTel Collector (check xrpld logs for telemetry messages)
|
- [ ] xrpld starts and connects to OTel Collector (check xrpld logs for telemetry messages)
|
||||||
- [ ] Traces appear in Grafana/Tempo under service "xrpld"
|
- [ ] Traces appear in Grafana/Tempo under service "xrpld"
|
||||||
- [ ] Span hierarchy is correct (parent-child relationships)
|
- [ ] Span hierarchy is correct (parent-child relationships)
|
||||||
- [ ] Span attributes are populated (`xrpl.rpc.command`, `xrpl.rpc.status`, etc.)
|
- [ ] Span attributes are populated (`command`, `rpc_status`, etc.)
|
||||||
- [ ] Error spans show error status and message
|
- [ ] Error spans show error status and message
|
||||||
- [ ] Building with `XRPL_ENABLE_TELEMETRY=OFF` produces no regressions
|
- [ ] Building with `XRPL_ENABLE_TELEMETRY=OFF` produces no regressions
|
||||||
- [ ] Setting `enabled=0` at runtime produces no traces and no errors
|
- [ ] Setting `enabled=0` at runtime produces no traces and no errors
|
||||||
@@ -572,8 +572,8 @@ The current POC exports **traces only**. Grafana's Explore view can query Tempo
|
|||||||
explicit:
|
explicit:
|
||||||
buckets: [1ms, 5ms, 10ms, 25ms, 50ms, 100ms, 250ms, 500ms, 1s, 5s]
|
buckets: [1ms, 5ms, 10ms, 25ms, 50ms, 100ms, 250ms, 500ms, 1s, 5s]
|
||||||
dimensions:
|
dimensions:
|
||||||
- name: xrpl.rpc.command
|
- name: command
|
||||||
- name: xrpl.rpc.status
|
- name: rpc_status
|
||||||
|
|
||||||
exporters:
|
exporters:
|
||||||
prometheus:
|
prometheus:
|
||||||
|
|||||||
@@ -91,7 +91,7 @@
|
|||||||
|
|
||||||
- `http.method` is always POST for JSON-RPC
|
- `http.method` is always POST for JSON-RPC
|
||||||
- `net.peer.ip` is debug-level info available in logs
|
- `net.peer.ip` is debug-level info available in logs
|
||||||
- `xrpl.rpc.duration_ms` is redundant with span duration (OTel captures start/end time natively)
|
- `duration_ms` is redundant with span duration (OTel captures start/end time natively)
|
||||||
|
|
||||||
These can be added later if dashboard queries specifically need them. The node health attributes (Task 2.8) provide far more operational value and were prioritized instead.
|
These can be added later if dashboard queries specifically need them. The node health attributes (Task 2.8) provide far more operational value and were prioritized instead.
|
||||||
|
|
||||||
@@ -130,9 +130,8 @@ These can be added later if dashboard queries specifically need them. The node h
|
|||||||
**What to do**:
|
**What to do**:
|
||||||
|
|
||||||
- Edit `src/xrpld/rpc/detail/RPCHandler.cpp`:
|
- Edit `src/xrpld/rpc/detail/RPCHandler.cpp`:
|
||||||
- In the `rpc.command.*` span creation block (after existing `setAttribute` calls for `xrpl.rpc.command`, `xrpl.rpc.version`, etc.):
|
- In the `rpc.command.*` span creation block (after existing `setAttribute` calls for `command`, `version`, etc.):
|
||||||
- Add `xrpl.node.amendment_blocked` (bool) — from `context.app.getOPs().isAmendmentBlocked()`
|
- Node health attributes (`xrpl.node.amendment_blocked`, `xrpl.node.server_state`) are now resource-level attributes rather than per-span attributes. They are set at Tracer init.
|
||||||
- Add `xrpl.node.server_state` (string) — from `context.app.getOPs().strOperatingMode()`
|
|
||||||
|
|
||||||
**New span attributes**:
|
**New span attributes**:
|
||||||
|
|
||||||
|
|||||||
@@ -89,13 +89,13 @@
|
|||||||
- In `onMessage(TMTransaction)` / `handleTransaction()`:
|
- In `onMessage(TMTransaction)` / `handleTransaction()`:
|
||||||
- Extract parent trace context from incoming `TMTransaction::trace_context` field (if present)
|
- Extract parent trace context from incoming `TMTransaction::trace_context` field (if present)
|
||||||
- Create `tx.receive` span as child of extracted context (or new root if none)
|
- Create `tx.receive` span as child of extracted context (or new root if none)
|
||||||
- Set attributes: `xrpl.tx.hash`, `xrpl.peer.id`, `xrpl.tx.status`
|
- Set attributes: `xrpl.tx.hash`, `xrpl.peer.id`, `tx_status`
|
||||||
- On HashRouter suppression (duplicate): set `xrpl.tx.suppressed=true`, add `tx.duplicate` event
|
- On HashRouter suppression (duplicate): set `suppressed=true`, add `tx.duplicate` event
|
||||||
- Wrap validation call with child span `tx.validate`
|
- Wrap validation call with child span `tx.validate`
|
||||||
- Wrap relay with `tx.relay` span
|
- Wrap relay with `tx.relay` span
|
||||||
- When relaying to peers:
|
- When relaying to peers:
|
||||||
- Inject current trace context into outgoing `TMTransaction::trace_context`
|
- Inject current trace context into outgoing `TMTransaction::trace_context`
|
||||||
- Set `xrpl.tx.relay_count` attribute
|
- Set `relay_count` attribute
|
||||||
|
|
||||||
- Use `SpanGuard::span(TraceCategory::Transactions, "tx", "receive")` factory
|
- Use `SpanGuard::span(TraceCategory::Transactions, "tx", "receive")` factory
|
||||||
(Phase 1c replaced macros with the SpanGuard factory pattern)
|
(Phase 1c replaced macros with the SpanGuard factory pattern)
|
||||||
@@ -121,7 +121,7 @@
|
|||||||
- Edit `src/xrpld/app/misc/NetworkOPs.cpp`:
|
- Edit `src/xrpld/app/misc/NetworkOPs.cpp`:
|
||||||
- In `processTransaction()`:
|
- In `processTransaction()`:
|
||||||
- Create `tx.process` span
|
- Create `tx.process` span
|
||||||
- Set attributes: `xrpl.tx.hash`, `xrpl.tx.type`, `xrpl.tx.local` (whether from RPC or peer)
|
- Set attributes: `xrpl.tx.hash`, `tx_type`, `local` (whether from RPC or peer)
|
||||||
- Record whether sync or async path is taken
|
- Record whether sync or async path is taken
|
||||||
|
|
||||||
- In `doTransactionAsync()`:
|
- In `doTransactionAsync()`:
|
||||||
@@ -152,8 +152,8 @@
|
|||||||
|
|
||||||
- Edit `src/xrpld/overlay/detail/PeerImp.cpp` (in handleTransaction):
|
- Edit `src/xrpld/overlay/detail/PeerImp.cpp` (in handleTransaction):
|
||||||
- After calling `HashRouter::shouldProcess()` or `addSuppressionPeer()`:
|
- After calling `HashRouter::shouldProcess()` or `addSuppressionPeer()`:
|
||||||
- Record `xrpl.tx.suppressed` attribute (true/false)
|
- Record `suppressed` attribute (true/false)
|
||||||
- Record `xrpl.tx.flags` showing current HashRouter state (SAVED, TRUSTED, etc.)
|
- Record `tx_flags` showing current HashRouter state (SAVED, TRUSTED, etc.)
|
||||||
- Add `tx.first_seen` or `tx.duplicate` event
|
- Add `tx.first_seen` or `tx.duplicate` event
|
||||||
|
|
||||||
- This is NOT a modification to HashRouter itself — just recording its decisions as span attributes in the existing PeerImp instrumentation from Task 3.3.
|
- This is NOT a modification to HashRouter itself — just recording its decisions as span attributes in the existing PeerImp instrumentation from Task 3.3.
|
||||||
@@ -257,14 +257,14 @@
|
|||||||
|
|
||||||
- Edit `src/xrpld/overlay/detail/PeerImp.cpp`:
|
- Edit `src/xrpld/overlay/detail/PeerImp.cpp`:
|
||||||
- In the `tx.receive` span block (after existing `xrpl.peer.id` setAttribute call):
|
- In the `tx.receive` span block (after existing `xrpl.peer.id` setAttribute call):
|
||||||
- Add `xrpl.peer.version` (string) — from `this->getVersion()`
|
- Add `peer_version` (string) — from `this->getVersion()`
|
||||||
- Only set if `getVersion()` returns a non-empty string (avoid empty-string attributes)
|
- Only set if `getVersion()` returns a non-empty string (avoid empty-string attributes)
|
||||||
|
|
||||||
**New span attribute**:
|
**New span attribute**:
|
||||||
|
|
||||||
| Attribute | Type | Source | Example |
|
| Attribute | Type | Source | Example |
|
||||||
| ------------------- | ------ | -------------------- | --------------- |
|
| -------------- | ------ | -------------------- | --------------- |
|
||||||
| `xrpl.peer.version` | string | `peer->getVersion()` | `"xrpld-2.4.0"` |
|
| `peer_version` | string | `peer->getVersion()` | `"xrpld-2.4.0"` |
|
||||||
|
|
||||||
**Rationale**: Transaction relay is where version mismatches cause subtle serialization or validation bugs. Tracing "this tx came from a v2.3.0 peer" helps diagnose compatibility issues. The community dashboard tracks peer versions externally; this brings version awareness into the trace itself.
|
**Rationale**: Transaction relay is where version mismatches cause subtle serialization or validation bugs. Tracing "this tx came from a v2.3.0 peer" helps diagnose compatibility issues. The community dashboard tracks peer versions externally; this brings version awareness into the trace itself.
|
||||||
|
|
||||||
@@ -274,7 +274,7 @@
|
|||||||
|
|
||||||
**Exit Criteria**:
|
**Exit Criteria**:
|
||||||
|
|
||||||
- [ ] `tx.receive` spans carry `xrpl.peer.version` attribute with a non-empty version string
|
- [ ] `tx.receive` spans carry `peer_version` attribute with a non-empty version string
|
||||||
- [ ] Attribute is omitted (not set to empty string) when `getVersion()` returns empty
|
- [ ] Attribute is omitted (not set to empty string) when `getVersion()` returns empty
|
||||||
- [ ] Attribute visible in Jaeger span detail view
|
- [ ] Attribute visible in Jaeger span detail view
|
||||||
|
|
||||||
@@ -387,8 +387,8 @@ This gives the best of both worlds: guaranteed cross-node correlation via determ
|
|||||||
- No protobuf context to extract here (NetworkOPs is intra-node), so
|
- No protobuf context to extract here (NetworkOPs is intra-node), so
|
||||||
deterministic context alone is sufficient.
|
deterministic context alone is sufficient.
|
||||||
|
|
||||||
- Add `tx_trace_strategy` attribute to spans:
|
- Add `trace_strategy` attribute to spans:
|
||||||
- Add `inline constexpr auto traceStrategy = join(xrplTx, makeStr("trace_strategy"));`
|
- Add `inline constexpr auto traceStrategy = "trace_strategy";`
|
||||||
to `TxSpanNames.h`.
|
to `TxSpanNames.h`.
|
||||||
- Set on each tx span: `span.setAttribute(tx_span::attr::traceStrategy, "deterministic")`.
|
- Set on each tx span: `span.setAttribute(tx_span::attr::traceStrategy, "deterministic")`.
|
||||||
|
|
||||||
@@ -419,7 +419,7 @@ This gives the best of both worlds: guaranteed cross-node correlation via determ
|
|||||||
- [ ] All nodes handling the same transaction produce spans under the same trace_id
|
- [ ] All nodes handling the same transaction produce spans under the same trace_id
|
||||||
- [x] Protobuf `span_id` propagation still works when available (parent-child ordering)
|
- [x] Protobuf `span_id` propagation still works when available (parent-child ordering)
|
||||||
- [ ] Missing protobuf context (old peer) degrades gracefully to sibling spans, not lost traces
|
- [ ] Missing protobuf context (old peer) degrades gracefully to sibling spans, not lost traces
|
||||||
- [ ] `xrpl.tx.trace_strategy` attribute set to `"deterministic"` on all tx spans
|
- [ ] `trace_strategy` attribute set to `"deterministic"` on all tx spans
|
||||||
- [ ] Trace queryable by tx hash (truncate hash → trace_id → direct lookup in Tempo)
|
- [ ] Trace queryable by tx hash (truncate hash → trace_id → direct lookup in Tempo)
|
||||||
|
|
||||||
**Deliverables implemented (not in original plan)**:
|
**Deliverables implemented (not in original plan)**:
|
||||||
|
|||||||
@@ -27,8 +27,8 @@
|
|||||||
|
|
||||||
- `RCLConsensus::Adaptor::startRoundTracing()` creates `consensus.round` span
|
- `RCLConsensus::Adaptor::startRoundTracing()` creates `consensus.round` span
|
||||||
via `SpanGuard::hashSpan()` (deterministic) or `SpanGuard::span()` (attribute strategy)
|
via `SpanGuard::hashSpan()` (deterministic) or `SpanGuard::span()` (attribute strategy)
|
||||||
- Attributes set: `xrpl.consensus.ledger_id`, `xrpl.consensus.ledger.seq`,
|
- Attributes set: `xrpl.consensus.ledger_id`, `xrpl.ledger.seq`,
|
||||||
`xrpl.consensus.mode`, `xrpl.consensus.trace_strategy`, `xrpl.consensus.round_id`
|
`xrpl.consensus.mode`, `trace_strategy`, `xrpl.consensus.round_id`
|
||||||
- Round span stored as `roundSpan_` member in `RCLConsensus::Adaptor`
|
- Round span stored as `roundSpan_` member in `RCLConsensus::Adaptor`
|
||||||
- `roundSpanContext_` snapshot captured for cross-thread span linking
|
- `roundSpanContext_` snapshot captured for cross-thread span linking
|
||||||
|
|
||||||
@@ -57,9 +57,9 @@
|
|||||||
|
|
||||||
**Design notes**:
|
**Design notes**:
|
||||||
|
|
||||||
- `xrpl.consensus.phase` attribute — phases are distinguished by span names instead
|
- `phase` attribute — not set (phases are distinguished by span names instead)
|
||||||
- `phase.enter` / `phase.exit` events — not added (span start/end serves this purpose)
|
- `phase.enter` / `phase.exit` events — not added (span start/end serves this purpose)
|
||||||
- `xrpl.consensus.phase_duration_ms` attribute — not set (span duration captures this)
|
- `phase_duration_ms` attribute — not set (span duration captures this)
|
||||||
|
|
||||||
**Key modified files**:
|
**Key modified files**:
|
||||||
|
|
||||||
@@ -82,11 +82,11 @@
|
|||||||
|
|
||||||
- In `Adaptor::propose()`:
|
- In `Adaptor::propose()`:
|
||||||
- Creates `consensus.proposal.send` span via `SpanGuard::span()`
|
- Creates `consensus.proposal.send` span via `SpanGuard::span()`
|
||||||
- Sets `xrpl.consensus.round` attribute
|
- Sets `xrpl.consensus.round` attribute (kept — rule 5)
|
||||||
|
|
||||||
- In `PeerImp::onMessage(TMProposeSet)`:
|
- In `PeerImp::onMessage(TMProposeSet)`:
|
||||||
- Creates `consensus.proposal.receive` span
|
- Creates `consensus.proposal.receive` span
|
||||||
- Sets `xrpl.consensus.proposal.trusted` attribute (bool)
|
- Sets `trusted` attribute (bool)
|
||||||
|
|
||||||
**Not implemented** (deferred to Phase 4b — cross-node propagation):
|
**Not implemented** (deferred to Phase 4b — cross-node propagation):
|
||||||
|
|
||||||
@@ -117,12 +117,12 @@
|
|||||||
- Uses `SpanGuard::linkedSpan()` to create a follows-from link to the round span
|
- Uses `SpanGuard::linkedSpan()` to create a follows-from link to the round span
|
||||||
- Thread-safe: uses `roundSpanContext_` snapshot (captured on consensus thread,
|
- Thread-safe: uses `roundSpanContext_` snapshot (captured on consensus thread,
|
||||||
read on jtACCEPT thread)
|
read on jtACCEPT thread)
|
||||||
- Sets `xrpl.consensus.ledger.seq` and `xrpl.consensus.proposing` attributes
|
- Sets `xrpl.ledger.seq` and `proposing` attributes
|
||||||
|
|
||||||
- In `PeerImp::onMessage(TMValidation)`:
|
- In `PeerImp::onMessage(TMValidation)`:
|
||||||
- Creates `consensus.validation.receive` span
|
- Creates `consensus.validation.receive` span
|
||||||
- Sets `xrpl.consensus.validation.trusted` attribute (bool)
|
- Sets `trusted` attribute (bool)
|
||||||
- Sets `xrpl.consensus.validation.ledger_seq` attribute
|
- Sets `xrpl.ledger.seq` attribute
|
||||||
|
|
||||||
**Not implemented** (deferred to Phase 4b — cross-node propagation):
|
**Not implemented** (deferred to Phase 4b — cross-node propagation):
|
||||||
|
|
||||||
@@ -142,18 +142,18 @@
|
|||||||
|
|
||||||
**Implemented attributes** (across various spans):
|
**Implemented attributes** (across various spans):
|
||||||
|
|
||||||
- `xrpl.consensus.ledger.seq` — on `consensus.round`, `consensus.accept.apply`
|
- `xrpl.ledger.seq` — on `consensus.round`, `consensus.accept.apply`
|
||||||
- `xrpl.consensus.round` — on `consensus.proposal.send`
|
- `xrpl.consensus.round` — on `consensus.proposal.send`
|
||||||
- `xrpl.consensus.mode` — on `consensus.round`, `consensus.ledger_close`
|
- `xrpl.consensus.mode` — on `consensus.round`, `consensus.ledger_close`
|
||||||
- `xrpl.consensus.proposers` — on `consensus.accept`, `consensus.establish`, `consensus.update_positions`
|
- `proposers` — on `consensus.accept`, `consensus.establish`, `consensus.update_positions`
|
||||||
- `xrpl.consensus.converge_percent` — on `consensus.establish`, `consensus.update_positions`, `consensus.check`
|
- `converge_percent` — on `consensus.establish`, `consensus.update_positions`, `consensus.check`
|
||||||
- `xrpl.consensus.tx_count` — on `consensus.accept.apply` span (in `doAccept()`)
|
- `tx_count` — on `consensus.accept.apply` span (in `doAccept()`)
|
||||||
- `xrpl.consensus.disputes_count` — on `consensus.update_positions` span (in `updateOurPositions()`)
|
- `disputes_count` — on `consensus.update_positions` span (in `updateOurPositions()`)
|
||||||
|
|
||||||
**Design notes**:
|
**Design notes**:
|
||||||
|
|
||||||
- `xrpl.consensus.phase` — phases distinguished by span names instead
|
- `phase` — not set; phases are distinguished by span names instead
|
||||||
- `xrpl.consensus.phase_duration_ms` — span duration captures this
|
- `phase_duration_ms` — not set; span duration captures this
|
||||||
|
|
||||||
**Key modified files**:
|
**Key modified files**:
|
||||||
|
|
||||||
@@ -221,8 +221,8 @@
|
|||||||
- Add `xrpl.validation.ledger_hash` (string) — the ledger hash being validated
|
- Add `xrpl.validation.ledger_hash` (string) — the ledger hash being validated
|
||||||
- Add `xrpl.validation.full` (bool) — whether this is a full validation (not partial)
|
- Add `xrpl.validation.full` (bool) — whether this is a full validation (not partial)
|
||||||
- On the `consensus.accept` span (in `onAccept()`):
|
- On the `consensus.accept` span (in `onAccept()`):
|
||||||
- Add `xrpl.consensus.validation_quorum` (int64) — from `app_.validators().quorum()`
|
- Add `validation_quorum` (int64) — from `app_.validators().quorum()`
|
||||||
- Add `xrpl.consensus.proposers_validated` (int64) — from `result.proposers`
|
- Add `proposers_validated` (int64) — from `result.proposers`
|
||||||
|
|
||||||
- Edit `src/xrpld/overlay/detail/PeerImp.cpp`:
|
- Edit `src/xrpld/overlay/detail/PeerImp.cpp`:
|
||||||
- On the `peer.validation.receive` span:
|
- On the `peer.validation.receive` span:
|
||||||
@@ -232,13 +232,13 @@
|
|||||||
**New span attributes**:
|
**New span attributes**:
|
||||||
|
|
||||||
| Span | Attribute | Type | Source |
|
| Span | Attribute | Type | Source |
|
||||||
| --------------------------- | ------------------------------------ | ------ | --------------------------------- |
|
| --------------------------- | ---------------------------------- | ------ | --------------------------------- |
|
||||||
| `consensus.validation.send` | `xrpl.validation.ledger_hash` | string | Ledger hash from validate() args |
|
| `consensus.validation.send` | `xrpl.validation.ledger_hash` | string | Ledger hash from validate() args |
|
||||||
| `consensus.validation.send` | `xrpl.validation.full` | bool | Full vs partial validation |
|
| `consensus.validation.send` | `xrpl.validation.full` | bool | Full vs partial validation |
|
||||||
| `peer.validation.receive` | `xrpl.peer.validation.ledger_hash` | string | From STValidation deserialization |
|
| `peer.validation.receive` | `xrpl.peer.validation.ledger_hash` | string | From STValidation deserialization |
|
||||||
| `peer.validation.receive` | `xrpl.peer.validation.full` | bool | From STValidation flags |
|
| `peer.validation.receive` | `xrpl.peer.validation.full` | bool | From STValidation flags |
|
||||||
| `consensus.accept` | `xrpl.consensus.validation_quorum` | int64 | `app_.validators().quorum()` |
|
| `consensus.accept` | `validation_quorum` | int64 | `app_.validators().quorum()` |
|
||||||
| `consensus.accept` | `xrpl.consensus.proposers_validated` | int64 | `result.proposers` |
|
| `consensus.accept` | `proposers_validated` | int64 | `result.proposers` |
|
||||||
|
|
||||||
**Rationale**: The external dashboard's most valuable feature is validation agreement tracking. By recording the ledger hash on both outgoing and incoming validation spans, we create the raw data for agreement analysis at the trace level. Example Tempo query:
|
**Rationale**: The external dashboard's most valuable feature is validation agreement tracking. By recording the ledger hash on both outgoing and incoming validation spans, we create the raw data for agreement analysis at the trace level. Example Tempo query:
|
||||||
|
|
||||||
@@ -257,7 +257,7 @@ Phase 7's `ValidationTracker` builds metric-level aggregation (1h/24h agreement
|
|||||||
|
|
||||||
- [ ] `consensus.validation.send` spans carry `xrpl.validation.ledger_hash` and `xrpl.validation.full`
|
- [ ] `consensus.validation.send` spans carry `xrpl.validation.ledger_hash` and `xrpl.validation.full`
|
||||||
- [ ] `peer.validation.receive` spans carry `xrpl.peer.validation.ledger_hash` and `xrpl.peer.validation.full`
|
- [ ] `peer.validation.receive` spans carry `xrpl.peer.validation.ledger_hash` and `xrpl.peer.validation.full`
|
||||||
- [ ] `consensus.accept` spans carry `xrpl.consensus.validation_quorum` and `xrpl.consensus.proposers_validated`
|
- [ ] `consensus.accept` spans carry `validation_quorum` and `proposers_validated`
|
||||||
- [ ] Ledger hash attributes match between send and receive for the same ledger
|
- [ ] Ledger hash attributes match between send and receive for the same ledger
|
||||||
- [ ] No impact on consensus performance
|
- [ ] No impact on consensus performance
|
||||||
|
|
||||||
@@ -283,26 +283,26 @@ Phase 7's `ValidationTracker` builds metric-level aggregation (1h/24h agreement
|
|||||||
| Span Name | Method | Key Attributes |
|
| Span Name | Method | Key Attributes |
|
||||||
| --------------------------- | ---------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| --------------------------- | ---------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `consensus.proposal.send` | `Adaptor::propose` | `xrpl.consensus.round` |
|
| `consensus.proposal.send` | `Adaptor::propose` | `xrpl.consensus.round` |
|
||||||
| `consensus.ledger_close` | `Adaptor::onClose` | `xrpl.consensus.ledger.seq`, `xrpl.consensus.mode` |
|
| `consensus.ledger_close` | `Adaptor::onClose` | `xrpl.ledger.seq`, `xrpl.consensus.mode` |
|
||||||
| `consensus.accept` | `Adaptor::onAccept` | `xrpl.consensus.proposers`, `xrpl.consensus.round_time_ms` |
|
| `consensus.accept` | `Adaptor::onAccept` | `proposers`, `round_time_ms` |
|
||||||
| `consensus.accept.apply` | `Adaptor::doAccept` | `xrpl.consensus.close_time`, `close_time_correct`, `close_resolution_ms`, `state`, `proposing`, `round_time_ms`, `ledger.seq`, `parent_close_time`, `close_time_self`, `close_time_vote_bins`, `resolution_direction` |
|
| `consensus.accept.apply` | `Adaptor::doAccept` | `close_time`, `close_time_correct`, `close_resolution_ms`, `consensus_state`, `proposing`, `round_time_ms`, `xrpl.ledger.seq`, `parent_close_time`, `close_time_self`, `close_time_vote_bins`, `resolution_direction` |
|
||||||
| `consensus.validation.send` | `Adaptor::onAccept` (via validate) | `xrpl.consensus.proposing` |
|
| `consensus.validation.send` | `Adaptor::onAccept` (via validate) | `proposing` |
|
||||||
|
|
||||||
#### Close Time Attributes (consensus.accept.apply)
|
#### Close Time Attributes (consensus.accept.apply)
|
||||||
|
|
||||||
The `consensus.accept.apply` span captures ledger close time agreement details
|
The `consensus.accept.apply` span captures ledger close time agreement details
|
||||||
driven by `avCT_CONSENSUS_PCT` (75% validator agreement threshold):
|
driven by `avCT_CONSENSUS_PCT` (75% validator agreement threshold):
|
||||||
|
|
||||||
- **`xrpl.consensus.close_time`** — Agreed-upon ledger close time (epoch seconds). When validators disagree (`consensusCloseTime == epoch`), this is synthetically set to `prevCloseTime + 1s`.
|
- **`close_time`** — Agreed-upon ledger close time (epoch seconds). When validators disagree (`consensusCloseTime == epoch`), this is synthetically set to `prevCloseTime + 1s`.
|
||||||
- **`xrpl.consensus.close_time_correct`** — `true` if validators reached agreement, `false` if they "agreed to disagree" (close time forced to prev+1s).
|
- **`close_time_correct`** — `true` if validators reached agreement, `false` if they "agreed to disagree" (close time forced to prev+1s).
|
||||||
- **`xrpl.consensus.close_resolution_ms`** — Rounding granularity for close time (starts at 30s, decreases as ledger interval stabilizes).
|
- **`close_resolution_ms`** — Rounding granularity for close time (starts at 30s, decreases as ledger interval stabilizes).
|
||||||
- **`xrpl.consensus.state`** — `"finished"` (normal) or `"moved_on"` (consensus failed, adopted best available).
|
- **`consensus_state`** — `"finished"` (normal) or `"moved_on"` (consensus failed, adopted best available).
|
||||||
- **`xrpl.consensus.proposing`** — Whether this node was proposing.
|
- **`proposing`** — Whether this node was proposing.
|
||||||
- **`xrpl.consensus.round_time_ms`** — Total consensus round duration.
|
- **`round_time_ms`** — Total consensus round duration.
|
||||||
- **`xrpl.consensus.parent_close_time`** — Previous ledger's close time (epoch seconds). Enables computing close-time deltas across consecutive rounds without correlating separate spans.
|
- **`parent_close_time`** — Previous ledger's close time (epoch seconds). Enables computing close-time deltas across consecutive rounds without correlating separate spans.
|
||||||
- **`xrpl.consensus.close_time_self`** — This node's own proposed close time before consensus voting.
|
- **`close_time_self`** — This node's own proposed close time before consensus voting.
|
||||||
- **`xrpl.consensus.close_time_vote_bins`** — Number of distinct close-time vote bins from peer proposals. Higher values indicate less agreement among validators.
|
- **`close_time_vote_bins`** — Number of distinct close-time vote bins from peer proposals. Higher values indicate less agreement among validators.
|
||||||
- **`xrpl.consensus.resolution_direction`** — Whether close-time resolution `"increased"` (coarser), `"decreased"` (finer), or stayed `"unchanged"` relative to the previous ledger.
|
- **`resolution_direction`** — Whether close-time resolution `"increased"` (coarser), `"decreased"` (finer), or stayed `"unchanged"` relative to the previous ledger.
|
||||||
|
|
||||||
**Exit Criteria** (from [06-implementation-phases.md §6.11.4](./06-implementation-phases.md)):
|
**Exit Criteria** (from [06-implementation-phases.md §6.11.4](./06-implementation-phases.md)):
|
||||||
|
|
||||||
@@ -504,7 +504,7 @@ spans in `Consensus.h`.
|
|||||||
- Reads `consensus_trace_strategy` via `app_.getTelemetry().getConsensusTraceStrategy()`
|
- Reads `consensus_trace_strategy` via `app_.getTelemetry().getConsensusTraceStrategy()`
|
||||||
- **Deterministic**: uses `SpanGuard::hashSpan()` with `prevLgr.id()` data
|
- **Deterministic**: uses `SpanGuard::hashSpan()` with `prevLgr.id()` data
|
||||||
- **Attribute**: uses `SpanGuard::span(TraceCategory::Consensus, seg::consensus, "round")`
|
- **Attribute**: uses `SpanGuard::span(TraceCategory::Consensus, seg::consensus, "round")`
|
||||||
- Sets attributes: `ledger_id`, `ledger.seq`, `mode`, `trace_strategy`, `round_id`
|
- Sets attributes: `xrpl.consensus.ledger_id`, `xrpl.ledger.seq`, `xrpl.consensus.mode`, `trace_strategy`, `xrpl.consensus.round_id`
|
||||||
- Captures `roundSpanContext_` snapshot for cross-thread span linking
|
- Captures `roundSpanContext_` snapshot for cross-thread span linking
|
||||||
- Saves `prevRoundContext_` from previous round for follows-from links
|
- Saves `prevRoundContext_` from previous round for follows-from links
|
||||||
|
|
||||||
@@ -585,9 +585,9 @@ with attributes for convergence progress.
|
|||||||
`SpanGuard::span()` returns a no-op guard when telemetry is disabled.
|
`SpanGuard::span()` returns a no-op guard when telemetry is disabled.
|
||||||
|
|
||||||
- `updateEstablishTracing()` — sets attributes on each `phaseEstablish()` call:
|
- `updateEstablishTracing()` — sets attributes on each `phaseEstablish()` call:
|
||||||
- `xrpl.consensus.converge_percent` — `convergePercent_`
|
- `converge_percent` — `convergePercent_`
|
||||||
- `xrpl.consensus.establish_count` — `establishCounter_`
|
- `establish_count` — `establishCounter_`
|
||||||
- `xrpl.consensus.proposers` — `currPeerPositions_.size()`
|
- `proposers` — `currPeerPositions_.size()`
|
||||||
|
|
||||||
- `endEstablishTracing()` — calls `establishSpan_.reset()` on phase exit.
|
- `endEstablishTracing()` — calls `establishSpan_.reset()` on phase exit.
|
||||||
|
|
||||||
@@ -614,11 +614,11 @@ details.
|
|||||||
```
|
```
|
||||||
|
|
||||||
- Attributes set:
|
- Attributes set:
|
||||||
- `xrpl.consensus.converge_percent` — current convergence
|
- `converge_percent` — current convergence
|
||||||
- `xrpl.consensus.proposers` — `currPeerPositions_.size()`
|
- `proposers` — `currPeerPositions_.size()`
|
||||||
- `xrpl.consensus.have_close_time_consensus` — close time consensus state
|
- `have_close_time_consensus` — close time consensus state
|
||||||
- `xrpl.consensus.close_time_threshold` — `avCT_CONSENSUS_PCT`
|
- `close_time_threshold` — `avCT_CONSENSUS_PCT`
|
||||||
- `xrpl.consensus.disputes_count` — number of active disputes
|
- `disputes_count` — number of active disputes
|
||||||
|
|
||||||
- Dispute events recorded via direct `span.addEvent()` call with yays/nays:
|
- Dispute events recorded via direct `span.addEvent()` call with yays/nays:
|
||||||
```cpp
|
```cpp
|
||||||
@@ -632,7 +632,7 @@ details.
|
|||||||
|
|
||||||
**Not implemented**:
|
**Not implemented**:
|
||||||
|
|
||||||
- `xrpl.consensus.proposers_agreed` / `xrpl.consensus.proposers_total` attributes — not set
|
- `proposers_agreed` / `proposers_total` attributes — not set
|
||||||
|
|
||||||
**Key modified files**:
|
**Key modified files**:
|
||||||
|
|
||||||
@@ -658,13 +658,13 @@ including the avalanche threshold.
|
|||||||
```
|
```
|
||||||
|
|
||||||
- Attributes set:
|
- Attributes set:
|
||||||
- `xrpl.consensus.agree_count` — peers that agree with our position
|
- `agree_count` — peers that agree with our position
|
||||||
- `xrpl.consensus.disagree_count` — peers that disagree
|
- `disagree_count` — peers that disagree
|
||||||
- `xrpl.consensus.converge_percent` — convergence percentage
|
- `converge_percent` — convergence percentage
|
||||||
- `xrpl.consensus.have_close_time_consensus` — close time consensus state
|
- `have_close_time_consensus` — close time consensus state
|
||||||
- `xrpl.consensus.threshold_percent` — set to `avCT_CONSENSUS_PCT` (75%)
|
- `threshold_percent` — set to `avCT_CONSENSUS_PCT` (75%)
|
||||||
- `xrpl.consensus.result` — "yes", "no", or "moved_on"
|
- `consensus_result` — "yes", "no", or "moved_on"
|
||||||
- `xrpl.consensus.avalanche_threshold` — the escalated weight from `getNeededWeight()` on the `consensus.update_positions` span
|
- `avalanche_threshold` — the escalated weight from `getNeededWeight()` on the `consensus.update_positions` span
|
||||||
|
|
||||||
**Key modified files**:
|
**Key modified files**:
|
||||||
|
|
||||||
@@ -687,8 +687,8 @@ wrongLedger, switchedLedger).
|
|||||||
```cpp
|
```cpp
|
||||||
auto span = telemetry::SpanGuard::span(
|
auto span = telemetry::SpanGuard::span(
|
||||||
telemetry::TraceCategory::Consensus, telemetry::seg::consensus, "mode_change");
|
telemetry::TraceCategory::Consensus, telemetry::seg::consensus, "mode_change");
|
||||||
span.setAttribute(cons_span::attr::modeOld, to_string(before).c_str());
|
span.setAttribute(cons_span::attr::modeOld, to_string(before).c_str()); // "mode_old"
|
||||||
span.setAttribute(cons_span::attr::modeNew, to_string(after).c_str());
|
span.setAttribute(cons_span::attr::modeNew, to_string(after).c_str()); // "mode_new"
|
||||||
```
|
```
|
||||||
|
|
||||||
- `MonitoredMode::set()` in `Consensus.h` calls `adaptor_.onModeChange(before, after)`.
|
- `MonitoredMode::set()` in `Consensus.h` calls `adaptor_.onModeChange(before, after)`.
|
||||||
@@ -773,48 +773,48 @@ and OFF, and don't affect consensus timing.
|
|||||||
|
|
||||||
| Span Name | Location | Key Attributes (actually set) |
|
| Span Name | Location | Key Attributes (actually set) |
|
||||||
| ---------------------------- | ------------------ | ----------------------------------------------------------------------------------------------------------------------------- |
|
| ---------------------------- | ------------------ | ----------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `consensus.round` | `RCLConsensus.cpp` | `round_id`, `ledger_id`, `ledger.seq`, `mode`, `trace_strategy` |
|
| `consensus.round` | `RCLConsensus.cpp` | `xrpl.consensus.round_id`, `xrpl.consensus.ledger_id`, `xrpl.ledger.seq`, `xrpl.consensus.mode`, `trace_strategy` |
|
||||||
| `consensus.establish` | `Consensus.h` | `converge_percent`, `establish_count`, `proposers` |
|
| `consensus.establish` | `Consensus.h` | `converge_percent`, `establish_count`, `proposers` |
|
||||||
| `consensus.update_positions` | `Consensus.h` | `converge_percent`, `proposers`, `have_close_time_consensus`, `close_time_threshold`, `disputes_count`, `avalanche_threshold` |
|
| `consensus.update_positions` | `Consensus.h` | `converge_percent`, `proposers`, `have_close_time_consensus`, `close_time_threshold`, `disputes_count`, `avalanche_threshold` |
|
||||||
| `consensus.check` | `Consensus.h` | `agree_count`, `disagree_count`, `converge_percent`, `have_close_time_consensus`, `threshold_percent`, `result` |
|
| `consensus.check` | `Consensus.h` | `agree_count`, `disagree_count`, `converge_percent`, `have_close_time_consensus`, `threshold_percent`, `consensus_result` |
|
||||||
| `consensus.mode_change` | `RCLConsensus.cpp` | `mode.old`, `mode.new` |
|
| `consensus.mode_change` | `RCLConsensus.cpp` | `mode_old`, `mode_new` |
|
||||||
|
|
||||||
### New Events (Phase 4a)
|
### New Events (Phase 4a)
|
||||||
|
|
||||||
| Event Name | Parent Span | Attributes (actually set) |
|
| Event Name | Parent Span | Attributes (actually set) |
|
||||||
| ----------------- | ---------------------------- | ----------------------------------- |
|
| ----------------- | ---------------------------- | ---------------------------------------------------------------- |
|
||||||
| `dispute.resolve` | `consensus.update_positions` | `tx_id`, `our_vote`, `yays`, `nays` |
|
| `dispute.resolve` | `consensus.update_positions` | `xrpl.tx.id`, `dispute_our_vote`, `dispute_yays`, `dispute_nays` |
|
||||||
| `tx.included` | `consensus.accept.apply` | `tx_id` |
|
| `tx.included` | `consensus.accept.apply` | `xrpl.tx.id` |
|
||||||
|
|
||||||
### New Attributes (Phase 4a)
|
### New Attributes (Phase 4a)
|
||||||
|
|
||||||
```cpp
|
```cpp
|
||||||
// Round-level (on consensus.round) — ALL IMPLEMENTED
|
// Round-level (on consensus.round) — ALL IMPLEMENTED
|
||||||
"xrpl.consensus.round_id" = int64 // Consensus round number
|
"xrpl.consensus.round_id" = int64 // Consensus round number (kept — rule 5)
|
||||||
"xrpl.consensus.ledger_id" = string // previousLedger.id() hash
|
"xrpl.consensus.ledger_id" = string // previousLedger.id() hash (kept — rule 5)
|
||||||
"xrpl.consensus.trace_strategy" = string // "deterministic" or "attribute"
|
"trace_strategy" = string // "deterministic" or "attribute"
|
||||||
|
|
||||||
// Establish-level — IMPLEMENTED
|
// Establish-level — IMPLEMENTED
|
||||||
"xrpl.consensus.converge_percent" = int64 // Convergence % (0-100+)
|
"converge_percent" = int64 // Convergence % (0-100+)
|
||||||
"xrpl.consensus.establish_count" = int64 // Number of establish iterations
|
"establish_count" = int64 // Number of establish iterations
|
||||||
"xrpl.consensus.agree_count" = int64 // Peers that agree (haveConsensus)
|
"agree_count" = int64 // Peers that agree (haveConsensus)
|
||||||
"xrpl.consensus.disagree_count" = int64 // Peers that disagree
|
"disagree_count" = int64 // Peers that disagree
|
||||||
"xrpl.consensus.threshold_percent" = int64 // Current threshold (avCT_CONSENSUS_PCT = 75%)
|
"threshold_percent" = int64 // Current threshold (avCT_CONSENSUS_PCT = 75%)
|
||||||
"xrpl.consensus.result" = string // "yes", "no", "moved_on"
|
"consensus_result" = string // "yes", "no", "moved_on"
|
||||||
"xrpl.consensus.have_close_time_consensus" = bool // Close time consensus reached
|
"have_close_time_consensus" = bool // Close time consensus reached
|
||||||
"xrpl.consensus.close_time_threshold" = int64 // Close time voting threshold
|
"close_time_threshold" = int64 // Close time voting threshold
|
||||||
|
|
||||||
// Establish-level (on consensus.update_positions) — IMPLEMENTED
|
// Establish-level (on consensus.update_positions) — IMPLEMENTED
|
||||||
"xrpl.consensus.disputes_count" = int64 // Active disputes (on update_positions)
|
"disputes_count" = int64 // Active disputes (on update_positions)
|
||||||
"xrpl.consensus.avalanche_threshold" = int64 // Escalated weight (on update_positions)
|
"avalanche_threshold" = int64 // Escalated weight (on update_positions)
|
||||||
|
|
||||||
// Establish-level — NOT IMPLEMENTED
|
// Establish-level — NOT IMPLEMENTED
|
||||||
// "xrpl.consensus.proposers_agreed" = int64 // Peers agreeing with us — not set
|
// "proposers_agreed" = int64 // Peers agreeing with us — not set
|
||||||
// "xrpl.consensus.proposers_total" = int64 // Total peer positions — not set (not defined)
|
// "proposers_total" = int64 // Total peer positions — not set (not defined)
|
||||||
|
|
||||||
// Mode change — ALL IMPLEMENTED
|
// Mode change — ALL IMPLEMENTED
|
||||||
"xrpl.consensus.mode.old" = string // Previous mode
|
"mode_old" = string // Previous mode
|
||||||
"xrpl.consensus.mode.new" = string // New mode
|
"mode_new" = string // New mode
|
||||||
```
|
```
|
||||||
|
|
||||||
### Implementation Notes
|
### Implementation Notes
|
||||||
|
|||||||
@@ -61,7 +61,7 @@ Tempo/Prometheus.
|
|||||||
- `rpc.command.server_info` spans (callMethod)
|
- `rpc.command.server_info` spans (callMethod)
|
||||||
- `rpc.command.server_state` spans (callMethod)
|
- `rpc.command.server_state` spans (callMethod)
|
||||||
- `rpc.command.ledger` spans (callMethod)
|
- `rpc.command.ledger` spans (callMethod)
|
||||||
- Verify `xrpl.rpc.command` attribute present on `rpc.command.*` spans
|
- Verify `command` attribute present on `rpc.command.*` spans
|
||||||
|
|
||||||
**Verification**:
|
**Verification**:
|
||||||
|
|
||||||
|
|||||||
@@ -31,10 +31,10 @@
|
|||||||
explicit:
|
explicit:
|
||||||
buckets: [1ms, 5ms, 10ms, 25ms, 50ms, 100ms, 250ms, 500ms, 1s, 5s]
|
buckets: [1ms, 5ms, 10ms, 25ms, 50ms, 100ms, 250ms, 500ms, 1s, 5s]
|
||||||
dimensions:
|
dimensions:
|
||||||
- name: xrpl.rpc.command
|
- name: command
|
||||||
- name: xrpl.rpc.status
|
- name: rpc_status
|
||||||
- name: xrpl.consensus.phase
|
- name: consensus_phase
|
||||||
- name: xrpl.tx.type
|
- name: tx_type
|
||||||
```
|
```
|
||||||
- Add `prometheus` exporter:
|
- Add `prometheus` exporter:
|
||||||
```yaml
|
```yaml
|
||||||
|
|||||||
@@ -50,7 +50,7 @@
|
|||||||
if (telemetry.isEnabled() && telemetry.shouldTraceRpc())
|
if (telemetry.isEnabled() && telemetry.shouldTraceRpc())
|
||||||
{
|
{
|
||||||
SpanGuard guard(telemetry.startSpan("rpc.command.submit"));
|
SpanGuard guard(telemetry.startSpan("rpc.command.submit"));
|
||||||
guard.setAttribute("xrpl.rpc.command", "submit");
|
guard.setAttribute("command", "submit");
|
||||||
// ... guard ends span automatically on scope exit
|
// ... guard ends span automatically on scope exit
|
||||||
}
|
}
|
||||||
@endcode
|
@endcode
|
||||||
|
|||||||
Reference in New Issue
Block a user