From 34ee231d626fb4f0013fa8f9934f4e530d1b918c Mon Sep 17 00:00:00 2001 From: Pratik Mankawde <3397372+pratikmankawde@users.noreply.github.com> Date: Fri, 24 Apr 2026 21:35:50 +0100 Subject: [PATCH] feat(telemetry): add Phase 4 consensus tracing with SpanGuard API Instrument the consensus subsystem with OpenTelemetry spans covering the full round lifecycle: round start, establish phase, proposal send, ledger close, position updates, consensus check, accept, validation send, and mode changes. Key design choices adapted from the original Phase 4 implementation to the new SpanGuard factory pattern introduced in Phase 3: - Add SpanGuard::hashSpan() for category-gated hash-derived trace IDs (consensus round spans share trace_id across validators via ledger hash) - Add SpanGuard::addEvent() overload with key-value attribute pairs (used for dispute.resolve events during position updates) - Add ConsensusSpanNames.h with compile-time span name constants following the colocated *SpanNames.h pattern from Phase 3 - Add consensusTraceStrategy config option ("deterministic"/"attribute") for cross-node trace correlation strategy selection - Use SpanGuard::linkedSpan() for follows-from relationships between consecutive rounds and cross-thread validation spans - Use SpanGuard::captureContext() for thread-safe context propagation from consensus thread to jtACCEPT worker thread Spans produced: consensus.round, consensus.proposal.send, consensus.ledger_close, consensus.establish, consensus.update_positions, consensus.check, consensus.accept, consensus.accept.apply, consensus.validation.send, consensus.mode_change Co-Authored-By: Claude Opus 4.6 (1M context) --- .../scripts/levelization/results/ordering.txt | 5 + OpenTelemetryPlan/02-design-decisions.md | 16 + OpenTelemetryPlan/06-implementation-phases.md | 74 ++ OpenTelemetryPlan/Phase4_taskList.md | 707 +++++++++++++++++- cspell.config.yaml | 1 + .../provisioning/datasources/tempo.yaml | 32 + include/xrpl/telemetry/SpanGuard.h | 19 + 
include/xrpl/telemetry/Telemetry.h | 11 + src/libxrpl/telemetry/NullTelemetry.cpp | 6 + src/libxrpl/telemetry/SpanGuard.cpp | 48 ++ src/libxrpl/telemetry/Telemetry.cpp | 12 + src/libxrpl/telemetry/TelemetryConfig.cpp | 3 + .../libxrpl/telemetry/SpanGuardFactory.cpp | 24 + src/xrpld/app/consensus/ConsensusSpanNames.h | 156 ++++ src/xrpld/app/consensus/RCLConsensus.cpp | 136 ++++ src/xrpld/app/consensus/RCLConsensus.h | 48 ++ src/xrpld/consensus/Consensus.h | 76 ++ src/xrpld/consensus/DisputedTx.h | 14 + 18 files changed, 1372 insertions(+), 16 deletions(-) create mode 100644 src/xrpld/app/consensus/ConsensusSpanNames.h diff --git a/.github/scripts/levelization/results/ordering.txt b/.github/scripts/levelization/results/ordering.txt index 3d540797d2..62b51b4a4f 100644 --- a/.github/scripts/levelization/results/ordering.txt +++ b/.github/scripts/levelization/results/ordering.txt @@ -101,6 +101,7 @@ test.core > xrpl.server test.csf > xrpl.basics test.csf > xrpld.consensus test.csf > xrpl.json +test.csf > xrpl.telemetry test.csf > xrpl.ledger test.csf > xrpl.protocol test.json > test.jtx @@ -194,6 +195,8 @@ tests.libxrpl > xrpl.json tests.libxrpl > xrpl.net tests.libxrpl > xrpl.protocol tests.libxrpl > xrpl.protocol_autogen +tests.libxrpl > xrpl.telemetry +tests.libxrpl > xrpld.telemetry xrpl.conditions > xrpl.basics xrpl.conditions > xrpl.protocol xrpl.core > xrpl.basics @@ -253,6 +256,8 @@ xrpld.consensus > xrpl.basics xrpld.consensus > xrpl.json xrpld.consensus > xrpl.ledger xrpld.consensus > xrpl.protocol +xrpld.consensus > xrpl.telemetry +xrpld.consensus > xrpld.telemetry xrpld.core > xrpl.basics xrpld.core > xrpl.core xrpld.core > xrpl.net diff --git a/OpenTelemetryPlan/02-design-decisions.md b/OpenTelemetryPlan/02-design-decisions.md index c0c5d2f5d7..9b0ef51db6 100644 --- a/OpenTelemetryPlan/02-design-decisions.md +++ b/OpenTelemetryPlan/02-design-decisions.md @@ -239,6 +239,22 @@ resource::SemanticConventions::SERVICE_INSTANCE_ID = "xrpl.consensus.ledger.seq" = 
int64 // Ledger sequence "xrpl.consensus.tx_count" = int64 // Transactions in consensus set "xrpl.consensus.duration_ms" = float64 // Round duration + +// Phase 4a: Establish-phase gap fill & cross-node correlation +"xrpl.consensus.round_id" = int64 // Consensus round number +"xrpl.consensus.ledger_id" = string // previousLedger.id() — shared across nodes +"xrpl.consensus.trace_strategy" = string // "deterministic" or "attribute" +"xrpl.consensus.converge_percent" = int64 // Convergence % (0-100+) +"xrpl.consensus.establish_count" = int64 // Number of establish iterations +"xrpl.consensus.disputes_count" = int64 // Active disputed transactions +"xrpl.consensus.proposers_agreed" = int64 // Peers agreeing with our position +"xrpl.consensus.proposers_total" = int64 // Total peer positions +"xrpl.consensus.agree_count" = int64 // Peers that agree (haveConsensus) +"xrpl.consensus.disagree_count" = int64 // Peers that disagree +"xrpl.consensus.threshold_percent" = int64 // Current threshold (50/65/70/95) +"xrpl.consensus.result" = string // "yes", "no", "moved_on" +"xrpl.consensus.mode.old" = string // Previous consensus mode +"xrpl.consensus.mode.new" = string // New consensus mode ``` #### RPC Attributes diff --git a/OpenTelemetryPlan/06-implementation-phases.md b/OpenTelemetryPlan/06-implementation-phases.md index c5c693d7a0..83a64a3cd1 100644 --- a/OpenTelemetryPlan/06-implementation-phases.md +++ b/OpenTelemetryPlan/06-implementation-phases.md @@ -176,11 +176,22 @@ and [Phase3_taskList.md Task 3.9](./Phase3_taskList.md) for the full implementat | 4.10 | Multi-validator integration tests | | 4.11 | Performance validation | +### Spans Produced + +| Span Name | Location | Attributes | +| --------------------------- | ---------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| 
`consensus.proposal.send` | `RCLConsensus.cpp:177` | `xrpl.consensus.round` | +| `consensus.ledger_close` | `RCLConsensus.cpp:282` | `xrpl.consensus.ledger.seq`, `xrpl.consensus.mode` | +| `consensus.accept` | `RCLConsensus.cpp:395` | `xrpl.consensus.proposers`, `xrpl.consensus.round_time_ms` | +| `consensus.accept.apply` | `RCLConsensus.cpp:521` | `xrpl.consensus.close_time`, `close_time_correct`, `close_resolution_ms`, `state`, `proposing`, `round_time_ms`, `ledger.seq`, `parent_close_time`, `close_time_self`, `close_time_vote_bins`, `resolution_direction` | +| `consensus.validation.send` | `RCLConsensus.cpp:753` | `xrpl.consensus.proposing` | + ### Exit Criteria - [x] Complete consensus round traces - [x] Phase transitions visible - [x] Proposals and validations traced +- [x] Close time agreement tracked (per `avCT_CONSENSUS_PCT`) - [x] No impact on consensus timing - [ ] Multi-validator test network validated @@ -208,6 +219,69 @@ See [Phase4_taskList.md](./Phase4_taskList.md) for the full spec and implementat --- +## 6.5a Phase 4a: Establish-Phase Gap Fill & Cross-Node Correlation + +**Objective**: Fill tracing gaps in the establish phase and establish cross-node +correlation using deterministic trace IDs derived from `previousLedger.id()`. + +**Approach**: Direct instrumentation in `Consensus.h`. Long-lived spans use +direct SpanGuard members; short-lived scoped spans use `XRPL_TRACE_*` macros. 
+ +### Tasks + +| Task | Description | Effort | Risk | +| ---- | ------------------------------------------------ | ------ | ------ | +| 4a.0 | Prerequisites: extend SpanGuard & Telemetry APIs | 1d | Medium | +| 4a.1 | Adaptor `getTelemetry()` method | 0.5d | Low | +| 4a.2 | Switchable round span with deterministic traceID | 2d | High | +| 4a.3 | Span members in `Consensus.h` | 0.5d | Medium | +| 4a.4 | Instrument `phaseEstablish()` | 1d | Medium | +| 4a.5 | Instrument `updateOurPositions()` | 1d | Medium | +| 4a.6 | Instrument `haveConsensus()` (thresholds) | 1d | Medium | +| 4a.7 | Instrument mode changes | 0.5d | Low | +| 4a.8 | Reparent existing spans under round | 0.5d | Low | +| 4a.9 | Build verification and testing | 1d | Low | + +**Total Effort**: 9 days + +### Spans Produced + +| Span Name | Location | Key Attributes | +| ---------------------------- | ------------------ | ---------------------------------------------------------------- | +| `consensus.round` | `RCLConsensus.cpp` | `round_id`, `ledger_id`, `ledger.seq`, `mode`; link → prev round | +| `consensus.establish` | `Consensus.h` | `converge_percent`, `establish_count`, `proposers` | +| `consensus.update_positions` | `Consensus.h` | `disputes_count`, `converge_percent`, `proposers_agreed/total` | +| `consensus.check` | `Consensus.h` | `agree/disagree_count`, `threshold_percent`, `result` | +| `consensus.mode_change` | `RCLConsensus.cpp` | `mode.old`, `mode.new` | + +### Exit Criteria + +- [ ] Establish phase internals fully traced (disputes, convergence, thresholds) +- [ ] Cross-node correlation works via deterministic trace_id +- [ ] Strategy switchable via config (`deterministic` / `attribute`) +- [ ] Consecutive rounds linked via follows-from spans +- [ ] Build passes with telemetry ON and OFF +- [ ] No impact on consensus timing + +See [Phase4_taskList.md](./Phase4_taskList.md) for full task details. 
+ +--- + +## 6.5b Phase 4b: Cross-Node Propagation (Future) + +**Objective**: Wire `TraceContextPropagator` for P2P messages (proposals, +validations) to enable true distributed tracing between nodes. + +**Status**: Design documented, NOT implemented. Protobuf fields (field 1001) +and `TraceContextPropagator` class exist. Wiring deferred until Phase 4a is +validated in a multi-node environment. + +**Prerequisites**: Phase 4a complete and validated. + +See [Phase4_taskList.md § Phase 4b](./Phase4_taskList.md) for full design. + +--- + ## 6.6 Phase 5: Documentation & Deployment (Week 9) **Objective**: Production readiness diff --git a/OpenTelemetryPlan/Phase4_taskList.md b/OpenTelemetryPlan/Phase4_taskList.md index 7a44d23e0c..3817183a22 100644 --- a/OpenTelemetryPlan/Phase4_taskList.md +++ b/OpenTelemetryPlan/Phase4_taskList.md @@ -25,7 +25,7 @@ - Edit `src/xrpld/app/consensus/RCLConsensus.cpp`: - In `RCLConsensus::startRound()` (or the Adaptor's startRound): - - Create `consensus.round` span using `SpanGuard::span(TraceCategory::Consensus, ...)` + - Create `consensus.round` span using `XRPL_TRACE_CONSENSUS` macro - Set attributes: - `xrpl.consensus.ledger.prev` — previous ledger hash - `xrpl.consensus.ledger.seq` — target ledger sequence @@ -67,7 +67,7 @@ - Create `consensus.ledger_close` span - Set attributes: close_time, mode, transaction count in initial position - - Note: The Consensus template class in `include/xrpl/consensus/Consensus.h` drives phase transitions — check if instrumentation goes there or in the Adaptor + - Note: The Consensus template class in `src/xrpld/consensus/Consensus.h` drives phase transitions — Phase 4a instruments directly in the template **Key modified files**: @@ -199,23 +199,698 @@ --- +## Task 4.8: Consensus Validation Span Enrichment — External Dashboard Parity + +> **Source**: [External Dashboard Parity](../docs/superpowers/specs/2026-03-30-external-dashboard-parity-design.md) — adds validation agreement context inspired by the 
community [xrpl-validator-dashboard](https://github.com/realgrapedrop/xrpl-validator-dashboard). +> +> **Upstream**: Phase 4 tasks 4.1-4.4 (span creation must exist). +> **Downstream**: Phase 7 (ValidationTracker reads these attributes), Phase 10 (validation checks). + +**Objective**: Add ledger hash, validation type, and quorum data to consensus validation spans on both send and receive paths. This enables trace-level validation agreement analysis — filter by ledger hash to see which validators agreed for a given ledger. + +**What to do**: + +- Edit `src/xrpld/app/consensus/RCLConsensus.cpp`: + - On the `consensus.validation.send` span (in `validate()` / `doAccept()`): + - Add `xrpl.validation.ledger_hash` (string) — the ledger hash being validated + - Add `xrpl.validation.full` (bool) — whether this is a full validation (not partial) + - On the `consensus.accept` span (in `onAccept()`): + - Add `xrpl.consensus.validation_quorum` (int64) — from `app_.validators().quorum()` + - Add `xrpl.consensus.proposers_validated` (int64) — from `result.proposers` + +- Edit `src/xrpld/overlay/detail/PeerImp.cpp`: + - On the `peer.validation.receive` span: + - Add `xrpl.peer.validation.ledger_hash` (string) — from deserialized `STValidation` object + - Add `xrpl.peer.validation.full` (bool) — from `STValidation` flags + +**New span attributes**: + +| Span | Attribute | Type | Source | +| --------------------------- | ------------------------------------ | ------ | --------------------------------- | +| `consensus.validation.send` | `xrpl.validation.ledger_hash` | string | Ledger hash from validate() args | +| `consensus.validation.send` | `xrpl.validation.full` | bool | Full vs partial validation | +| `peer.validation.receive` | `xrpl.peer.validation.ledger_hash` | string | From STValidation deserialization | +| `peer.validation.receive` | `xrpl.peer.validation.full` | bool | From STValidation flags | +| `consensus.accept` | `xrpl.consensus.validation_quorum` | int64 | 
`app_.validators().quorum()` | +| `consensus.accept` | `xrpl.consensus.proposers_validated` | int64 | `result.proposers` | + +**Rationale**: The external dashboard's most valuable feature is validation agreement tracking. By recording the ledger hash on both outgoing and incoming validation spans, we create the raw data for agreement analysis at the trace level. Example Tempo query: + +``` +{name="consensus.validation.send"} | xrpl.validation.ledger_hash = "A1B2C3..." +``` + +Phase 7's `ValidationTracker` builds metric-level aggregation (1h/24h agreement %) on top of this data. + +**Key modified files**: + +- `src/xrpld/app/consensus/RCLConsensus.cpp` +- `src/xrpld/overlay/detail/PeerImp.cpp` + +**Exit Criteria**: + +- [ ] `consensus.validation.send` spans carry `xrpl.validation.ledger_hash` and `xrpl.validation.full` +- [ ] `peer.validation.receive` spans carry `xrpl.peer.validation.ledger_hash` and `xrpl.peer.validation.full` +- [ ] `consensus.accept` spans carry `xrpl.consensus.validation_quorum` and `xrpl.consensus.proposers_validated` +- [ ] Ledger hash attributes match between send and receive for the same ledger +- [ ] No impact on consensus performance + +--- + ## Summary -| Task | Description | New Files | Modified Files | Depends On | -| ---- | ------------------------------------- | --------- | -------------- | ------------- | -| 4.1 | Consensus round start instrumentation | 0 | 2 | Phase 3 | -| 4.2 | Phase transition instrumentation | 0 | 1-2 | 4.1 | -| 4.3 | Proposal handling instrumentation | 0 | 1 | 4.1 | -| 4.4 | Validation handling instrumentation | 0 | 1-2 | 4.1 | -| 4.5 | Consensus-specific attributes | 0 | 1 | 4.2, 4.3, 4.4 | -| 4.6 | Transaction-consensus correlation | 0 | 2 | 4.2, Phase 3 | -| 4.7 | Build verification and testing | 0 | 0 | 4.1-4.6 | +| Task | Description | New Files | Modified Files | Depends On | +| ---- | ------------------------------------------- | --------- | -------------- | ------------- | +| 4.1 | Consensus round start 
instrumentation | 0 | 2 | Phase 3 | +| 4.2 | Phase transition instrumentation | 0 | 1-2 | 4.1 | +| 4.3 | Proposal handling instrumentation | 0 | 1 | 4.1 | +| 4.4 | Validation handling instrumentation | 0 | 1-2 | 4.1 | +| 4.5 | Consensus-specific attributes | 0 | 1 | 4.2, 4.3, 4.4 | +| 4.6 | Transaction-consensus correlation | 0 | 2 | 4.2, Phase 3 | +| 4.7 | Build verification and testing | 0 | 0 | 4.1-4.6 | +| 4.8 | Validation span enrichment (ext. dashboard) | 0 | 2 | 4.4 | -**Parallel work**: Tasks 4.2, 4.3, and 4.4 can run in parallel after 4.1 is complete. Task 4.5 depends on all three. Task 4.6 depends on 4.2 and Phase 3. +**Parallel work**: Tasks 4.2, 4.3, and 4.4 can run in parallel after 4.1 is complete. Task 4.5 depends on all three. Task 4.6 depends on 4.2 and Phase 3. Task 4.8 depends on 4.4 (validation spans must exist). + +### Implemented Spans + +| Span Name | Method | Key Attributes | +| --------------------------- | ---------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `consensus.proposal.send` | `Adaptor::propose` | `xrpl.consensus.round` | +| `consensus.ledger_close` | `Adaptor::onClose` | `xrpl.consensus.ledger.seq`, `xrpl.consensus.mode` | +| `consensus.accept` | `Adaptor::onAccept` | `xrpl.consensus.proposers`, `xrpl.consensus.round_time_ms` | +| `consensus.accept.apply` | `Adaptor::doAccept` | `xrpl.consensus.close_time`, `close_time_correct`, `close_resolution_ms`, `state`, `proposing`, `round_time_ms`, `ledger.seq`, `parent_close_time`, `close_time_self`, `close_time_vote_bins`, `resolution_direction` | +| `consensus.validation.send` | `Adaptor::onAccept` (via validate) | `xrpl.consensus.proposing` | + +#### Close Time Attributes (consensus.accept.apply) + +The `consensus.accept.apply` span captures ledger close time agreement details +driven 
by `avCT_CONSENSUS_PCT` (75% validator agreement threshold): + +- **`xrpl.consensus.close_time`** — Agreed-upon ledger close time (epoch seconds). When validators disagree (`consensusCloseTime == epoch`), this is synthetically set to `prevCloseTime + 1s`. +- **`xrpl.consensus.close_time_correct`** — `true` if validators reached agreement, `false` if they "agreed to disagree" (close time forced to prev+1s). +- **`xrpl.consensus.close_resolution_ms`** — Rounding granularity for close time (starts at 30s, decreases as ledger interval stabilizes). +- **`xrpl.consensus.state`** — `"finished"` (normal) or `"moved_on"` (consensus failed, adopted best available). +- **`xrpl.consensus.proposing`** — Whether this node was proposing. +- **`xrpl.consensus.round_time_ms`** — Total consensus round duration. +- **`xrpl.consensus.parent_close_time`** — Previous ledger's close time (epoch seconds). Enables computing close-time deltas across consecutive rounds without correlating separate spans. +- **`xrpl.consensus.close_time_self`** — This node's own proposed close time before consensus voting. +- **`xrpl.consensus.close_time_vote_bins`** — Number of distinct close-time vote bins from peer proposals. Higher values indicate less agreement among validators. +- **`xrpl.consensus.resolution_direction`** — Whether close-time resolution `"increased"` (coarser), `"decreased"` (finer), or stayed `"unchanged"` relative to the previous ledger. 
**Exit Criteria** (from [06-implementation-phases.md §6.11.4](./06-implementation-phases.md)): -- [ ] Complete consensus round traces -- [ ] Phase transitions visible -- [ ] Proposals and validations traced -- [ ] No impact on consensus timing +- [x] Complete consensus round traces +- [x] Phase transitions visible +- [x] Proposals and validations traced +- [x] Close time agreement tracked (per `avCT_CONSENSUS_PCT`) +- [x] No impact on consensus timing + +--- + +# Phase 4a: Establish-Phase Gap Fill & Cross-Node Correlation + +> **Goal**: Fill tracing gaps in the consensus establish phase (disputes, convergence, +> threshold escalation, mode changes) and establish cross-node correlation using a +> deterministic shared trace ID derived from `previousLedger.id()`. +> +> **Approach**: Direct instrumentation in `Consensus.h` — the generic consensus +> template has full access to internal state (`convergePercent_`, `result_->disputes`, +> `mode_`, threshold logic). Telemetry access comes via a single new adaptor +> method `getTelemetry()`. Long-lived spans (round, establish) are stored as +> class members using `SpanGuard` directly — NOT the `XRPL_TRACE_*` convenience +> macros (which create local variables named `_xrpl_guard_`). Short-lived +> scoped spans (update_positions, check) can use the macros. All code compiles +> to no-ops when `XRPL_ENABLE_TELEMETRY` is not defined. +> +> **Branch**: `pratik/otel-phase4-consensus-tracing` + +## Design: Switchable Correlation Strategy + +Two strategies for cross-node trace correlation, switchable via config: + +### Strategy A — Deterministic Trace ID (Default) + +Derive `trace_id = SHA256(previousLedger.id())[0:16]` so all nodes in the same +consensus round share the same trace_id without P2P context propagation. + +- **Pros**: All nodes appear in the same trace in Tempo/Jaeger automatically. + No collector-side post-processing needed. 
+- **Cons**: Overrides OTel's random trace_id generation; requires custom + `IdGenerator` or manual span context construction. + +### Strategy B — Attribute-Based Correlation + +Use normal random trace_id but attach `xrpl.consensus.ledger_id` as an attribute +on every consensus span. Correlation happens at query time via Tempo/Grafana +`by attribute` queries. + +- **Pros**: Standard OTel trace_id semantics; no SDK customization. +- **Cons**: Cross-node correlation requires query-time joins, not automatic. + +### Config + +```ini +[telemetry] +# "deterministic" (default) or "attribute" +consensus_trace_strategy=deterministic +``` + +### Implementation + +In `RCLConsensus::Adaptor::startRound()`: + +- If `deterministic`: + 1. Compute `trace_id_bytes = SHA256(prevLedgerID)[0:16]` + 2. Construct `opentelemetry::trace::TraceId(trace_id_bytes)` + 3. Create a synthetic `SpanContext` with this trace_id and a random span_id: + ```cpp + auto traceId = opentelemetry::trace::TraceId(trace_id_bytes); + auto spanId = opentelemetry::trace::SpanId(random_8_bytes); + auto syntheticCtx = opentelemetry::trace::SpanContext( + traceId, spanId, opentelemetry::trace::TraceFlags(1), false); + ``` + 4. Wrap `syntheticCtx` in an `opentelemetry::trace::DefaultSpan` (`syntheticSpan`), then build the parent + `opentelemetry::context::Context` (`parentContext`) via `opentelemetry::trace::SetSpan(context, syntheticSpan)` + 5. Call `startSpan("consensus.round", parentContext)` so the new span + inherits the deterministic trace_id. +- If `attribute`: start a normal `consensus.round` span, set + `xrpl.consensus.ledger_id = previousLedger.id()` as attribute. + +Both strategies always set `xrpl.consensus.round_id` (round number) and +`xrpl.consensus.ledger_id` (previous ledger hash) as attributes.
+ +--- + +## Design: Span Hierarchy + +``` +consensus.round (root — created in RCLConsensus::startRound, closed at accept) +│ link → previous round's SpanContext (follows-from) +│ +├── consensus.establish (phaseEstablish → acceptance, in Consensus.h) +│ ├── consensus.update_positions (each updateOurPositions call) +│ │ └── consensus.dispute.resolve (per-tx dispute resolution event) +│ ├── consensus.check (each haveConsensus call) +│ └── consensus.mode_change (short-lived span in adaptor on mode transition) +│ +├── consensus.accept (existing onAccept span — reparented under round) +│ +└── consensus.validation.send (existing — reparented, follows-from link to round) +``` + +### Span Links (follows-from relationships) + +| Link Source | Link Target | Rationale | +| ----------------------------------------- | -------------------------- | ------------------------------------------------------------------------------ | +| `consensus.round` (N+1) | `consensus.round` (N) | Causal chain: round N+1 exists because round N accepted | +| `consensus.validation.send` | `consensus.round` | Validation follows from the round that produced it; may outlive the round span | +| _(Phase 4b)_ Received proposal processing | Sender's `consensus.round` | Cross-node causal link via P2P context propagation | + +--- + +## Task 4a.0: Prerequisites — Extend SpanGuard and Telemetry APIs + +**Objective**: Add missing API surface needed by later tasks. + +**What to do**: + +1. **Add `SpanGuard::addEvent()` with attributes** (needed by Task 4a.5): + The current `addEvent(string_view name)` only accepts a name. Add an + overload that accepts key-value attributes: + + ```cpp + void addEvent(std::string_view name, + std::initializer_list< + std::pair<opentelemetry::nostd::string_view, opentelemetry::common::AttributeValue>> attributes) + { + span_->AddEvent(std::string(name), attributes); + } + ``` + +2. **Add a `Telemetry::startSpan()` overload that accepts span links** (needed by Tasks 4a.2, 4a.8): + The current `startSpan()` has no span link support.
Add an overload that + accepts a vector of `SpanContext` links for follows-from relationships: + + ```cpp + virtual opentelemetry::nostd::shared_ptr<opentelemetry::trace::Span> + startSpan( + std::string_view name, + opentelemetry::context::Context const& parentContext, + std::vector<opentelemetry::trace::SpanContext> const& links, + opentelemetry::trace::SpanKind kind = opentelemetry::trace::SpanKind::kInternal) = 0; + ``` + +3. **Add `XRPL_TRACE_ADD_EVENT` macro** (needed by Task 4a.5): + Add to `TracingInstrumentation.h` to expose `addEvent(name, attrs)` through + the macro interface (consistent with `XRPL_TRACE_SET_ATTR` pattern): + ```cpp + #ifdef XRPL_ENABLE_TELEMETRY + #define XRPL_TRACE_ADD_EVENT(name, ...) \ + if (_xrpl_guard_.has_value()) \ + { \ + _xrpl_guard_->addEvent(name, __VA_ARGS__); \ + } + #else + #define XRPL_TRACE_ADD_EVENT(name, ...) ((void)0) + #endif + ``` + +**Key modified files**: + +- `include/xrpl/telemetry/SpanGuard.h` — add `addEvent()` overload +- `include/xrpl/telemetry/Telemetry.h` — add `startSpan()` with links +- `src/libxrpl/telemetry/Telemetry.cpp` — implement new overload +- `src/libxrpl/telemetry/NullTelemetry.cpp` — no-op implementation +- `src/xrpld/telemetry/TracingInstrumentation.h` — add `XRPL_TRACE_ADD_EVENT` macro + +--- + +## Task 4a.1: Adaptor `getTelemetry()` Method + +**Objective**: Give `Consensus.h` access to the telemetry subsystem without +coupling the generic template to OTel headers. + +**What to do**: + +- Add `getTelemetry()` method to the Adaptor concept (returns + `xrpl::telemetry::Telemetry&`). The return type is already forward-declared + behind `#ifdef XRPL_ENABLE_TELEMETRY`. +- Implement in `RCLConsensus::Adaptor` — delegates to `app_.getTelemetry()`. +- In `Consensus.h`, the `XRPL_TRACE_*` macros call + `adaptor_.getTelemetry()` — when telemetry is disabled, the macros expand to + `((void)0)` and the method is never called.
+ +**Key modified files**: + +- `src/xrpld/app/consensus/RCLConsensus.h` — declare `getTelemetry()` +- `src/xrpld/app/consensus/RCLConsensus.cpp` — implement `getTelemetry()` + +--- + +## Task 4a.2: Switchable Round Span with Deterministic Trace ID + +**Objective**: Create a `consensus.round` root span in `startRound()` that uses +the switchable correlation strategy. Store span context as a member for child +spans in `Consensus.h`. + +**What to do**: + +- In `RCLConsensus::Adaptor::startRound()` (or a new helper): + - Read `consensus_trace_strategy` from config. + - **Deterministic**: compute `trace_id = SHA256(prevLedgerID)[0:16]`. + Construct a `SpanContext` with this trace_id, then start + `consensus.round` span as child of that context. + - **Attribute**: start normal `consensus.round` span. + - Set attributes on both: `xrpl.consensus.round_id`, + `xrpl.consensus.ledger_id`, `xrpl.consensus.ledger.seq`, + `xrpl.consensus.mode`. + - Store the round span in `Consensus` as a member (see Task 4a.3). + - If a previous round's span context is available, add a **span link** + (follows-from) to establish the round chain. + +- Add `createDeterministicTraceId(hash)` utility to + `include/xrpl/telemetry/Telemetry.h` (returns 16-byte trace ID from a + 256-bit hash by truncation). + +- Add `consensus_trace_strategy` to `Telemetry::Setup` and + `TelemetryConfig.cpp` parser: + ```cpp + /** Cross-node correlation strategy: "deterministic" or "attribute". 
*/ + std::string consensusTraceStrategy = "deterministic"; + ``` + +**Key modified files**: + +- `src/xrpld/app/consensus/RCLConsensus.cpp` +- `include/xrpl/telemetry/Telemetry.h` — `createDeterministicTraceId()` +- `src/libxrpl/telemetry/TelemetryConfig.cpp` — parse new config option + +--- + +## Task 4a.3: Span Members in `Consensus.h` + +**Objective**: Add span storage to the `Consensus` class so that spans created +in `startRound()` (adaptor) are accessible from `phaseEstablish()`, +`updateOurPositions()`, and `haveConsensus()` (template methods). + +**What to do**: + +- Add to `Consensus` private members (guarded by `#ifdef XRPL_ENABLE_TELEMETRY`): + ```cpp + #ifdef XRPL_ENABLE_TELEMETRY + std::optional<xrpl::telemetry::SpanGuard> roundSpan_; + std::optional<xrpl::telemetry::SpanGuard> establishSpan_; + opentelemetry::context::Context prevRoundContext_; + #endif + ``` +- `roundSpan_` is created in `startRound()` via the adaptor and stored. + Its `SpanGuard::Scope` member keeps the span active on the thread context + for the entire round lifetime. +- `establishSpan_` is created when entering phaseEstablish and cleared on accept. + It becomes a child of `roundSpan_` via OTel's thread-local context propagation. +- `prevRoundContext_` stores the previous round's context for follows-from links. + +**Threading assumption**: `startRound()`, `phaseEstablish()`, `updateOurPositions()`, +and `haveConsensus()` all run on the same thread (the consensus job queue thread). +This is required for the `SpanGuard::Scope`-based parent-child hierarchy to work. +The `Consensus` class documentation confirms it is NOT thread-safe and calls are +serialized by the application.
+ +- Add conditional include at top of `Consensus.h`: + ```cpp + #ifdef XRPL_ENABLE_TELEMETRY + #include <xrpl/telemetry/SpanGuard.h> + #include <xrpl/telemetry/Telemetry.h> + #endif + ``` + +**Key modified files**: + +- `src/xrpld/consensus/Consensus.h` + +--- + +## Task 4a.4: Instrument `phaseEstablish()` + +**Objective**: Create `consensus.establish` span wrapping the establish phase, +with attributes for convergence progress. + +**What to do**: + +- At the start of `phaseEstablish()` (line 1298), if `establishSpan_` is not + yet created, create it as child of `roundSpan_` using the **direct API** + (NOT the `XRPL_TRACE_CONSENSUS` macro, which creates a local variable): + + ```cpp + #ifdef XRPL_ENABLE_TELEMETRY + if (!establishSpan_ && adaptor_.getTelemetry().shouldTraceConsensus()) + { + establishSpan_.emplace( + adaptor_.getTelemetry().startSpan("consensus.establish")); + } + #endif + ``` + +- Set attributes on each call: + - `xrpl.consensus.converge_percent` — `convergePercent_` + - `xrpl.consensus.establish_count` — `establishCounter_` + - `xrpl.consensus.proposers` — `currPeerPositions_.size()` + +- On phase exit (transition to accept), close the establish span and record + final duration. + +**Key modified files**: + +- `src/xrpld/consensus/Consensus.h` — `phaseEstablish()` method + +--- + +## Task 4a.5: Instrument `updateOurPositions()` + +**Objective**: Trace each position update cycle including dispute resolution +details. + +**What to do**: + +- At the start of `updateOurPositions()` (line 1418), create a scoped child + span.
This method is called and returns within a single `phaseEstablish()` + call, so the `XRPL_TRACE_CONSENSUS` macro works here (scoped local): + + ```cpp + XRPL_TRACE_CONSENSUS(adaptor_.getTelemetry(), "consensus.update_positions"); + ``` + +- Set attributes: + - `xrpl.consensus.disputes_count` — `result_->disputes.size()` + - `xrpl.consensus.converge_percent` — current convergence + - `xrpl.consensus.proposers_agreed` — count of peers with same position + - `xrpl.consensus.proposers_total` — total peer positions + +- Inside the dispute resolution loop, for each dispute that changes our vote, + add an **event** with attributes using `XRPL_TRACE_ADD_EVENT` (from Task 4a.0): + ```cpp + XRPL_TRACE_ADD_EVENT("dispute.resolve", { + {"xrpl.tx.id", std::string(tx_id)}, + {"xrpl.dispute.our_vote", our_vote}, + {"xrpl.dispute.yays", static_cast<int64_t>(yays)}, + {"xrpl.dispute.nays", static_cast<int64_t>(nays)} + }); + ``` + +**Key modified files**: + +- `src/xrpld/consensus/Consensus.h` — `updateOurPositions()` method + +--- + +## Task 4a.6: Instrument `haveConsensus()` (Threshold & Convergence) + +**Objective**: Trace consensus checking including threshold escalation +(`ConsensusParms::AvalancheState::{init, mid, late, stuck}`). + +**What to do**: + +- At the start of `haveConsensus()` (line 1598), create a scoped child span: + + ```cpp + XRPL_TRACE_CONSENSUS(adaptor_.getTelemetry(), "consensus.check"); + ``` + +- Set attributes: + - `xrpl.consensus.agree_count` — peers that agree with our position + - `xrpl.consensus.disagree_count` — peers that disagree + - `xrpl.consensus.converge_percent` — convergence percentage + - `xrpl.consensus.result` — ConsensusState result (Yes/No/MovedOn) + +- The free function `checkConsensus()` in `Consensus.cpp` (line 151) determines + thresholds based on `currentAgreeTime`. Threshold values come from + `ConsensusParms::avalancheCutoffs` (defined in `ConsensusParms.h`). + The escalation states are `ConsensusParms::AvalancheState::{init, mid, late, stuck}`.
+ Record the effective threshold as an attribute on the span: + - `xrpl.consensus.threshold_percent` — current threshold from `avalancheCutoffs` + +**Key modified files**: + +- `src/xrpld/consensus/Consensus.h` — `haveConsensus()` method + +--- + +## Task 4a.7: Instrument Mode Changes + +**Objective**: Trace consensus mode transitions (proposing ↔ observing, +wrongLedger, switchedLedger). + +**What to do**: + +Mode changes are rare (typically 0-1 per round), so a **standalone short-lived +span** is appropriate (not an event). This captures timing of the mode change +itself. + +- In `RCLConsensus::Adaptor::onModeChange()`, create a scoped span: + + ```cpp + XRPL_TRACE_CONSENSUS(app_.getTelemetry(), "consensus.mode_change"); + XRPL_TRACE_SET_ATTR("xrpl.consensus.mode.old", to_string(before).c_str()); + XRPL_TRACE_SET_ATTR("xrpl.consensus.mode.new", to_string(after).c_str()); + ``` + +- Note: `MonitoredMode::set()` (line 304 in `Consensus.h`) calls + `adaptor_.onModeChange(before, after)` — so the span is created in the + adaptor, which already has telemetry access. No instrumentation needed + in `Consensus.h` for this task. + +**Key modified files**: + +- `src/xrpld/app/consensus/RCLConsensus.cpp` — `onModeChange()` + +--- + +## Task 4a.8: Reparent Existing Spans Under Round + +**Objective**: Make existing consensus spans (`consensus.accept`, +`consensus.accept.apply`, `consensus.validation.send`) children of the +`consensus.round` root span instead of being standalone. + +**What to do**: + +- The existing spans in `onAccept()`, `doAccept()`, and `validate()` use + `XRPL_TRACE_CONSENSUS(app_.getTelemetry(), ...)` which creates standalone + spans on the current thread's context. +- After Task 4a.2 creates the round span and stores it, `onAccept()` runs on + the consensus thread, but `doAccept()` and `validate()` run on the jtACCEPT + worker thread and cannot rely on that thread's active span. Verify the hierarchy. 
+- For `consensus.validation.send`: add a **span link** (follows-from) to the + round span context, since the validation may be processed after the round + completes. + +**Key modified files**: + +- `src/xrpld/app/consensus/RCLConsensus.cpp` — verify parent-child hierarchy + +--- + +## Task 4a.9: Build Verification and Testing + +**Objective**: Verify all Phase 4a changes compile cleanly with telemetry ON +and OFF, and don't affect consensus timing. + +**What to do**: + +1. Build with `telemetry=ON` — verify no compilation errors +2. Build with `telemetry=OFF` — verify macros expand to no-ops, no new includes + leak into `Consensus.h` when disabled +3. Run existing consensus unit tests +4. Verify `#ifdef XRPL_ENABLE_TELEMETRY` guards on all new members in + `Consensus.h` +5. Run `pccl` pre-commit checks + +**Verification Checklist**: + +- [x] Build succeeds with telemetry ON +- [x] Build succeeds with telemetry OFF +- [x] Existing consensus tests pass +- [x] `Consensus.h` has zero OTel includes when telemetry is OFF +- [x] No new virtual calls in hot consensus paths +- [x] `pccl` passes + +--- + +## Phase 4a Summary + +| Task | Description | New Files | Modified Files | Depends On | +| ---- | ------------------------------------------------ | --------- | -------------- | ---------- | +| 4a.0 | Prerequisites: extend SpanGuard & Telemetry APIs | 0 | 4 | Phase 4 | +| 4a.1 | Adaptor `getTelemetry()` method | 0 | 2 | Phase 4 | +| 4a.2 | Switchable round span with deterministic traceID | 0 | 3 | 4a.0, 4a.1 | +| 4a.3 | Span members in `Consensus.h` | 0 | 1 | 4a.1 | +| 4a.4 | Instrument `phaseEstablish()` | 0 | 1 | 4a.3 | +| 4a.5 | Instrument `updateOurPositions()` | 0 | 1 | 4a.0, 4a.3 | +| 4a.6 | Instrument `haveConsensus()` (thresholds) | 0 | 1 | 4a.3 | +| 4a.7 | Instrument mode changes | 0 | 1 | 4a.1 | +| 4a.8 | Reparent existing spans under round | 0 | 1 | 4a.0, 4a.2 | +| 4a.9 | Build verification and testing | 0 | 0 | 4a.0-4a.8 | + +**Parallel work**: Tasks 4a.0 and 
4a.1 can run in parallel. Tasks 4a.4, 4a.5, 4a.6, and 4a.7 can run in parallel after 4a.3 (and 4a.0 for 4a.5). + +### New Spans (Phase 4a) + +| Span Name | Location | Key Attributes | +| ---------------------------- | ------------------ | ---------------------------------------------------------------------------------- | +| `consensus.round` | `RCLConsensus.cpp` | `round_id`, `ledger_id`, `ledger.seq`, `mode`; link → prev round | +| `consensus.establish` | `Consensus.h` | `converge_percent`, `establish_count`, `proposers` | +| `consensus.update_positions` | `Consensus.h` | `disputes_count`, `converge_percent`, `proposers_agreed`, `proposers_total` | +| `consensus.check` | `Consensus.h` | `agree_count`, `disagree_count`, `converge_percent`, `result`, `threshold_percent` | +| `consensus.mode_change` | `RCLConsensus.cpp` | `mode.old`, `mode.new` | + +### New Events (Phase 4a) + +| Event Name | Parent Span | Attributes | +| ----------------- | ---------------------------- | ----------------------------------- | +| `dispute.resolve` | `consensus.update_positions` | `tx_id`, `our_vote`, `yays`, `nays` | + +### New Attributes (Phase 4a) + +```cpp +// Round-level (on consensus.round) +"xrpl.consensus.round_id" = int64 // Consensus round number +"xrpl.consensus.ledger_id" = string // previousLedger.id() hash +"xrpl.consensus.trace_strategy" = string // "deterministic" or "attribute" + +// Establish-level +"xrpl.consensus.converge_percent" = int64 // Convergence % (0-100+) +"xrpl.consensus.establish_count" = int64 // Number of establish iterations +"xrpl.consensus.disputes_count" = int64 // Active disputes +"xrpl.consensus.proposers_agreed" = int64 // Peers agreeing with us +"xrpl.consensus.proposers_total" = int64 // Total peer positions +"xrpl.consensus.agree_count" = int64 // Peers that agree (haveConsensus) +"xrpl.consensus.disagree_count" = int64 // Peers that disagree +"xrpl.consensus.threshold_percent" = int64 // Current threshold (50/65/70/95) 
+"xrpl.consensus.result" = string // "yes", "no", "moved_on" + +// Mode change +"xrpl.consensus.mode.old" = string // Previous mode +"xrpl.consensus.mode.new" = string // New mode +``` + +### Implementation Notes + +- **Separation of concerns**: All non-trivial telemetry code extracted to private + helpers (`startRoundTracing`, `createValidationSpan`, `startEstablishTracing`, + `updateEstablishTracing`, `endEstablishTracing`). Business logic methods contain + only single-line `#ifdef` blocks calling these helpers. +- **Thread safety**: `createValidationSpan()` runs on the jtACCEPT worker thread. + Instead of accessing `roundSpan_` across threads, a `roundSpanContext_` snapshot + (lightweight `SpanContext` value type) is captured on the consensus thread in + `startRoundTracing()` and read by `createValidationSpan()`. The job queue + provides the happens-before guarantee. +- **Macro safety**: `XRPL_TRACE_ADD_EVENT` uses `do { } while (0)` to prevent + dangling-else issues. +- **Config validation**: `consensus_trace_strategy` is validated to be either + `"deterministic"` or `"attribute"`, falling back to `"deterministic"` for + unrecognised values. +- **Plan deviation**: `roundSpan_` is stored in `RCLConsensus::Adaptor` (not + `Consensus.h`) because the adaptor has access to telemetry config and can + implement the deterministic trace ID strategy. `establishSpan_` is correctly + in `Consensus.h` as planned. + +--- + +# Phase 4b: Cross-Node Propagation (Future — Documentation Only) + +> **Goal**: Wire `TraceContextPropagator` for P2P messages so that proposals +> and validations carry trace context between nodes. This enables true +> distributed tracing where a proposal sent by Node A creates a child span +> on Node B. +> +> **Status**: NOT IMPLEMENTED. The protobuf fields and propagator class exist +> but are not wired. This section documents the design for future work. 
+ +## Architecture + +``` +Node A (proposing) Node B (receiving) +───────────────── ────────────────── +consensus.round consensus.round +├── propose() ├── peerProposal() +│ └── TraceContextPropagator │ └── TraceContextPropagator +│ ::injectToProtobuf( │ ::extractFromProtobuf( +│ TMProposeSet.trace_context) │ TMProposeSet.trace_context) +│ │ └── span link → Node A's context +└── validate() └── onValidation() + └── inject into TMValidation └── extract from TMValidation +``` + +## Wiring Points + +| Message | Inject Location | Extract Location | Protobuf Field | +| --------------- | ---------------------------------- | ----------------------------------- | -------------------------- | +| `TMProposeSet` | `Adaptor::propose()` | `PeerImp::onMessage(TMProposeSet)` | field 1001: `TraceContext` | +| `TMValidation` | `Adaptor::validate()` | `PeerImp::onMessage(TMValidation)` | field 1001: `TraceContext` | +| `TMTransaction` | `NetworkOPs::processTransaction()` | `PeerImp::onMessage(TMTransaction)` | field 1001: `TraceContext` | + +## Span Link Semantics + +Received messages use **span links** (follows-from), NOT parent-child: + +- The receiver's processing span links to the sender's context +- This preserves each node's independent trace tree +- Cross-node correlation visible via linked traces in Tempo/Jaeger + +## Interaction with Deterministic Trace ID (Strategy A) + +When using deterministic trace_id (Phase 4a default), cross-node spans already +share the same trace_id. P2P propagation adds **span-level** linking: + +- Without propagation: spans from different nodes appear in the same trace + (same trace_id) but without parent-child or follows-from relationships. +- With propagation: spans have explicit links showing which proposal/validation + from Node A caused processing on Node B. 
+ +## Prerequisites + +- Phase 4a (this task list) — establish phase tracing must be in place +- `TraceContextPropagator` class (already exists in + `include/xrpl/telemetry/TraceContextPropagator.h`) +- Protobuf `TraceContext` message (already exists, field 1001) diff --git a/cspell.config.yaml b/cspell.config.yaml index efac79ffaa..b9af25a112 100644 --- a/cspell.config.yaml +++ b/cspell.config.yaml @@ -220,6 +220,7 @@ words: - qalloc - queuable - Raphson + - reparent - replayer - rerere - retriable diff --git a/docker/telemetry/grafana/provisioning/datasources/tempo.yaml b/docker/telemetry/grafana/provisioning/datasources/tempo.yaml index 188a5e095b..27b6596b0c 100644 --- a/docker/telemetry/grafana/provisioning/datasources/tempo.yaml +++ b/docker/telemetry/grafana/provisioning/datasources/tempo.yaml @@ -8,6 +8,7 @@ # Phase 1b (infra): Base filters — node identity, service, span name, status. # Phase 2 (RPC): RPC command, status, role filters. # Phase 3 (TX): Transaction hash, local/peer origin, status. +# Phase 4 (Cons): Consensus mode, round, ledger sequence, close time. 
apiVersion: 1 @@ -134,3 +135,34 @@ datasources: operator: "=" scope: span type: dynamic + # Phase 4: Consensus tracing filters + - id: consensus-mode + tag: xrpl.consensus.mode + operator: "=" + scope: span + type: static + - id: consensus-round + tag: xrpl.consensus.round + operator: "=" + scope: span + type: dynamic + - id: consensus-ledger-seq + tag: xrpl.consensus.ledger.seq + operator: "=" + scope: span + type: static + - id: consensus-close-time-correct + tag: xrpl.consensus.close_time_correct + operator: "=" + scope: span + type: dynamic + - id: consensus-state + tag: xrpl.consensus.state + operator: "=" + scope: span + type: dynamic + - id: consensus-close-resolution + tag: xrpl.consensus.close_resolution_ms + operator: "=" + scope: span + type: dynamic diff --git a/include/xrpl/telemetry/SpanGuard.h b/include/xrpl/telemetry/SpanGuard.h index 3cc11f7654..438d766335 100644 --- a/include/xrpl/telemetry/SpanGuard.h +++ b/include/xrpl/telemetry/SpanGuard.h @@ -118,8 +118,10 @@ #include #include +#include #include #include +#include namespace xrpl::telemetry { @@ -131,6 +133,11 @@ namespace xrpl::telemetry { */ enum class TraceCategory { Rpc, Transactions, Consensus, Peer, Ledger }; +/** Key-value pair for span event attributes. + Used by addEvent(name, attrs) to attach structured metadata to events. +*/ +using EventAttribute = std::pair; + /** Opaque wrapper for an OTel context snapshot. Used to propagate trace context across threads. Created by @@ -328,6 +335,14 @@ public: void addEvent(std::string_view name); + /** Add a named event with key-value attributes to the span's timeline. + No-op on a null guard. + @param name Event name. + @param attrs Attribute pairs (all string_view for simplicity). + */ + void + addEvent(std::string_view name, std::initializer_list attrs); + /** Record an exception as a span event following OTel semantic conventions, and mark the span status as error. No-op on a null guard. 
@@ -452,6 +467,10 @@ public: { } void + addEvent(std::string_view, std::initializer_list) + { + } + void recordException(std::exception const&) { } diff --git a/include/xrpl/telemetry/Telemetry.h b/include/xrpl/telemetry/Telemetry.h index d3b729815a..c74fc3bb7b 100644 --- a/include/xrpl/telemetry/Telemetry.h +++ b/include/xrpl/telemetry/Telemetry.h @@ -188,6 +188,13 @@ public: /** Enable tracing for ledger close/accept. */ bool traceLedger = true; + + /** Strategy for cross-node consensus trace correlation. + "deterministic" — derive trace_id from ledger hash so all + validators in the same round share the same trace_id. + "attribute" — random trace_id, correlate via ledger_id attribute. + */ + std::string consensusTraceStrategy = "deterministic"; }; virtual ~Telemetry() = default; @@ -245,6 +252,10 @@ public: virtual bool shouldTraceLedger() const = 0; + /** @return The configured consensus trace correlation strategy. */ + virtual std::string const& + getConsensusTraceStrategy() const = 0; + #ifdef XRPL_ENABLE_TELEMETRY /** Get or create a named tracer instance. 
diff --git a/src/libxrpl/telemetry/NullTelemetry.cpp b/src/libxrpl/telemetry/NullTelemetry.cpp index 6d57f77c69..51c134075e 100644 --- a/src/libxrpl/telemetry/NullTelemetry.cpp +++ b/src/libxrpl/telemetry/NullTelemetry.cpp @@ -84,6 +84,12 @@ public: return false; } + std::string const& + getConsensusTraceStrategy() const override + { + return setup_.consensusTraceStrategy; + } + #ifdef XRPL_ENABLE_TELEMETRY opentelemetry::nostd::shared_ptr getTracer(std::string_view) override diff --git a/src/libxrpl/telemetry/SpanGuard.cpp b/src/libxrpl/telemetry/SpanGuard.cpp index dd5997a2b5..6a037eda62 100644 --- a/src/libxrpl/telemetry/SpanGuard.cpp +++ b/src/libxrpl/telemetry/SpanGuard.cpp @@ -43,6 +43,7 @@ #include #include #include +#include namespace xrpl { namespace telemetry { @@ -298,6 +299,40 @@ SpanGuard::hashSpan( return SpanGuard(std::make_unique(tel->startSpan(std::string(name), parentCtx))); } +// ===== Hash-derived span (generic, category-gated) ========================= + +SpanGuard +SpanGuard::hashSpan( + TraceCategory cat, + std::string_view name, + std::uint8_t const* hashData, + std::size_t hashSize) +{ + if (hashSize < 16) + return {}; + auto* tel = Telemetry::getInstance(); + if (!tel || !tel->isEnabled() || !isCategoryEnabled(*tel, cat)) + return {}; + + otel_trace::TraceId traceId(opentelemetry::nostd::span(hashData, 16)); + + std::uint8_t spanIdBytes[8]; + std::random_device rd; + for (auto& b : spanIdBytes) + b = static_cast(rd()); + otel_trace::SpanId spanId(opentelemetry::nostd::span(spanIdBytes, 8)); + + otel_trace::SpanContext syntheticCtx( + traceId, spanId, otel_trace::TraceFlags(1), /* remote = */ false); + + auto parentCtx = opentelemetry::context::Context{}.SetValue( + otel_trace::kSpanKey, + opentelemetry::nostd::shared_ptr( + new otel_trace::DefaultSpan(syntheticCtx))); + + return SpanGuard(std::make_unique(tel->startSpan(std::string(name), parentCtx))); +} + // ===== Context capture ===================================================== 
SpanContext @@ -370,6 +405,19 @@ SpanGuard::addEvent(std::string_view name) impl_->span->AddEvent(std::string(name)); } +void +SpanGuard::addEvent(std::string_view name, std::initializer_list attrs) +{ + if (!impl_) + return; + // Own the strings to ensure lifetime safety through the AddEvent call. + std::vector> owned; + owned.reserve(attrs.size()); + for (auto const& [k, v] : attrs) + owned.emplace_back(std::string(k), std::string(v)); + impl_->span->AddEvent(std::string(name), owned); +} + void SpanGuard::recordException(std::exception const& e) { diff --git a/src/libxrpl/telemetry/Telemetry.cpp b/src/libxrpl/telemetry/Telemetry.cpp index 1aba913b25..fb70fd66db 100644 --- a/src/libxrpl/telemetry/Telemetry.cpp +++ b/src/libxrpl/telemetry/Telemetry.cpp @@ -192,6 +192,12 @@ public: return false; } + std::string const& + getConsensusTraceStrategy() const override + { + return setup_.consensusTraceStrategy; + } + opentelemetry::nostd::shared_ptr getTracer(std::string_view) override { @@ -359,6 +365,12 @@ public: return setup_.traceLedger; } + std::string const& + getConsensusTraceStrategy() const override + { + return setup_.consensusTraceStrategy; + } + opentelemetry::nostd::shared_ptr getTracer(std::string_view name) override { diff --git a/src/libxrpl/telemetry/TelemetryConfig.cpp b/src/libxrpl/telemetry/TelemetryConfig.cpp index 16a1461286..be93476ae3 100644 --- a/src/libxrpl/telemetry/TelemetryConfig.cpp +++ b/src/libxrpl/telemetry/TelemetryConfig.cpp @@ -75,6 +75,9 @@ setup_Telemetry( setup.tracePeer = section.value_or("trace_peer", 0) != 0; setup.traceLedger = section.value_or("trace_ledger", 1) != 0; + setup.consensusTraceStrategy = + section.value_or("consensus_trace_strategy", "deterministic"); + return setup; } diff --git a/src/tests/libxrpl/telemetry/SpanGuardFactory.cpp b/src/tests/libxrpl/telemetry/SpanGuardFactory.cpp index 89f6283bca..59ea205a69 100644 --- a/src/tests/libxrpl/telemetry/SpanGuardFactory.cpp +++ 
b/src/tests/libxrpl/telemetry/SpanGuardFactory.cpp @@ -1,4 +1,5 @@ #include +#include #include @@ -75,3 +76,26 @@ TEST(SpanGuardFactory, discard_safe_on_null) span.discard(); EXPECT_FALSE(span); } + +TEST(SpanGuardFactory, consensus_close_time_attributes) +{ + // Verify the consensus attribute pattern compiles and + // doesn't crash with null SpanGuard. + { + auto span = telemetry::SpanGuard::span( + telemetry::TraceCategory::Consensus, telemetry::seg::consensus, "accept.apply"); + span.setAttribute("xrpl.consensus.ledger.seq", static_cast(42)); + span.setAttribute("xrpl.consensus.close_time", static_cast(780000000)); + span.setAttribute("xrpl.consensus.close_time_correct", true); + span.setAttribute("xrpl.consensus.close_resolution_ms", static_cast(30000)); + span.setAttribute("xrpl.consensus.state", std::string("finished")); + span.setAttribute("xrpl.consensus.proposing", true); + span.setAttribute("xrpl.consensus.round_time_ms", static_cast(3500)); + } + { + auto span = telemetry::SpanGuard::span( + telemetry::TraceCategory::Consensus, telemetry::seg::consensus, "accept.apply"); + span.setAttribute("xrpl.consensus.close_time_correct", false); + span.setAttribute("xrpl.consensus.state", std::string("moved_on")); + } +} diff --git a/src/xrpld/app/consensus/ConsensusSpanNames.h b/src/xrpld/app/consensus/ConsensusSpanNames.h new file mode 100644 index 0000000000..d668d3df67 --- /dev/null +++ b/src/xrpld/app/consensus/ConsensusSpanNames.h @@ -0,0 +1,156 @@ +#pragma once + +/** Compile-time span name constants for consensus tracing. + * + * Used by RCLConsensus (app) and Consensus.h (template) for + * consensus lifecycle spans. Built on StaticStr/join() from SpanNames.h. 
+ * + * Span hierarchy: + * + * consensus.round (deterministic trace_id from ledger hash) + * | + * +-- consensus.proposal.send + * +-- consensus.ledger_close + * +-- consensus.establish + * +-- consensus.update_positions + * +-- consensus.check + * +-- consensus.accept + * +-- consensus.accept.apply (jtACCEPT thread) + * +-- consensus.validation.send (jtACCEPT thread, linked) + * +-- consensus.mode_change + */ + +#include + +namespace xrpl { +namespace telemetry { +namespace cons_span { + +// ===== Span name segments ==================================================== + +namespace op { +inline constexpr auto round = makeStr("round"); +inline constexpr auto proposalSend = makeStr("proposal.send"); +inline constexpr auto ledgerClose = makeStr("ledger_close"); +inline constexpr auto establish = makeStr("establish"); +inline constexpr auto updatePositions = makeStr("update_positions"); +inline constexpr auto check = makeStr("check"); +inline constexpr auto accept = makeStr("accept"); +inline constexpr auto acceptApply = makeStr("accept.apply"); +inline constexpr auto validationSend = makeStr("validation.send"); +inline constexpr auto modeChange = makeStr("mode_change"); +} // namespace op + +// ===== Full span names (prefix.op) =========================================== + +inline constexpr auto round = join(seg::consensus, op::round); +inline constexpr auto proposalSend = join(seg::consensus, op::proposalSend); +inline constexpr auto ledgerClose = join(seg::consensus, op::ledgerClose); +inline constexpr auto establish = join(seg::consensus, op::establish); +inline constexpr auto updatePositions = join(seg::consensus, op::updatePositions); +inline constexpr auto check = join(seg::consensus, op::check); +inline constexpr auto accept = join(seg::consensus, op::accept); +inline constexpr auto acceptApply = join(seg::consensus, op::acceptApply); +inline constexpr auto validationSend = join(seg::consensus, op::validationSend); +inline constexpr auto modeChange = 
join(seg::consensus, op::modeChange); + +// ===== Attribute keys ======================================================== + +namespace attr { +inline constexpr auto xrplConsensus = join(seg::xrpl, seg::consensus); + +/// "xrpl.consensus.ledger_id" +inline constexpr auto ledgerId = join(xrplConsensus, makeStr("ledger_id")); +/// "xrpl.consensus.ledger.seq" +inline constexpr auto ledgerSeq = join(xrplConsensus, makeStr("ledger.seq")); +/// "xrpl.consensus.mode" +inline constexpr auto mode = join(xrplConsensus, makeStr("mode")); +/// "xrpl.consensus.round" +inline constexpr auto round = join(xrplConsensus, makeStr("round")); +/// "xrpl.consensus.proposers" +inline constexpr auto proposers = join(xrplConsensus, makeStr("proposers")); +/// "xrpl.consensus.round_time_ms" +inline constexpr auto roundTimeMs = join(xrplConsensus, makeStr("round_time_ms")); +/// "xrpl.consensus.proposing" +inline constexpr auto proposing = join(xrplConsensus, makeStr("proposing")); +/// "xrpl.consensus.state" +inline constexpr auto state = join(xrplConsensus, makeStr("state")); + +// Close time attributes +/// "xrpl.consensus.close_time" +inline constexpr auto closeTime = join(xrplConsensus, makeStr("close_time")); +/// "xrpl.consensus.close_time_correct" +inline constexpr auto closeTimeCorrect = join(xrplConsensus, makeStr("close_time_correct")); +/// "xrpl.consensus.close_resolution_ms" +inline constexpr auto closeResolutionMs = join(xrplConsensus, makeStr("close_resolution_ms")); +/// "xrpl.consensus.parent_close_time" +inline constexpr auto parentCloseTime = join(xrplConsensus, makeStr("parent_close_time")); +/// "xrpl.consensus.close_time_self" +inline constexpr auto closeTimeSelf = join(xrplConsensus, makeStr("close_time_self")); +/// "xrpl.consensus.close_time_vote_bins" +inline constexpr auto closeTimeVoteBins = join(xrplConsensus, makeStr("close_time_vote_bins")); +/// "xrpl.consensus.resolution_direction" +inline constexpr auto resolutionDirection = join(xrplConsensus, 
makeStr("resolution_direction")); + +// Establish/convergence attributes +/// "xrpl.consensus.converge_percent" +inline constexpr auto convergePercent = join(xrplConsensus, makeStr("converge_percent")); +/// "xrpl.consensus.establish_count" +inline constexpr auto establishCount = join(xrplConsensus, makeStr("establish_count")); +/// "xrpl.consensus.proposers_agreed" +inline constexpr auto proposersAgreed = join(xrplConsensus, makeStr("proposers_agreed")); + +// Consensus check attributes +/// "xrpl.consensus.agree_count" +inline constexpr auto agreeCount = join(xrplConsensus, makeStr("agree_count")); +/// "xrpl.consensus.disagree_count" +inline constexpr auto disagreeCount = join(xrplConsensus, makeStr("disagree_count")); +/// "xrpl.consensus.threshold_percent" +inline constexpr auto thresholdPercent = join(xrplConsensus, makeStr("threshold_percent")); +/// "xrpl.consensus.result" +inline constexpr auto result = join(xrplConsensus, makeStr("result")); +/// "xrpl.consensus.quorum" +inline constexpr auto quorum = join(xrplConsensus, makeStr("quorum")); +/// "xrpl.consensus.validation_count" +inline constexpr auto validationCount = join(xrplConsensus, makeStr("validation_count")); + +// Trace strategy attribute +/// "xrpl.consensus.trace_strategy" +inline constexpr auto traceStrategy = join(xrplConsensus, makeStr("trace_strategy")); +/// "xrpl.consensus.round_id" +inline constexpr auto roundId = join(xrplConsensus, makeStr("round_id")); + +// Mode change attributes +/// "xrpl.consensus.mode.old" +inline constexpr auto modeOld = join(xrplConsensus, makeStr("mode.old")); +/// "xrpl.consensus.mode.new" +inline constexpr auto modeNew = join(xrplConsensus, makeStr("mode.new")); + +// Dispute event attributes +/// "xrpl.tx.id" +inline constexpr auto txId = join(join(seg::xrpl, seg::tx), makeStr("id")); +/// "xrpl.dispute.our_vote" +inline constexpr auto disputeOurVote = + join(join(seg::xrpl, makeStr("dispute")), makeStr("our_vote")); +/// "xrpl.dispute.yays" +inline 
constexpr auto disputeYays = join(join(seg::xrpl, makeStr("dispute")), makeStr("yays")); +/// "xrpl.dispute.nays" +inline constexpr auto disputeNays = join(join(seg::xrpl, makeStr("dispute")), makeStr("nays")); +} // namespace attr + +// ===== Attribute values ====================================================== + +namespace val { +inline constexpr auto finished = makeStr("finished"); +inline constexpr auto movedOn = makeStr("moved_on"); +inline constexpr auto yes = makeStr("yes"); +inline constexpr auto no = makeStr("no"); +inline constexpr auto expired = makeStr("expired"); +inline constexpr auto increased = makeStr("increased"); +inline constexpr auto decreased = makeStr("decreased"); +inline constexpr auto unchanged = makeStr("unchanged"); +} // namespace val + +} // namespace cons_span +} // namespace telemetry +} // namespace xrpl diff --git a/src/xrpld/app/consensus/RCLConsensus.cpp b/src/xrpld/app/consensus/RCLConsensus.cpp index 6d99c2ee15..012f445b22 100644 --- a/src/xrpld/app/consensus/RCLConsensus.cpp +++ b/src/xrpld/app/consensus/RCLConsensus.cpp @@ -1,3 +1,4 @@ +#include #include #include @@ -225,6 +226,11 @@ RCLConsensus::Adaptor::share(RCLCxTx const& tx) void RCLConsensus::Adaptor::propose(RCLCxPeerPos::Proposal const& proposal) { + auto span = telemetry::SpanGuard::span( + telemetry::TraceCategory::Consensus, telemetry::seg::consensus, "proposal.send"); + span.setAttribute( + telemetry::cons_span::attr::round, static_cast(proposal.proposeSeq())); + JLOG(j_.trace()) << (proposal.isBowOut() ? 
"We bow out: " : "We propose: ") << xrpl::to_string(proposal.prevLedger()) << " -> " << xrpl::to_string(proposal.position()); @@ -327,6 +333,13 @@ RCLConsensus::Adaptor::onClose( NetClock::time_point const& closeTime, ConsensusMode mode) -> Result { + auto span = telemetry::SpanGuard::span( + telemetry::TraceCategory::Consensus, telemetry::seg::consensus, "ledger_close"); + span.setAttribute( + telemetry::cons_span::attr::ledgerSeq, + static_cast(ledger.ledger_->header().seq + 1)); + span.setAttribute(telemetry::cons_span::attr::mode, to_string(mode).c_str()); + bool const wrongLCL = mode == ConsensusMode::wrongLedger; bool const proposing = mode == ConsensusMode::proposing; @@ -435,6 +448,18 @@ RCLConsensus::Adaptor::onAccept( Json::Value&& consensusJson, bool const validating) { + { + auto span = telemetry::SpanGuard::span( + telemetry::TraceCategory::Consensus, telemetry::seg::consensus, "accept"); + span.setAttribute( + telemetry::cons_span::attr::proposers, static_cast(result.proposers)); + span.setAttribute( + telemetry::cons_span::attr::roundTimeMs, + static_cast(result.roundTime.read().count())); + span.setAttribute( + telemetry::cons_span::attr::quorum, static_cast(result.proposers)); + } + app_.getJobQueue().addJob( jtACCEPT, "AcceptLedger", @@ -486,6 +511,41 @@ RCLConsensus::Adaptor::doAccept( closeTimeCorrect = true; } + auto doAcceptSpan = telemetry::SpanGuard::span( + telemetry::TraceCategory::Consensus, telemetry::seg::consensus, "accept.apply"); + doAcceptSpan.setAttribute( + telemetry::cons_span::attr::ledgerSeq, static_cast(prevLedger.seq() + 1)); + doAcceptSpan.setAttribute( + telemetry::cons_span::attr::closeTime, + static_cast(consensusCloseTime.time_since_epoch().count())); + doAcceptSpan.setAttribute(telemetry::cons_span::attr::closeTimeCorrect, closeTimeCorrect); + doAcceptSpan.setAttribute( + telemetry::cons_span::attr::closeResolutionMs, + static_cast( + std::chrono::duration_cast(closeResolution).count())); + doAcceptSpan.setAttribute( + 
telemetry::cons_span::attr::state, std::string(consensusFail ? "moved_on" : "finished")); + doAcceptSpan.setAttribute(telemetry::cons_span::attr::proposing, proposing); + doAcceptSpan.setAttribute( + telemetry::cons_span::attr::roundTimeMs, + static_cast(result.roundTime.read().count())); + doAcceptSpan.setAttribute( + telemetry::cons_span::attr::parentCloseTime, + static_cast(prevLedger.closeTime().time_since_epoch().count())); + doAcceptSpan.setAttribute( + telemetry::cons_span::attr::closeTimeSelf, + static_cast(rawCloseTimes.self.time_since_epoch().count())); + doAcceptSpan.setAttribute( + telemetry::cons_span::attr::closeTimeVoteBins, + static_cast(rawCloseTimes.peers.size())); + { + auto const prevRes = prevLedger.closeTimeResolution(); + std::string dir = (closeResolution > prevRes) ? "increased" + : (closeResolution < prevRes) ? "decreased" + : "unchanged"; + doAcceptSpan.setAttribute(telemetry::cons_span::attr::resolutionDirection, std::move(dir)); + } + JLOG(j_.debug()) << "Report: Prop=" << (proposing ? "yes" : "no") << " val=" << (validating_ ? "yes" : "no") << " corLCL=" << (haveCorrectLCL ? 
"yes" : "no") @@ -803,6 +863,14 @@ RCLConsensus::Adaptor::buildLCL( void RCLConsensus::Adaptor::validate(RCLCxLedger const& ledger, RCLTxSet const& txns, bool proposing) { + /* Validation-send span linked to the current round; the optional is empty when no valid round context has been captured. */ auto valSpan = createValidationSpan(); + if (valSpan) + { + valSpan->setAttribute( + telemetry::cons_span::attr::ledgerSeq, static_cast(ledger.seq())); + valSpan->setAttribute(telemetry::cons_span::attr::proposing, proposing); + } + using namespace std::chrono_literals; auto validationTime = app_.getTimeKeeper().closeTime(); @@ -890,6 +958,11 @@ RCLConsensus::Adaptor::validate(RCLCxLedger const& ledger, RCLTxSet const& txns, void RCLConsensus::Adaptor::onModeChange(ConsensusMode before, ConsensusMode after) { + /* Local mode_change span tagged with the previous and the new consensus mode; being a local guard it is released at scope exit. */ auto span = telemetry::SpanGuard::span( + telemetry::TraceCategory::Consensus, telemetry::seg::consensus, "mode_change"); + span.setAttribute(telemetry::cons_span::attr::modeOld, to_string(before).c_str()); + span.setAttribute(telemetry::cons_span::attr::modeNew, to_string(after).c_str()); + JLOG(j_.info()) << "Consensus mode change before=" << to_string(before) << ", after=" << to_string(after); @@ -1012,6 +1085,8 @@ RCLConsensus::Adaptor::preStartRound(RCLCxLedger const& prevLgr, hash_setcaptureContext(); + roundSpan_.reset(); + } + + auto const& strategy = app_.getTelemetry().getConsensusTraceStrategy(); + + /* The deterministic strategy passes the previous ledger id bytes to hashSpan; any other value falls back to a regular span. */ if (strategy == "deterministic") + { + roundSpan_.emplace( + SpanGuard::hashSpan( + TraceCategory::Consensus, + cons_span::round, + prevLgr.id().data(), + prevLgr.id().bytes)); + } + else + { + roundSpan_.emplace(SpanGuard::span(TraceCategory::Consensus, seg::consensus, "round")); + } + + /* Stop here when the newly created round span tests false; no attributes are set and roundSpanContext_ is not refreshed. */ if (!*roundSpan_) + return; + + /* NOTE(review): emplace() below destroys the span created above and replaces it with the linked one; confirm the linked span keeps the hash-derived trace id when the strategy is deterministic. */ if (prevRoundContext_.isValid()) + { + // Create a linked span to establish follows-from relationship + // between consecutive rounds, then transfer to roundSpan_.
+ auto linked = SpanGuard::linkedSpan(cons_span::round, prevRoundContext_); + if (linked) + { + roundSpan_.emplace(std::move(linked)); + } + } + + /* ledgerId is the previous ledger id; ledgerSeq and roundId both carry prevLgr.seq() + 1. */ roundSpan_->setAttribute(cons_span::attr::ledgerId, to_string(prevLgr.id()).c_str()); + roundSpan_->setAttribute(cons_span::attr::ledgerSeq, static_cast(prevLgr.seq() + 1)); + roundSpan_->setAttribute(cons_span::attr::mode, to_string(mode_.load()).c_str()); + roundSpan_->setAttribute(cons_span::attr::traceStrategy, strategy.c_str()); + roundSpan_->setAttribute(cons_span::attr::roundId, static_cast(prevLgr.seq() + 1)); + + /* Snapshot the round context; createValidationSpan() links against this copy. */ roundSpanContext_ = roundSpan_->captureContext(); +} + +std::optional +RCLConsensus::Adaptor::createValidationSpan() +{ + using namespace telemetry; + + /* Without a valid round context there is nothing to link the validation span to. */ if (!roundSpanContext_.isValid()) + return std::nullopt; + + return SpanGuard::linkedSpan(cons_span::validationSend, roundSpanContext_); +} + void RCLConsensus::startRound( NetClock::time_point const& now, diff --git a/src/xrpld/app/consensus/RCLConsensus.h b/src/xrpld/app/consensus/RCLConsensus.h index c965ed3d87..c3e804332c 100644 --- a/src/xrpld/app/consensus/RCLConsensus.h +++ b/src/xrpld/app/consensus/RCLConsensus.h @@ -12,10 +12,12 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -68,6 +70,31 @@ class RCLConsensus RCLCensorshipDetector censorshipDetector_; NegativeUNLVote nUnlVote_; + /** Span for the current consensus round. + * + * Created in preStartRound(), ended (via reset()) when the next + * round begins. When consensusTraceStrategy is "deterministic", + * the trace_id is derived from previousLedger.id() so that all + * validators in the same round share the same trace_id. + */ + std::optional roundSpan_; + + /** Context captured from the previous consensus round. + * + * Used to create span links (follows-from) between consecutive + * rounds, establishing a causal chain in the trace backend.
+ */ + telemetry::SpanContext prevRoundContext_; + + /** SpanContext snapshot of the current round span. + * + * Captured in startRoundTracing() as a lightweight value-type copy + * so that createValidationSpan() — which runs on the jtACCEPT + * worker thread — can build span links without accessing roundSpan_ + * across threads. + */ + telemetry::SpanContext roundSpanContext_; + public: using Ledger_t = RCLCxLedger; using NodeID_t = NodeID; @@ -156,6 +183,27 @@ class RCLConsensus return parms_; } + /** Set up the consensus round span and link it to the previous round. + * + * Saves the previous round's context for span-link construction, + * ends the old round span, and creates a new "consensus.round" span. + * Depending on the configured trace strategy the trace_id is either + * deterministic (derived from prevLgr hash) or random. + * + * @param prevLgr The ledger that will be the prior ledger for the + * new round. + */ + void + startRoundTracing(RCLCxLedger const& prevLgr); + + /** Create the "consensus.validation.send" span linked to the round. + * + * @return An engaged optional SpanGuard when roundSpanContext_ is valid, + * std::nullopt otherwise. + */ + std::optional + createValidationSpan(); + private: //--------------------------------------------------------------------- // The following members implement the generic Consensus requirements diff --git a/src/xrpld/consensus/Consensus.h b/src/xrpld/consensus/Consensus.h index 9edbebd429..5e41242322 100644 --- a/src/xrpld/consensus/Consensus.h +++ b/src/xrpld/consensus/Consensus.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -10,6 +11,7 @@ #include #include #include +#include #include #include @@ -601,6 +603,21 @@ private: // nodes that have bowed out of this consensus process hash_set deadNodes_; + /** Span for the establish phase of consensus. + * Created when the ledger closes and we enter phaseEstablish; + * cleared (ended) when consensus is reached.
+ */ + std::optional establishSpan_; + + /* Opens establishSpan_ unless one is already open. */ void + startEstablishTracing(); + + /* Refreshes the attributes on establishSpan_ when it is open. */ void + updateEstablishTracing(); + + /* Ends the establish span by resetting establishSpan_. */ void + endEstablishTracing(); + // Journal for debugging beast::Journal const j_; }; @@ -1327,6 +1344,8 @@ Consensus::phaseEstablish(std::unique_ptr const& clo XRPL_ASSERT(result_, "xrpl::Consensus::phaseEstablish : result is set"); // NOLINTBEGIN(bugprone-unchecked-optional-access) assert above + /* Runs on every phaseEstablish call; a no-op once the span is open. */ startEstablishTracing(); + ++peerUnchangedCounter_; ++establishCounter_; @@ -1354,6 +1373,8 @@ Consensus::phaseEstablish(std::unique_ptr const& clo updateOurPositions(clog); + updateEstablishTracing(); + // Nothing to do if too many laggards or we don't have consensus. if (shouldPause(clog) || !haveConsensus(clog)) return; @@ -1371,6 +1392,7 @@ Consensus::phaseEstablish(std::unique_ptr const& clo adaptor_.updateOperatingMode(currPeerPositions_.size()); prevProposers_ = currPeerPositions_.size(); prevRoundTime_ = result_->roundTime.read(); + endEstablishTracing(); phase_ = ConsensusPhase::accepted; JLOG(j_.debug()) << "transitioned to ConsensusPhase::accepted"; adaptor_.onAccept( @@ -1447,6 +1469,10 @@ Consensus::updateOurPositions(std::unique_ptr const& // We must have a position if we are updating it XRPL_ASSERT(result_, "xrpl::Consensus::updateOurPositions : result is set"); // NOLINTBEGIN(bugprone-unchecked-optional-access) assert above + /* Local span for this position update, tagged with the converge percentage and peer position count at entry. */ using namespace telemetry; + auto span = SpanGuard::span(TraceCategory::Consensus, seg::consensus, "update_positions"); + span.setAttribute(cons_span::attr::convergePercent, static_cast(convergePercent_)); + span.setAttribute(cons_span::attr::proposers, static_cast(currPeerPositions_.size())); ConsensusParms const& parms = adaptor_.parms(); // Compute a cutoff time @@ -1506,6 +1532,11 @@ Consensus::updateOurPositions(std::unique_ptr const& // now a no mutableSet->erase(txId); } + + /* Record the tx id and our vote for this dispute as a span event. */ span.addEvent( + "dispute.resolve", + {{cons_span::attr::txId, to_string(txId)}, + {cons_span::attr::disputeOurVote, dispute.getOurVote() ?
"yes" : "no"}}); } } @@ -1629,6 +1660,8 @@ Consensus::haveConsensus(std::unique_ptr const& clog // Must have a stance if we are checking for consensus XRPL_ASSERT(result_, "xrpl::Consensus::haveConsensus : has result"); // NOLINTBEGIN(bugprone-unchecked-optional-access) assert above + using namespace telemetry; + auto span = SpanGuard::span(TraceCategory::Consensus, seg::consensus, "check"); // CHECKME: should possibly count unacquired TX sets as disagreeing int agree = 0, disagree = 0; @@ -1728,6 +1761,17 @@ Consensus::haveConsensus(std::unique_ptr const& clog CLOG(clog) << "Unable to reach consensus " << Json::Compact{getJson(true)} << ". "; } + /* NOTE(review): these attributes are attached only at this point; any earlier return from this function leaves the check span without them - confirm intended. */ span.setAttribute(cons_span::attr::agreeCount, static_cast(agree)); + span.setAttribute(cons_span::attr::disagreeCount, static_cast(disagree)); + span.setAttribute(cons_span::attr::convergePercent, static_cast(convergePercent_)); + + char const* stateStr = "no"; + if (result_->state == ConsensusState::Yes) + stateStr = "yes"; + else if (result_->state == ConsensusState::MovedOn) + stateStr = "moved_on"; + span.setAttribute(cons_span::attr::result, stateStr); + CLOG(clog) << "Consensus has been reached.
"; // NOLINTEND(bugprone-unchecked-optional-access) return true; @@ -1849,4 +1893,36 @@ Consensus::asCloseTime(NetClock::time_point raw) const return roundCloseTime(raw, closeResolution_); } +template +void +Consensus::startEstablishTracing() +{ + if (establishSpan_) + return; + establishSpan_.emplace( + telemetry::SpanGuard::span( + telemetry::TraceCategory::Consensus, telemetry::seg::consensus, "establish")); +} + +template +void +Consensus::updateEstablishTracing() +{ + if (!establishSpan_) + return; + establishSpan_->setAttribute( + telemetry::cons_span::attr::convergePercent, static_cast(convergePercent_)); + establishSpan_->setAttribute( + telemetry::cons_span::attr::establishCount, static_cast(establishCounter_)); + establishSpan_->setAttribute( + telemetry::cons_span::attr::proposers, static_cast(currPeerPositions_.size())); +} + +template +void +Consensus::endEstablishTracing() +{ + establishSpan_.reset(); +} + } // namespace xrpl diff --git a/src/xrpld/consensus/DisputedTx.h b/src/xrpld/consensus/DisputedTx.h index aff4ccae68..2629feef5e 100644 --- a/src/xrpld/consensus/DisputedTx.h +++ b/src/xrpld/consensus/DisputedTx.h @@ -176,6 +176,20 @@ public: [[nodiscard]] Json::Value getJson() const; + //! Number of peers voting yes. + int + getYays() const + { + return yays_; + } + + //! Number of peers voting no. + int + getNays() const + { + return nays_; + } + private: int yays_{0}; //< Number of yes votes int nays_{0}; //< Number of no votes