feat(telemetry): add ledger.acquire span for inbound ledger fetch

InboundLedger drives ledger back-fill and fork recovery with timeout/retry
logic (kLedgerTimeoutRetriesMax = 6), but emitted only a global ledger_fetches
counter — sync/recovery cost was a telemetry blind spot.

Add a ledger.acquire span that wraps the acquisition lifecycle:
- Started in InboundLedger::init() with ledger_seq and acquire_reason
  (history / consensus / generic, mirroring InboundLedger::Reason).
- Finalized in InboundLedger::done() with outcome (complete / failed),
  timeouts, and peer_count, then reset so the span duration is exported.

The span is held as a std::optional<SpanGuard> member (same pattern as
RCLConsensus roundSpan_). New op/attr/val constants are added to
LedgerSpanNames.h. When telemetry is disabled, the SpanGuard fallback makes
all of this compile to a no-op.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Pratik Mankawde
2026-06-04 16:11:57 +01:00
parent 793d2ecfce
commit 864ac729de
3 changed files with 64 additions and 0 deletions

View File

@@ -6,8 +6,10 @@
#include <xrpl/basics/CountedObject.h>
#include <xrpl/ledger/Ledger.h>
#include <xrpl/telemetry/SpanGuard.h>
#include <mutex>
#include <optional>
#include <set>
#include <utility>
@@ -170,6 +172,12 @@ private:
receivedData_;
bool receiveDispatched_{false};
std::unique_ptr<PeerSet> peerSet_;
/// Span covering one acquire lifecycle. Empty until init() emplaces it;
/// done() annotates it with the outcome (complete/failed), timeout count,
/// and peer count, then resets it so the span ends.
/// Gives operators per-acquisition visibility into back-fill /
/// fork-recovery cost, which is otherwise reflected only in a global
/// fetch counter.
std::optional<telemetry::SpanGuard> acquireSpan_;
};
} // namespace xrpl

View File

@@ -4,6 +4,7 @@
#include <xrpld/app/ledger/InboundLedgers.h>
#include <xrpld/app/ledger/LedgerMaster.h>
#include <xrpld/app/ledger/TransactionStateSF.h>
#include <xrpld/app/ledger/detail/LedgerSpanNames.h>
#include <xrpld/app/ledger/detail/TimeoutCounter.h>
#include <xrpld/app/main/Application.h>
#include <xrpld/overlay/Message.h>
@@ -30,6 +31,8 @@
#include <xrpl/resource/Fees.h>
#include <xrpl/shamap/SHAMapNodeID.h>
#include <xrpl/shamap/SHAMapSyncFilter.h>
#include <xrpl/telemetry/SpanGuard.h>
#include <xrpl/telemetry/SpanNames.h>
#include <boost/iterator/function_output_iterator.hpp>
@@ -46,6 +49,7 @@
#include <sstream>
#include <stdexcept>
#include <string>
#include <string_view>
#include <tuple>
#include <unordered_map>
#include <utility>
@@ -95,6 +99,23 @@ InboundLedger::init(ScopedLockType& collectionLock)
ScopedLockType sl(mtx_);
collectionLock.unlock();
// Span the acquire lifecycle so back-fill / fork-recovery cost is
// observable. Finalized in done() with the outcome and timeout count.
{
using namespace telemetry;
acquireSpan_.emplace(
SpanGuard::span(TraceCategory::Ledger, seg::ledger, ledger_span::op::acquire));
if (*acquireSpan_)
{
acquireSpan_->setAttribute(ledger_span::attr::ledgerSeq, static_cast<int64_t>(seq_));
std::string_view const reasonVal = reason_ == Reason::HISTORY
? std::string_view(ledger_span::val::history)
: reason_ == Reason::CONSENSUS ? std::string_view(ledger_span::val::consensus)
: std::string_view(ledger_span::val::generic);
acquireSpan_->setAttribute(ledger_span::attr::acquireReason, reasonVal);
}
}
tryDB(app_.getNodeFamily().db());
if (failed_)
return;
@@ -416,6 +437,21 @@ InboundLedger::done()
signaled_ = true;
touch();
// Finalize the acquire span with the outcome, timeout count, and peer
// count, then end it (reset) so its duration is exported.
if (acquireSpan_ && *acquireSpan_)
{
using namespace telemetry;
acquireSpan_->setAttribute(
ledger_span::attr::outcome,
failed_ ? std::string_view(ledger_span::val::failed)
: std::string_view(ledger_span::val::complete));
acquireSpan_->setAttribute(ledger_span::attr::timeouts, static_cast<int64_t>(timeouts_));
acquireSpan_->setAttribute(
ledger_span::attr::peerCount, static_cast<int64_t>(getPeerCount()));
}
acquireSpan_.reset();
JLOG(journal_.debug()) << "Acquire " << hash_ << (failed_ ? " fail " : " ")
<< ((timeouts_ == 0)
? std::string()

View File

@@ -10,6 +10,7 @@
* ledger.build (BuildLedger — ledger construction)
* ledger.store (LedgerMaster — ledger storage)
* ledger.validate (LedgerMaster — ledger validation acceptance)
* ledger.acquire (InboundLedger — fetch a missing ledger from peers)
* tx.apply (BuildLedger — transaction application)
*/
@@ -24,6 +25,7 @@ inline constexpr auto build = makeStr("build");
inline constexpr auto store = makeStr("store");
inline constexpr auto validate = makeStr("validate");
inline constexpr auto apply = makeStr("apply");
inline constexpr auto acquire = makeStr("acquire");
} // namespace op
// ===== Attribute keys ========================================================
@@ -40,6 +42,24 @@ using ::xrpl::telemetry::attr::ledgerSeq;
inline constexpr auto txCount = makeStr("tx_count");
inline constexpr auto txFailed = makeStr("tx_failed");
inline constexpr auto validations = makeStr("validations");
/// ledger.acquire attrs (InboundLedger fetch lifecycle).
inline constexpr auto acquireReason = makeStr("acquire_reason");
inline constexpr auto timeouts = makeStr("timeouts");
inline constexpr auto peerCount = makeStr("peer_count");
inline constexpr auto outcome = makeStr("outcome");
} // namespace attr
// ===== Attribute values ======================================================
namespace val {
/// Values of the ledger.acquire acquire_reason attribute; they mirror
/// InboundLedger::Reason.
inline constexpr auto generic = makeStr("generic");
inline constexpr auto consensus = makeStr("consensus");
inline constexpr auto history = makeStr("history");
/// Values of the ledger.acquire outcome attribute.
inline constexpr auto failed = makeStr("failed");
inline constexpr auto complete = makeStr("complete");
} // namespace val
} // namespace xrpl::telemetry::ledger_span