feat(telemetry): add Phase 6 StatsD metrics, ledger/peer spans, and expanded dashboards

Integrate the existing StatsD metrics pipeline (beast::insight) into
the OpenTelemetry observability stack and add new trace spans for
ledger build/store/validate, transaction apply, and peer
proposal/validation receive.

Phase 5b — Ledger, peer, and transaction spans:
- Add ledger.build span with seq and close time attributes in
  BuildLedger.cpp
- Add tx.apply span with tx_count/tx_failed in BuildLedger.cpp
- Add ledger.store and ledger.validate spans in LedgerMaster.cpp
- Add peer.proposal.receive span with trusted attribute in PeerImp.cpp
- Add peer.validation.receive span with ledger_hash, full, trusted
  attributes in PeerImp.cpp
- Add ledger-operations and peer-network Grafana dashboards

Phase 6 — StatsD metrics integration:
- Add StatsD UDP receiver (port 8125) to OTel Collector
- Add 5 StatsD Grafana dashboards: node health, network traffic,
  overlay traffic detail, ledger data sync, RPC pathfinding
- Add 09-data-collection-reference.md cataloging all metrics/spans
- Update existing dashboards with new span panels
- Expand telemetry runbook and integration test script
- Add codecov exclusions for telemetry modules

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Pratik Mankawde
2026-04-27 11:43:56 +01:00
parent de7194011d
commit cbbd6ebee2
25 changed files with 4890 additions and 138 deletions

View File

@@ -2,6 +2,7 @@
#include <xrpld/app/ledger/LedgerReplay.h>
#include <xrpld/app/ledger/OpenLedger.h>
#include <xrpld/app/ledger/detail/LedgerSpanNames.h>
#include <xrpld/app/main/Application.h>
#include <xrpl/basics/Log.h>
@@ -13,8 +14,10 @@
#include <xrpl/ledger/Ledger.h>
#include <xrpl/ledger/OpenView.h>
#include <xrpl/nodestore/NodeObject.h>
#include <xrpl/protocol/Feature.h>
#include <xrpl/protocol/LedgerHeader.h>
#include <xrpl/protocol/Protocol.h>
#include <xrpl/telemetry/SpanGuard.h>
#include <xrpl/tx/apply.h>
#include <cstddef>
@@ -41,6 +44,9 @@ buildLedgerImpl(
beast::Journal j,
ApplyTxs&& applyTxs)
{
using namespace telemetry;
auto buildSpan = SpanGuard::span(TraceCategory::Ledger, seg::ledger, ledger_span::op::build);
auto built = std::make_shared<Ledger>(*parent, closeTime);
if (built->isFlagLedger())
@@ -74,6 +80,14 @@ buildLedgerImpl(
built->header().seq < XRP_LEDGER_EARLIEST_FEES || built->read(keylet::fees()),
"xrpl::buildLedgerImpl : valid ledger fees");
built->setAccepted(closeTime, closeResolution, closeTimeCorrect);
buildSpan.setAttribute(ledger_span::attr::seq, static_cast<int64_t>(built->header().seq));
buildSpan.setAttribute(
ledger_span::attr::closeTime, static_cast<int64_t>(closeTime.time_since_epoch().count()));
buildSpan.setAttribute(ledger_span::attr::closeTimeCorrect, closeTimeCorrect);
buildSpan.setAttribute(
ledger_span::attr::closeResolutionMs,
static_cast<int64_t>(
std::chrono::duration_cast<std::chrono::milliseconds>(closeResolution).count()));
return built;
}
@@ -97,6 +111,9 @@ applyTransactions(
OpenView& view,
beast::Journal j)
{
using namespace telemetry;
auto applySpan = SpanGuard::span(TraceCategory::Transactions, seg::tx, ledger_span::op::apply);
bool certainRetry = true;
std::size_t count = 0;
@@ -163,6 +180,8 @@ applyTransactions(
// If there are any transactions left, we must have
// tried them in at least one final pass
XRPL_ASSERT(txns.empty() || !certainRetry, "xrpl::applyTransactions : retry transactions");
applySpan.setAttribute(ledger_span::attr::txCount, static_cast<int64_t>(count));
applySpan.setAttribute(ledger_span::attr::txFailed, static_cast<int64_t>(failed.size()));
return count;
}

View File

@@ -6,6 +6,7 @@
#include <xrpld/app/ledger/LedgerReplay.h>
#include <xrpld/app/ledger/LedgerReplayer.h>
#include <xrpld/app/ledger/OpenLedger.h>
#include <xrpld/app/ledger/detail/LedgerSpanNames.h>
#include <xrpld/app/main/Application.h>
#include <xrpld/app/misc/SHAMapStore.h>
#include <xrpld/app/misc/Transaction.h>
@@ -55,6 +56,7 @@
#include <xrpl/shamap/SHAMap.h>
#include <xrpl/shamap/SHAMapMissingNode.h>
#include <xrpl/shamap/SHAMapTreeNode.h>
#include <xrpl/telemetry/SpanGuard.h>
#include <boost/icl/concept/interval_set.hpp>
@@ -449,6 +451,10 @@ LedgerMaster::fixIndex(LedgerIndex ledgerIndex, LedgerHash const& ledgerHash)
bool
LedgerMaster::storeLedger(std::shared_ptr<Ledger const> ledger)
{
using namespace telemetry;
auto span = SpanGuard::span(TraceCategory::Ledger, seg::ledger, ledger_span::op::store);
span.setAttribute(ledger_span::attr::seq, static_cast<int64_t>(ledger->header().seq));
bool const validated = ledger->header().validated;
// Returns true if we already had the ledger
return mLedgerHistory.insert(ledger, validated);
@@ -965,6 +971,11 @@ LedgerMaster::checkAccept(std::shared_ptr<Ledger const> const& ledger)
return;
}
using namespace telemetry;
auto valSpan = SpanGuard::span(TraceCategory::Ledger, seg::ledger, ledger_span::op::validate);
valSpan.setAttribute(ledger_span::attr::seq, static_cast<int64_t>(ledger->header().seq));
valSpan.setAttribute(ledger_span::attr::validations, static_cast<int64_t>(tvc));
JLOG(m_journal.info()) << "Advancing accepted ledger to " << ledger->header().seq
<< " with >= " << minVal << " validations";

View File

@@ -0,0 +1,54 @@
#pragma once
/** Compile-time span name constants for ledger tracing.
*
* Used by BuildLedger and LedgerMaster for ledger lifecycle spans.
* Built on StaticStr/join() from SpanNames.h.
*
* Span hierarchy:
*
* ledger.build (BuildLedger — ledger construction)
* ledger.store (LedgerMaster — ledger storage)
* ledger.validate (LedgerMaster — ledger validation acceptance)
* tx.apply (BuildLedger — transaction application)
*/
#include <xrpl/telemetry/SpanNames.h>
namespace xrpl::telemetry::ledger_span {

/** Operation suffixes for the ledger lifecycle spans.
 *
 * Each suffix is handed to SpanGuard::span() together with a segment
 * constant (seg::ledger or seg::tx) by the code that opens the span.
 */
namespace op {

/// Suffix of the "ledger.build" span (BuildLedger).
inline constexpr auto build = makeStr("build");

/// Suffix of the "ledger.store" span (LedgerMaster).
inline constexpr auto store = makeStr("store");

/// Suffix of the "ledger.validate" span (LedgerMaster).
inline constexpr auto validate = makeStr("validate");

/// Suffix of the "tx.apply" span (BuildLedger).
inline constexpr auto apply = makeStr("apply");

}  // namespace op

/** Attribute keys attached to the ledger lifecycle spans.
 *
 * Every key is derived from the shared `xrplLedger` prefix so that all
 * ledger attributes live under one namespace.
 */
namespace attr {

/// Prefix shared by every key below: seg::xrpl joined with seg::ledger.
inline constexpr auto xrplLedger = join(seg::xrpl, seg::ledger);

/// "xrpl.ledger.seq" - sequence number taken from the ledger header.
inline constexpr auto seq = join(xrplLedger, makeStr("seq"));

/// "xrpl.ledger.close_time" - close time as a raw tick count since the
/// clock's epoch.
inline constexpr auto closeTime = join(xrplLedger, makeStr("close_time"));

/// "xrpl.ledger.close_time_correct" - the close-time-correct flag the
/// ledger was accepted with.
inline constexpr auto closeTimeCorrect =
    join(xrplLedger, makeStr("close_time_correct"));

/// "xrpl.ledger.close_resolution_ms" - close resolution in milliseconds.
inline constexpr auto closeResolutionMs =
    join(xrplLedger, makeStr("close_resolution_ms"));

/// "xrpl.ledger.tx_count" - transaction count reported on the tx.apply span.
inline constexpr auto txCount = join(xrplLedger, makeStr("tx_count"));

/// "xrpl.ledger.tx_failed" - size of the failed-transaction set reported on
/// the tx.apply span.
inline constexpr auto txFailed = join(xrplLedger, makeStr("tx_failed"));

/// "xrpl.ledger.validations" - validation count reported on the
/// ledger.validate span.
inline constexpr auto validations = join(xrplLedger, makeStr("validations"));

}  // namespace attr

}  // namespace xrpl::telemetry::ledger_span

View File

@@ -16,6 +16,7 @@
#include <xrpld/overlay/ReduceRelayCommon.h>
#include <xrpld/overlay/detail/Handshake.h>
#include <xrpld/overlay/detail/OverlayImpl.h>
#include <xrpld/overlay/detail/PeerSpanNames.h>
#include <xrpld/overlay/detail/ProtocolMessage.h>
#include <xrpld/overlay/detail/ProtocolVersion.h>
#include <xrpld/overlay/detail/TrafficCount.h>
@@ -1863,6 +1864,10 @@ PeerImp::onMessage(std::shared_ptr<protocol::TMLedgerData> const& m)
void
PeerImp::onMessage(std::shared_ptr<protocol::TMProposeSet> const& m)
{
using namespace telemetry;
auto span = SpanGuard::span(TraceCategory::Peer, seg::peer, peer_span::op::proposalReceive);
span.setAttribute(peer_span::attr::id, static_cast<int64_t>(id_));
protocol::TMProposeSet const& set = *m;
auto const sig = makeSlice(set.signature());
@@ -1889,6 +1894,7 @@ PeerImp::onMessage(std::shared_ptr<protocol::TMProposeSet> const& m)
// every time a spam packet is received
PublicKey const publicKey{makeSlice(set.nodepubkey())};
auto const isTrusted = app_.getValidators().trusted(publicKey);
span.setAttribute(peer_span::attr::proposalTrusted, isTrusted);
// If the operator has specified that untrusted proposals be dropped then
// this happens here I.e. before further wasting CPU verifying the signature
@@ -2459,6 +2465,11 @@ PeerImp::onMessage(std::shared_ptr<protocol::TMValidatorListCollection> const& m
void
PeerImp::onMessage(std::shared_ptr<protocol::TMValidation> const& m)
{
using namespace telemetry;
auto valSpan =
SpanGuard::span(TraceCategory::Peer, seg::peer, peer_span::op::validationReceive);
valSpan.setAttribute(peer_span::attr::id, static_cast<int64_t>(id_));
if (m->validation().size() < 50)
{
JLOG(p_journal_.warn()) << "Validation: Too small";
@@ -2481,6 +2492,9 @@ PeerImp::onMessage(std::shared_ptr<protocol::TMValidation> const& m)
false);
val->setSeen(closeTime);
}
valSpan.setAttribute(
peer_span::attr::validationLedgerHash, to_string(val->getLedgerHash()).c_str());
valSpan.setAttribute(peer_span::attr::validationFull, val->isFull());
if (!isCurrent(
app_.getValidations().parms(),
@@ -2497,6 +2511,7 @@ PeerImp::onMessage(std::shared_ptr<protocol::TMValidation> const& m)
// suppression for 30 seconds to avoid doing a relatively expensive
// lookup every time a spam packet is received
auto const isTrusted = app_.getValidators().trusted(val->getSignerPublic());
valSpan.setAttribute(peer_span::attr::validationTrusted, isTrusted);
// If the operator has specified that untrusted validations be
// dropped then this happens here I.e. before further wasting CPU

View File

@@ -0,0 +1,50 @@
#pragma once
/** Compile-time span name constants for peer overlay tracing.
*
* Used by PeerImp for peer message handling spans (proposals,
* validations). Built on StaticStr/join() from SpanNames.h.
*
* Span hierarchy:
*
* peer.proposal.receive (PeerImp — incoming proposal)
* peer.validation.receive (PeerImp — incoming validation)
*/
#include <xrpl/telemetry/SpanNames.h>
namespace xrpl {
namespace telemetry {
namespace peer_span {

// ===== Span operation suffixes ===============================================
// PeerImp passes these to SpanGuard::span() together with seg::peer.
namespace op {

/// Suffix of the "peer.proposal.receive" span (incoming TMProposeSet).
inline constexpr auto proposalReceive = makeStr("proposal.receive");

/// Suffix of the "peer.validation.receive" span (incoming TMValidation).
inline constexpr auto validationReceive = makeStr("validation.receive");

}  // namespace op

// ===== Attribute keys ========================================================
// Keys are assembled from named prefixes so that each prefix string is spelled
// exactly once; sibling keys cannot drift apart through a typo in one copy.
namespace attr {

/// Prefix shared by every peer attribute: seg::xrpl joined with seg::peer.
inline constexpr auto xrplPeer = join(seg::xrpl, seg::peer);

/// "xrpl.peer.proposal" - prefix shared by the proposal attributes.
inline constexpr auto xrplPeerProposal = join(xrplPeer, makeStr("proposal"));

/// "xrpl.peer.validation" - prefix shared by the validation attributes.
inline constexpr auto xrplPeerValidation = join(xrplPeer, makeStr("validation"));

/// "xrpl.peer.id" - identifier of the peer that delivered the message.
inline constexpr auto id = join(xrplPeer, makeStr("id"));

/// "xrpl.peer.proposal.trusted" - whether the proposal's node public key is
/// trusted by the local validator list.
inline constexpr auto proposalTrusted = join(xrplPeerProposal, makeStr("trusted"));

/// "xrpl.peer.validation.ledger_hash" - string form of the hash of the ledger
/// the validation refers to.
inline constexpr auto validationLedgerHash =
    join(xrplPeerValidation, makeStr("ledger_hash"));

/// "xrpl.peer.validation.full" - whether the validation is a full validation.
inline constexpr auto validationFull = join(xrplPeerValidation, makeStr("full"));

/// "xrpl.peer.validation.trusted" - whether the validation's signer public key
/// is trusted by the local validator list.
inline constexpr auto validationTrusted =
    join(xrplPeerValidation, makeStr("trusted"));

}  // namespace attr

}  // namespace peer_span
}  // namespace telemetry
}  // namespace xrpl