feat(telemetry): add xrpld_ledger_history_mismatch_total{reason} counter

LedgerHistory::handleMismatch() already classifies a built-vs-validated ledger
mismatch (prior ledger, close time, consensus tx set, same/different tx set),
but until now it only bumped a single unlabeled beast::insight counter, so the
reason was dropped. Fork diagnosis was therefore a log-grep exercise.

Add a labeled OTel counter so the mismatch reason is a queryable time series:
- MetricsRegistry: new ledgerHistoryMismatchCounter_ + incrementLedgerHistoryMismatch(reason)
- LedgerHistory: record one reason per classification branch (unknown,
  prior_ledger, close_time, consensus_txset, same_txset_diff_result,
  different_txset). Reaches MetricsRegistry via the existing app_ reference.

The existing beast::insight mismatchCounter_ is left intact.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Pratik Mankawde
2026-06-04 16:02:26 +01:00
parent d3955d3639
commit 7a509a01eb
3 changed files with 43 additions and 0 deletions

View File

@@ -4,6 +4,7 @@
#include <xrpld/app/ledger/LedgerToJson.h>
#include <xrpld/app/main/Application.h>
#include <xrpld/core/Config.h>
#include <xrpld/telemetry/MetricsRegistry.h>
#include <xrpl/basics/Log.h>
#include <xrpl/basics/base_uint.h>
@@ -323,11 +324,19 @@ LedgerHistory::handleMismatch(
auto builtLedger = getLedgerByHash(built);
auto validLedger = getLedgerByHash(valid);
// Records the classified mismatch reason as a labeled OTel counter so
// fork diagnosis is a queryable time series, not just a log grep.
auto recordReason = [this](std::string_view reason) {
if (auto* mr = app_.getMetricsRegistry())
mr->incrementLedgerHistoryMismatch(reason);
};
if (!builtLedger || !validLedger)
{
JLOG(j_.error()) << "MISMATCH cannot be analyzed:"
<< " builtLedger: " << to_string(built) << " -> " << builtLedger
<< " validLedger: " << to_string(valid) << " -> " << validLedger;
recordReason("unknown");
return;
}
@@ -349,6 +358,7 @@ LedgerHistory::handleMismatch(
if (builtLedger->header().parentHash != validLedger->header().parentHash)
{
JLOG(j_.error()) << "MISMATCH on prior ledger";
recordReason("prior_ledger");
return;
}
@@ -356,6 +366,7 @@ LedgerHistory::handleMismatch(
if (builtLedger->header().closeTime != validLedger->header().closeTime)
{
JLOG(j_.error()) << "MISMATCH on close time";
recordReason("close_time");
return;
}
@@ -366,6 +377,7 @@ LedgerHistory::handleMismatch(
JLOG(j_.error()) << "MISMATCH on consensus transaction set "
<< " built: " << to_string(*builtConsensusHash)
<< " validated: " << to_string(*validatedConsensusHash);
recordReason("consensus_txset");
}
else
JLOG(j_.error()) << "MISMATCH with same consensus transaction set: "
@@ -379,10 +391,14 @@ LedgerHistory::handleMismatch(
if (builtTx == validTx)
{
JLOG(j_.error()) << "MISMATCH with same " << builtTx.size() << " transactions";
recordReason("same_txset_diff_result");
}
else
{
JLOG(j_.error()) << "MISMATCH with " << builtTx.size() << " built and " << validTx.size()
<< " valid transactions.";
recordReason("different_txset");
}
JLOG(j_.error()) << "built\n" << getJson({*builtLedger, {}});
JLOG(j_.error()) << "valid\n" << getJson({*validLedger, {}});

View File

@@ -237,6 +237,9 @@ MetricsRegistry::start(std::string const& endpoint, std::string const& instanceI
meter_->CreateUInt64Counter("xrpld_state_changes_total", "Total operating mode changes");
jqTransOverflowCounter_ = meter_->CreateUInt64Counter(
"xrpld_jq_trans_overflow_total", "Total job queue transaction overflows");
ledgerHistoryMismatchCounter_ = meter_->CreateUInt64Counter(
"xrpld_ledger_history_mismatch_total",
"Total built-vs-validated ledger mismatches by reason");
validationAgreementsCounter_ = meter_->CreateUInt64Counter(
"xrpld_validation_agreements_total", "Total validation agreements");
validationMissedCounter_ =
@@ -1326,4 +1329,13 @@ MetricsRegistry::incrementJqTransOverflow()
#endif
}
/** Add one to xrpld_ledger_history_mismatch_total, tagged with `reason`.

    The body is compiled only when XRPL_ENABLE_TELEMETRY is defined. Even
    then nothing is recorded while the registry is disabled or the counter
    instrument is null.

    @param reason Value stored under the "reason" attribute of the sample.
*/
void
MetricsRegistry::incrementLedgerHistoryMismatch(std::string_view reason)
{
#ifdef XRPL_ENABLE_TELEMETRY
    // Bail out unless the registry is on and the instrument is set.
    if (!enabled_ || !ledgerHistoryMismatchCounter_)
        return;

    // Owned copy of the label, kept alive for the duration of the Add() call.
    std::string const reasonLabel{reason};
    ledgerHistoryMismatchCounter_->Add(1, {{"reason", reasonLabel}});
#endif
}
} // namespace xrpl::telemetry

View File

@@ -349,6 +349,17 @@ public:
void
incrementJqTransOverflow();
/** Increment the ledger_history_mismatch_total counter for a reason.
Called from LedgerHistory::handleMismatch() once the mismatch has
been classified, or with "unknown" when either ledger is unavailable
and the mismatch cannot be analyzed. The reason label turns fork
diagnosis from a log-grep into a queryable time series.
NOTE(review): handleMismatch() does not return after recording
"consensus_txset", so the same mismatch may also be recorded under a
tx-set reason — confirm whether the sum across reasons is expected to
equal the number of mismatches.
@param reason Classified mismatch cause (e.g. "prior_ledger",
"close_time", "consensus_txset", "same_txset_diff_result",
"different_txset", "unknown").
*/
void
incrementLedgerHistoryMismatch(std::string_view reason);
/** Access the validation agreement tracker.
Used by consensus and ledger hooks to record our validations and
network validations so the tracker can compute agreement percentages.
@@ -483,6 +494,10 @@ private:
/// Counter: xrpld_jq_trans_overflow_total — incremented on job queue transaction overflows.
opentelemetry::nostd::unique_ptr<opentelemetry::metrics::Counter<uint64_t>>
jqTransOverflowCounter_;
/// Counter: xrpld_ledger_history_mismatch_total{reason} — incremented per classified
/// built-vs-validated ledger mismatch. Created in MetricsRegistry::start() and
/// null-checked before use in incrementLedgerHistoryMismatch().
opentelemetry::nostd::unique_ptr<opentelemetry::metrics::Counter<uint64_t>>
ledgerHistoryMismatchCounter_;
/// Counter: xrpld_validation_agreements_total — incremented by ValidationTracker on
/// agreement.
opentelemetry::nostd::unique_ptr<opentelemetry::metrics::Counter<uint64_t>>