Compare commits

..

12 Commits

Author SHA1 Message Date
Ed Hennis
a74f223efb Merge branch 'develop' into ximinez/fix/validator-cache 2025-11-28 15:46:40 -05:00
Ed Hennis
1eb3a3ea5a Merge branch 'develop' into ximinez/fix/validator-cache 2025-11-27 01:48:53 -05:00
Ed Hennis
630e428929 Merge branch 'develop' into ximinez/fix/validator-cache 2025-11-26 00:25:12 -05:00
Ed Hennis
3f93edc5e0 Merge branch 'develop' into ximinez/fix/validator-cache 2025-11-25 14:55:02 -05:00
Ed Hennis
baf62689ff Merge branch 'develop' into ximinez/fix/validator-cache 2025-11-24 21:49:07 -05:00
Ed Hennis
ddf7d6cac4 Merge branch 'develop' into ximinez/fix/validator-cache 2025-11-24 21:30:18 -05:00
Ed Hennis
fcd2ea2d6e Merge branch 'develop' into ximinez/fix/validator-cache 2025-11-21 12:47:54 -05:00
Ed Hennis
a16aa5b12f Merge branch 'develop' into ximinez/fix/validator-cache 2025-11-18 22:39:25 -05:00
Ed Hennis
ef2de81870 Merge branch 'develop' into ximinez/fix/validator-cache 2025-11-15 03:08:38 -05:00
Ed Hennis
fce6757260 Merge branch 'develop' into ximinez/fix/validator-cache 2025-11-13 12:19:10 -05:00
Ed Hennis
d759a0a2b0 Merge branch 'develop' into ximinez/fix/validator-cache 2025-11-12 14:12:51 -05:00
Ed Hennis
d2dda416e8 Use Validator List (VL) cache files in more scenarios
- If any [validator_list_keys] are not available after all
  [validator_list_sites] have had a chance to be queried, then fall
  back to loading cache files. Currently, cache files are only used if
  no sites are defined, or the request to one of them has an error. That
  does not cover cases where not enough sites are defined, or where a
  site returns an invalid VL (or something else entirely).
- Resolves #5320
2025-11-10 19:53:02 -05:00
10 changed files with 41 additions and 383 deletions

View File

@@ -1051,11 +1051,10 @@
# The online delete process checks periodically
# that rippled is still in sync with the network,
# and that the validated ledger is less than
# 'age_threshold_seconds' old, and that all
# recent ledgers are available. If not, then continue
# 'age_threshold_seconds' old. If not, then continue
# sleeping for this number of seconds and
# checking until healthy.
# Default is 1.
# Default is 5.
#
# Notes:
# The 'node_db' entry configures the primary, persistent storage.

View File

@@ -2,7 +2,6 @@
#include <test/jtx/Env.h>
#include <xrpld/app/ledger/LedgerMaster.h>
#include <xrpld/app/misc/SHAMapStore.h>
namespace ripple {
namespace test {
@@ -101,88 +100,6 @@ class LedgerMaster_test : public beast::unit_test::suite
}
}
// Checks LedgerMaster::getCompleteLedgers() and
// LedgerMaster::missingFromCompleteLedgerRange() against the range this test
// expects, both right after funding an account and after every one of 24
// further ledger closes with online_delete enabled.
void
testCompleteLedgerRange(FeatureBitset features)
{
    // Deliberately close to SHAMapStore_test::testLedgerGaps; the emphasis
    // here is different.
    testcase("Complete Ledger operations");

    using namespace test::jtx;

    auto const deleteInterval = 8;

    Env env{*this, envconfig([](auto cfg) {
                return online_delete(std::move(cfg), deleteInterval);
            })};

    auto const alice = Account("alice");
    env.fund(XRP(1000), alice);
    env.close();

    auto& ledgerMaster = env.app().getLedgerMaster();
    LedgerIndex firstSeq = 2;
    LedgerIndex lastSeq = env.closed()->info().seq;

    auto& shaMapStore = env.app().getSHAMapStore();
    shaMapStore.rendezvous();
    LedgerIndex rotatedAt = shaMapStore.getLastRotated();

    // Probes the missing-ledger counter with five windows relative to the
    // currently expected [firstSeq, lastSeq]: the exact range, a strictly
    // interior range, a range one wider on each side, and the range shifted
    // down and up by two.
    auto const checkMissingCounts = [&]() {
        BEAST_EXPECT(
            ledgerMaster.missingFromCompleteLedgerRange(firstSeq, lastSeq) ==
            0);
        BEAST_EXPECT(
            ledgerMaster.missingFromCompleteLedgerRange(
                firstSeq + 1, lastSeq - 1) == 0);
        BEAST_EXPECT(
            ledgerMaster.missingFromCompleteLedgerRange(
                firstSeq - 1, lastSeq + 1) == 2);
        BEAST_EXPECT(
            ledgerMaster.missingFromCompleteLedgerRange(
                firstSeq - 2, lastSeq - 2) == 2);
        BEAST_EXPECT(
            ledgerMaster.missingFromCompleteLedgerRange(
                firstSeq + 2, lastSeq + 2) == 2);
    };

    // State immediately after the funding ledger has closed.
    BEAST_EXPECTS(lastSeq == 3, to_string(lastSeq));
    BEAST_EXPECTS(
        ledgerMaster.getCompleteLedgers() == "2-3",
        ledgerMaster.getCompleteLedgers());
    BEAST_EXPECTS(rotatedAt == 3, to_string(rotatedAt));
    checkMissingCounts();

    // Close enough ledgers to rotate a few times
    for (int closed = 0; closed != 24; ++closed)
    {
        for (int txn = 0; txn != 3; ++txn)
            env(noop(alice));

        env.close();
        shaMapStore.rendezvous();
        ++lastSeq;

        // Once deleteInterval ledgers have closed since the last rotation,
        // the test expects another rotation: the previous rotation point
        // becomes the start of the expected range.
        if (lastSeq == rotatedAt + deleteInterval)
        {
            firstSeq = rotatedAt;
            rotatedAt = lastSeq;
        }

        BEAST_EXPECTS(
            env.closed()->info().seq == lastSeq,
            to_string(env.closed()->info().seq));
        BEAST_EXPECTS(
            shaMapStore.getLastRotated() == rotatedAt,
            to_string(shaMapStore.getLastRotated()));

        std::stringstream wantedRange;
        wantedRange << firstSeq << "-" << lastSeq;
        BEAST_EXPECTS(
            ledgerMaster.getCompleteLedgers() == wantedRange.str(),
            ledgerMaster.getCompleteLedgers());

        checkMissingCounts();
    }
}
public:
void
run() override
@@ -196,7 +113,6 @@ public:
testWithFeats(FeatureBitset features)
{
testTxnIdFromIndex(features);
testCompleteLedgerRange(features);
}
};

View File

@@ -1,7 +1,6 @@
#include <test/jtx.h>
#include <test/jtx/envconfig.h>
#include <xrpld/app/ledger/LedgerMaster.h>
#include <xrpld/app/main/Application.h>
#include <xrpld/app/main/NodeStoreScheduler.h>
#include <xrpld/app/misc/SHAMapStore.h>
@@ -11,8 +10,6 @@
#include <xrpl/nodestore/detail/DatabaseRotatingImp.h>
#include <xrpl/protocol/jss.h>
#include <thread>
namespace ripple {
namespace test {
@@ -23,8 +20,10 @@ class SHAMapStore_test : public beast::unit_test::suite
static auto
onlineDelete(std::unique_ptr<Config> cfg)
{
using namespace jtx;
return online_delete(std::move(cfg), deleteInterval);
cfg->LEDGER_HISTORY = deleteInterval;
auto& section = cfg->section(ConfigSection::nodeDatabase());
section.set("online_delete", std::to_string(deleteInterval));
return cfg;
}
static auto
@@ -627,184 +626,6 @@ public:
BEAST_EXPECT(dbr->getName() == "3");
}
// Test case "Wait for ledger gaps to fill in".
//
// Closes ledgers until maxSeq reaches 20. Each time the next close would
// trigger a rotation, one ledger is removed from LedgerMaster's complete
// range, and the test asserts that SHAMapStore::getLastRotated() does not
// change until that ledger is marked present again.
//
// Note: the gap branch sleeps for one second after each of five closes, so
// every pass through that branch takes at least five seconds.
void
testLedgerGaps()
{
// Note that this test is intentionally very similar to
// LedgerMaster_test::testCompleteLedgerRange, but has a different
// focus.
testcase("Wait for ledger gaps to fill in");
using namespace test::jtx;
// deleteInterval (used below and presumably by onlineDelete) is declared
// outside this function.
Env env{*this, envconfig(onlineDelete)};
// NOTE(review): `hashes` is never referenced again in this function.
std::map<LedgerIndex, uint256> hashes;
// Formats "<label>: Expected: <expected>, Got: <actual>" for use as the
// message argument of BEAST_EXPECTS.
auto failureMessage = [&](char const* label,
auto expected,
auto actual) {
std::stringstream ss;
ss << label << ": Expected: " << expected << ", Got: " << actual;
return ss.str();
};
auto const alice = Account("alice");
env.fund(XRP(1000), alice);
env.close();
auto& lm = env.app().getLedgerMaster();
// minSeq/maxSeq track the complete-ledger range the test expects;
// lastRotated tracks the rotation point the test expects the store to
// report.
LedgerIndex minSeq = 2;
LedgerIndex maxSeq = env.closed()->info().seq;
auto& store = env.app().getSHAMapStore();
store.rendezvous();
LedgerIndex lastRotated = store.getLastRotated();
// Baseline after the funding ledger: complete range "2-3", rotated at 3.
BEAST_EXPECTS(maxSeq == 3, to_string(maxSeq));
BEAST_EXPECTS(
lm.getCompleteLedgers() == "2-3", lm.getCompleteLedgers());
BEAST_EXPECTS(lastRotated == 3, to_string(lastRotated));
// Probe the missing-ledger counter with the exact range, an interior
// range, a range one wider on each side, and the range shifted by two in
// each direction.
BEAST_EXPECT(lm.missingFromCompleteLedgerRange(minSeq, maxSeq) == 0);
BEAST_EXPECT(
lm.missingFromCompleteLedgerRange(minSeq + 1, maxSeq - 1) == 0);
BEAST_EXPECT(
lm.missingFromCompleteLedgerRange(minSeq - 1, maxSeq + 1) == 2);
BEAST_EXPECT(
lm.missingFromCompleteLedgerRange(minSeq - 2, maxSeq - 2) == 2);
BEAST_EXPECT(
lm.missingFromCompleteLedgerRange(minSeq + 2, maxSeq + 2) == 2);
// Close enough ledgers to rotate a few times
while (maxSeq < 20)
{
for (int t = 0; t < 3; ++t)
{
env(noop(alice));
}
env.close();
store.rendezvous();
++maxSeq;
if (maxSeq + 1 == lastRotated + deleteInterval)
{
using namespace std::chrono_literals;
// The next ledger will trigger a rotation. Delete the
// current ledger from LedgerMaster.
std::this_thread::sleep_for(100ms);
LedgerIndex const deleteSeq = maxSeq;
// Poll until LedgerMaster reports the ledger as present, so that
// clearLedger() below really removes something.
while (!lm.haveLedger(deleteSeq))
{
std::this_thread::sleep_for(100ms);
}
lm.clearLedger(deleteSeq);
// Builds the expected complete-ledger string with deleteSeq cut out:
// "<min>-<delete-1>", followed by ",<max>" when exactly one ledger
// lies above the gap, or ",<delete+1>-<max>" when several do. The
// parameters shadow the outer variables of the same names.
auto expectedRange =
[](auto minSeq, auto deleteSeq, auto maxSeq) {
std::stringstream expectedRange;
expectedRange << minSeq << "-" << (deleteSeq - 1);
if (deleteSeq + 1 == maxSeq)
expectedRange << "," << maxSeq;
else if (deleteSeq < maxSeq)
expectedRange << "," << (deleteSeq + 1) << "-"
<< maxSeq;
return expectedRange.str();
};
BEAST_EXPECTS(
lm.getCompleteLedgers() ==
expectedRange(minSeq, deleteSeq, maxSeq),
failureMessage(
"Complete ledgers",
expectedRange(minSeq, deleteSeq, maxSeq),
lm.getCompleteLedgers()));
// Exactly one ledger (deleteSeq) is now missing from the range.
BEAST_EXPECT(
lm.missingFromCompleteLedgerRange(minSeq, maxSeq) == 1);
// Close another ledger, which will trigger a rotation, but the
// rotation will be stuck until the missing ledger is filled in.
env.close();
// DO NOT CALL rendezvous()! You'll end up with a deadlock.
++maxSeq;
// Nothing has changed
BEAST_EXPECTS(
store.getLastRotated() == lastRotated,
failureMessage(
"lastRotated", lastRotated, store.getLastRotated()));
BEAST_EXPECTS(
lm.getCompleteLedgers() ==
expectedRange(minSeq, deleteSeq, maxSeq),
failureMessage(
"Complete ledgers",
expectedRange(minSeq, deleteSeq, maxSeq),
lm.getCompleteLedgers()));
// Close 5 more ledgers, waiting one second in between to
// simulate the ledger making progress while online delete waits
// for the missing ledger to be filled in.
// This ensures the healthWait check has time to run and
// detect the gap.
for (int l = 0; l < 5; ++l)
{
env.close();
// DO NOT CALL rendezvous()! You'll end up with a deadlock.
++maxSeq;
// Nothing has changed
BEAST_EXPECTS(
store.getLastRotated() == lastRotated,
failureMessage(
"lastRotated",
lastRotated,
store.getLastRotated()));
BEAST_EXPECTS(
lm.getCompleteLedgers() ==
expectedRange(minSeq, deleteSeq, maxSeq),
failureMessage(
"Complete Ledgers",
expectedRange(minSeq, deleteSeq, maxSeq),
lm.getCompleteLedgers()));
std::this_thread::sleep_for(1s);
}
// Put the missing ledger back in LedgerMaster
lm.setLedgerRangePresent(deleteSeq, deleteSeq);
// Wait for the rotation to finish
store.rendezvous();
// The test expects the rotation to be recorded at the ledger right
// after the former gap, and the previous rotation point to become the
// start of the expected range.
minSeq = lastRotated;
lastRotated = deleteSeq + 1;
}
// Checks shared by both paths: after each pass the test's bookkeeping
// must match the closed ledger, the store's rotation point and
// LedgerMaster's complete range, with no gap remaining.
BEAST_EXPECT(maxSeq != lastRotated + deleteInterval);
BEAST_EXPECTS(
env.closed()->info().seq == maxSeq,
failureMessage("maxSeq", maxSeq, env.closed()->info().seq));
BEAST_EXPECTS(
store.getLastRotated() == lastRotated,
failureMessage(
"lastRotated", lastRotated, store.getLastRotated()));
std::stringstream expectedRange;
expectedRange << minSeq << "-" << maxSeq;
BEAST_EXPECTS(
lm.getCompleteLedgers() == expectedRange.str(),
failureMessage(
"CompleteLedgers",
expectedRange.str(),
lm.getCompleteLedgers()));
BEAST_EXPECT(
lm.missingFromCompleteLedgerRange(minSeq, maxSeq) == 0);
BEAST_EXPECT(
lm.missingFromCompleteLedgerRange(minSeq + 1, maxSeq - 1) == 0);
BEAST_EXPECT(
lm.missingFromCompleteLedgerRange(minSeq - 1, maxSeq + 1) == 2);
BEAST_EXPECT(
lm.missingFromCompleteLedgerRange(minSeq - 2, maxSeq - 2) == 2);
BEAST_EXPECT(
lm.missingFromCompleteLedgerRange(minSeq + 2, maxSeq + 2) == 2);
}
}
void
run() override
{
@@ -812,7 +633,6 @@ public:
testAutomatic();
testCanDelete();
testRotate();
testLedgerGaps();
}
};

View File

@@ -58,17 +58,6 @@ envconfig(F&& modfunc, Args&&... args)
return modfunc(envconfig(), std::forward<Args>(args)...);
}
/// @brief adjust config to enable online_delete
///
/// @param cfg config instance to be modified
///
/// @param deleteInterval how many new ledgers should be available before
/// rotating. Defaults to 8, because the standalone minimum is 8.
///
/// @return unique_ptr to Config instance
std::unique_ptr<Config>
online_delete(std::unique_ptr<Config> cfg, std::uint32_t deleteInterval = 8);
/// @brief adjust config so no admin ports are enabled
///
/// this is intended for use with envconfig, as in

View File

@@ -53,15 +53,6 @@ setupConfigForUnitTests(Config& cfg)
namespace jtx {
/// @brief Enable online_delete on a test configuration.
///
/// Stores deleteInterval as the "online_delete" value of the node database
/// section and assigns the same number to LEDGER_HISTORY.
///
/// @param cfg config instance to be modified
/// @param deleteInterval value written to both settings
///
/// @return the same Config instance, handed back to the caller
std::unique_ptr<Config>
online_delete(std::unique_ptr<Config> cfg, std::uint32_t deleteInterval)
{
    cfg->section(ConfigSection::nodeDatabase())
        .set("online_delete", std::to_string(deleteInterval));
    cfg->LEDGER_HISTORY = deleteInterval;
    return cfg;
}
std::unique_ptr<Config>
no_admin(std::unique_ptr<Config> cfg)
{

View File

@@ -108,10 +108,7 @@ public:
failedSave(std::uint32_t seq, uint256 const& hash);
std::string
getCompleteLedgers() const;
std::size_t
missingFromCompleteLedgerRange(LedgerIndex first, LedgerIndex last) const;
getCompleteLedgers();
/** Apply held transactions to the open ledger
This is normally called as we close the ledger.
@@ -328,7 +325,7 @@ private:
// A set of transactions to replay during the next close
std::unique_ptr<LedgerReplay> replayData;
std::recursive_mutex mutable mCompleteLock;
std::recursive_mutex mCompleteLock;
RangeSet<std::uint32_t> mCompleteLedgers;
// Publish thread is running.

View File

@@ -1571,36 +1571,12 @@ LedgerMaster::getPublishedLedger()
}
std::string
LedgerMaster::getCompleteLedgers() const
LedgerMaster::getCompleteLedgers()
{
std::lock_guard sl(mCompleteLock);
return to_string(mCompleteLedgers);
}
/** Count the ledger sequences in the closed interval [first, last] that
    are not contained in mCompleteLedgers.

    The set is copied while mCompleteLock is held and the counting is done
    on the copy.

    @param first Lowest sequence to check (inclusive).
    @param last Highest sequence to check (inclusive).
    @return The number of sequences in [first, last] absent from the set;
            0 when first > last.
*/
std::size_t
LedgerMaster::missingFromCompleteLedgerRange(
    LedgerIndex first,
    LedgerIndex last) const
{
    // An inverted interval is empty, so nothing can be missing from it.
    if (first > last)
        return 0;

    // Make a copy of the range to avoid holding the lock
    auto const range = [&] {
        std::lock_guard sl(mCompleteLock);
        return mCompleteLedgers;
    }();

    std::size_t missing = 0;
    // The exit test is at the bottom so that idx is never incremented past
    // `last`. A top-tested `idx <= last` can never become false when `last`
    // is the largest value LedgerIndex can hold: the increment would wrap
    // around (or overflow) instead of ending the loop.
    for (LedgerIndex idx = first;; ++idx)
    {
        if (!boost::icl::contains(range, idx))
            ++missing;
        if (idx == last)
            break;
    }
    return missing;
}
std::optional<NetClock::time_point>
LedgerMaster::getCloseTimeBySeq(LedgerIndex ledgerIndex)
{

View File

@@ -289,18 +289,6 @@ SHAMapStoreImp::run()
validatedSeq >= lastRotated + deleteInterval_ &&
canDelete_ >= lastRotated - 1 && healthWait() == keepGoing;
JLOG(journal_.debug())
<< "run: Setting lastGoodValidatedLedger_ to " << validatedSeq;
{
// Note that this is set after the healthWait() check, so that we
// don't start the rotation until the validated ledger is fully
// processed. It is not guaranteed to be done at this point. It also
// allows the testLedgerGaps unit test to work.
std::unique_lock<std::mutex> lock(mutex_);
lastGoodValidatedLedger_ = validatedSeq;
}
// will delete up to (not including) lastRotated
if (readyToRotate)
{
@@ -309,9 +297,7 @@ SHAMapStoreImp::run()
<< lastRotated << " deleteInterval " << deleteInterval_
<< " canDelete_ " << canDelete_ << " state "
<< app_.getOPs().strOperatingMode(false) << " age "
<< ledgerMaster_->getValidatedLedgerAge().count()
<< "s. Complete ledgers: "
<< ledgerMaster_->getCompleteLedgers();
<< ledgerMaster_->getValidatedLedgerAge().count() << 's';
clearPrior(lastRotated);
if (healthWait() == stopping)
@@ -374,10 +360,7 @@ SHAMapStoreImp::run()
clearCaches(validatedSeq);
});
JLOG(journal_.warn())
<< "finished rotation. validatedSeq: " << validatedSeq
<< ", lastRotated: " << lastRotated << ". Complete ledgers: "
<< ledgerMaster_->getCompleteLedgers();
JLOG(journal_.warn()) << "finished rotation " << validatedSeq;
}
}
}
@@ -632,47 +615,22 @@ SHAMapStoreImp::clearPrior(LedgerIndex lastRotated)
SHAMapStoreImp::HealthResult
SHAMapStoreImp::healthWait()
{
auto index = ledgerMaster_->getValidLedgerIndex();
auto age = ledgerMaster_->getValidatedLedgerAge();
OperatingMode mode = netOPs_->getOperatingMode();
std::unique_lock lock(mutex_);
auto numMissing = ledgerMaster_->missingFromCompleteLedgerRange(
lastGoodValidatedLedger_, index);
while (
!stop_ &&
(mode != OperatingMode::FULL || age > ageThreshold_ || numMissing > 0))
while (!stop_ && (mode != OperatingMode::FULL || age > ageThreshold_))
{
// this value shouldn't change, so grab it while we have the
// lock
auto const lastGood = lastGoodValidatedLedger_;
lock.unlock();
auto const stream = mode != OperatingMode::FULL || age > ageThreshold_
? journal_.warn()
: journal_.info();
JLOG(stream) << "Waiting " << recoveryWaitTime_.count()
<< "s for node to stabilize. state: "
<< app_.getOPs().strOperatingMode(mode, false) << ". age "
<< age.count() << "s. Missing ledgers: " << numMissing
<< ". Expect: " << lastGood << "-" << index
<< ". Complete ledgers: "
<< ledgerMaster_->getCompleteLedgers();
JLOG(journal_.warn()) << "Waiting " << recoveryWaitTime_.count()
<< "s for node to stabilize. state: "
<< app_.getOPs().strOperatingMode(mode, false)
<< ". age " << age.count() << 's';
std::this_thread::sleep_for(recoveryWaitTime_);
index = ledgerMaster_->getValidLedgerIndex();
age = ledgerMaster_->getValidatedLedgerAge();
mode = netOPs_->getOperatingMode();
numMissing =
ledgerMaster_->missingFromCompleteLedgerRange(lastGood, index);
lock.lock();
}
JLOG(journal_.debug()) << "healthWait: Setting lastGoodValidatedLedger_ to "
<< index;
lastGoodValidatedLedger_ = index;
return stop_ ? stopping : keepGoing;
}

View File

@@ -71,11 +71,6 @@ private:
std::thread thread_;
bool stop_ = false;
bool healthy_ = true;
// Used to prevent ledger gaps from forming during online deletion. Keeps
// track of the last validated ledger that was processed without gaps. There
// are no guarantees about gaps while online delete is not running. For
// that, use advisory_delete and check for gaps externally.
LedgerIndex lastGoodValidatedLedger_ = 0;
mutable std::condition_variable cond_;
mutable std::condition_variable rendezvous_;
mutable std::mutex mutex_;
@@ -89,11 +84,11 @@ private:
std::uint32_t deleteBatch_ = 100;
std::chrono::milliseconds backOff_{100};
std::chrono::seconds ageThreshold_{60};
/// If the node is out of sync, or any recent ledgers are not
/// available during an online_delete healthWait() call, sleep
/// the thread for this time, and continue checking until recovery.
/// If the node is out of sync during an online_delete healthWait()
/// call, sleep the thread for this time, and continue checking until
/// recovery.
/// See also: "recovery_wait_seconds" in rippled-example.cfg
std::chrono::seconds recoveryWaitTime_{1};
std::chrono::seconds recoveryWaitTime_{5};
// these do not exist upon SHAMapStore creation, but do exist
// as of run() or before
@@ -217,8 +212,6 @@ private:
enum HealthResult { stopping, keepGoing };
[[nodiscard]] HealthResult
healthWait();
bool
hasCompleteRange(LedgerIndex first, LedgerIndex last);
public:
void

View File

@@ -129,7 +129,12 @@ ValidatorSite::load(
{
try
{
sites_.emplace_back(uri);
// This is not super efficient, but it doesn't happen often.
bool found = std::ranges::any_of(sites_, [&uri](auto const& site) {
return site.loadedResource->uri == uri;
});
if (!found)
sites_.emplace_back(uri);
}
catch (std::exception const& e)
{
@@ -191,6 +196,17 @@ ValidatorSite::setTimer(
std::lock_guard<std::mutex> const& site_lock,
std::lock_guard<std::mutex> const& state_lock)
{
if (!sites_.empty() && //
std::ranges::all_of(sites_, [](auto const& site) {
return site.lastRefreshStatus.has_value();
}))
{
// If all of the sites have been handled at least once (including
// errors and timeouts), call missingSite, which will load the cache
// files for any lists that are still unavailable.
missingSite(site_lock);
}
auto next = std::min_element(
sites_.begin(), sites_.end(), [](Site const& a, Site const& b) {
return a.nextRefresh < b.nextRefresh;
@@ -303,13 +319,16 @@ ValidatorSite::onRequestTimeout(std::size_t siteIdx, error_code const& ec)
// processes a network error. Usually, this function runs first,
// but on extremely rare occasions, the response handler can run
// first, which will leave activeResource empty.
auto const& site = sites_[siteIdx];
auto& site = sites_[siteIdx];
if (site.activeResource)
JLOG(j_.warn()) << "Request for " << site.activeResource->uri
<< " took too long";
else
JLOG(j_.error()) << "Request took too long, but a response has "
"already been processed";
if (!site.lastRefreshStatus)
site.lastRefreshStatus.emplace(Site::Status{
clock_type::now(), ListDisposition::invalid, "timeout"});
}
std::lock_guard lock_state{state_mutex_};