Compare commits

..

30 Commits

Author SHA1 Message Date
Ed Hennis
bf0b10404d Fix formatting 2026-01-28 19:40:27 -05:00
Ed Hennis
d019ebaf36 Merge branch 'develop' into ximinez/fix/validator-cache 2026-01-28 18:44:52 -04:00
Ed Hennis
b6e4620349 Merge branch 'develop' into ximinez/fix/validator-cache 2026-01-15 13:03:28 -04:00
Ed Hennis
db0ef6a370 Merge branch 'develop' into ximinez/fix/validator-cache 2026-01-15 12:05:56 -04:00
Ed Hennis
11a45a0ac2 Merge branch 'develop' into ximinez/fix/validator-cache 2026-01-13 18:19:08 -04:00
Ed Hennis
aa035f4cfd Merge branch 'develop' into ximinez/fix/validator-cache 2026-01-13 15:27:57 -04:00
Ed Hennis
8988f9117f Merge branch 'develop' into ximinez/fix/validator-cache 2026-01-12 14:52:12 -04:00
Ed Hennis
ae4f379845 Merge branch 'develop' into ximinez/fix/validator-cache 2026-01-11 00:50:40 -04:00
Ed Hennis
671aa11649 Merge branch 'develop' into ximinez/fix/validator-cache 2026-01-08 17:06:06 -04:00
Ed Hennis
53d35fd8ea Merge branch 'develop' into ximinez/fix/validator-cache 2026-01-08 13:04:16 -04:00
Ed Hennis
0c7ea2e333 Merge branch 'develop' into ximinez/fix/validator-cache 2026-01-06 14:02:10 -05:00
Ed Hennis
5f54be25e9 Merge branch 'develop' into ximinez/fix/validator-cache 2025-12-22 17:39:55 -05:00
Ed Hennis
d82756519c Merge branch 'develop' into ximinez/fix/validator-cache 2025-12-18 19:59:49 -05:00
Ed Hennis
1f23832659 Merge branch 'develop' into ximinez/fix/validator-cache 2025-12-12 20:34:55 -05:00
Ed Hennis
4c50969bde Merge branch 'develop' into ximinez/fix/validator-cache 2025-12-11 15:31:29 -05:00
Ed Hennis
aabdf372dd Merge branch 'develop' into ximinez/fix/validator-cache 2025-12-05 21:13:06 -05:00
Ed Hennis
c6d63a4b90 Merge branch 'develop' into ximinez/fix/validator-cache 2025-12-02 17:37:25 -05:00
Ed Hennis
1e6c3208db Merge branch 'develop' into ximinez/fix/validator-cache 2025-12-01 14:40:41 -05:00
Ed Hennis
a74f223efb Merge branch 'develop' into ximinez/fix/validator-cache 2025-11-28 15:46:40 -05:00
Ed Hennis
1eb3a3ea5a Merge branch 'develop' into ximinez/fix/validator-cache 2025-11-27 01:48:53 -05:00
Ed Hennis
630e428929 Merge branch 'develop' into ximinez/fix/validator-cache 2025-11-26 00:25:12 -05:00
Ed Hennis
3f93edc5e0 Merge branch 'develop' into ximinez/fix/validator-cache 2025-11-25 14:55:02 -05:00
Ed Hennis
baf62689ff Merge branch 'develop' into ximinez/fix/validator-cache 2025-11-24 21:49:07 -05:00
Ed Hennis
ddf7d6cac4 Merge branch 'develop' into ximinez/fix/validator-cache 2025-11-24 21:30:18 -05:00
Ed Hennis
fcd2ea2d6e Merge branch 'develop' into ximinez/fix/validator-cache 2025-11-21 12:47:54 -05:00
Ed Hennis
a16aa5b12f Merge branch 'develop' into ximinez/fix/validator-cache 2025-11-18 22:39:25 -05:00
Ed Hennis
ef2de81870 Merge branch 'develop' into ximinez/fix/validator-cache 2025-11-15 03:08:38 -05:00
Ed Hennis
fce6757260 Merge branch 'develop' into ximinez/fix/validator-cache 2025-11-13 12:19:10 -05:00
Ed Hennis
d759a0a2b0 Merge branch 'develop' into ximinez/fix/validator-cache 2025-11-12 14:12:51 -05:00
Ed Hennis
d2dda416e8 Use Validator List (VL) cache files in more scenarios
- If any [validator_list_keys] are not available after all
  [validator_list_sites] have had a chance to be queried, then fall
  back to loading cache files. Currently, cache files are only used if
  no sites are defined, or the request to one of them has an error. It
  does not include cases where not enough sites are defined, or if a
  site returns an invalid VL (or something else entirely).
- Resolves #5320
2025-11-10 19:53:02 -05:00
7 changed files with 208 additions and 41 deletions

View File

@@ -940,7 +940,23 @@
#
# path Location to store the database
#
# Optional keys for NuDB and RocksDB:
# Optional keys
#
# cache_size Size of cache for database records. Default is 16384.
# Setting this value to 0 will use the default value.
#
# cache_age Length of time in minutes to keep database records
# cached. Default is 5 minutes. Setting this value to
# 0 will use the default value.
#
# Note: if neither cache_size nor cache_age is
# specified, the cache for database records will not
# be created. If only one of cache_size or cache_age
# is specified, the cache will be created using the
# default value for the unspecified parameter.
#
# Note: the cache will not be created if online_delete
# is specified.
#
# fast_load Boolean. If set, load the last persisted ledger
# from disk upon process start before syncing to
@@ -948,6 +964,8 @@
# if sufficient IOPS capacity is available.
# Default 0.
#
# Optional keys for NuDB or RocksDB:
#
# earliest_seq The default is 32570 to match the XRP ledger
# network's earliest allowed sequence. Alternate
# networks may set this value. Minimum value of 1.

View File

@@ -24,6 +24,32 @@ public:
beast::Journal j)
: Database(scheduler, readThreads, config, j), backend_(std::move(backend))
{
std::optional<int> cacheSize, cacheAge;
if (config.exists("cache_size"))
{
cacheSize = get<int>(config, "cache_size");
if (cacheSize.value() < 0)
{
Throw<std::runtime_error>("Specified negative value for cache_size");
}
}
if (config.exists("cache_age"))
{
cacheAge = get<int>(config, "cache_age");
if (cacheAge.value() < 0)
{
Throw<std::runtime_error>("Specified negative value for cache_age");
}
}
if (cacheSize != 0 || cacheAge != 0)
{
cache_ = std::make_shared<TaggedCache<uint256, NodeObject>>(
"DatabaseNodeImp", cacheSize.value_or(0), std::chrono::minutes(cacheAge.value_or(0)), stopwatch(), j);
}
XRPL_ASSERT(
backend_,
"xrpl::NodeStore::DatabaseNodeImp::DatabaseNodeImp : non-null "
@@ -82,6 +108,9 @@ public:
sweep() override;
private:
// Cache for database objects. This cache is not always initialized. Check
// for null before using.
std::shared_ptr<TaggedCache<uint256, NodeObject>> cache_;
// Persistent key/value storage
std::shared_ptr<Backend> backend_;

View File

@@ -10,6 +10,11 @@ DatabaseNodeImp::store(NodeObjectType type, Blob&& data, uint256 const& hash, st
auto obj = NodeObject::createObject(type, std::move(data), hash);
backend_->store(obj);
if (cache_)
{
// After the store, replace a negative cache entry if there is one
cache_->canonicalize(hash, obj, [](std::shared_ptr<NodeObject> const& n) { return n->getType() == hotDUMMY; });
}
}
void
@@ -18,41 +23,77 @@ DatabaseNodeImp::asyncFetch(
std::uint32_t ledgerSeq,
std::function<void(std::shared_ptr<NodeObject> const&)>&& callback)
{
if (cache_)
{
std::shared_ptr<NodeObject> obj = cache_->fetch(hash);
if (obj)
{
callback(obj->getType() == hotDUMMY ? nullptr : obj);
return;
}
}
Database::asyncFetch(hash, ledgerSeq, std::move(callback));
}
void
DatabaseNodeImp::sweep()
{
if (cache_)
cache_->sweep();
}
std::shared_ptr<NodeObject>
DatabaseNodeImp::fetchNodeObject(uint256 const& hash, std::uint32_t, FetchReport& fetchReport, bool duplicate)
{
std::shared_ptr<NodeObject> nodeObject = nullptr;
Status status;
std::shared_ptr<NodeObject> nodeObject = cache_ ? cache_->fetch(hash) : nullptr;
try
if (!nodeObject)
{
status = backend_->fetch(hash.data(), &nodeObject);
}
catch (std::exception const& e)
{
JLOG(j_.fatal()) << "fetchNodeObject " << hash << ": Exception fetching from backend: " << e.what();
Rethrow();
}
JLOG(j_.trace()) << "fetchNodeObject " << hash << ": record not " << (cache_ ? "cached" : "found");
switch (status)
Status status;
try
{
status = backend_->fetch(hash.data(), &nodeObject);
}
catch (std::exception const& e)
{
JLOG(j_.fatal()) << "fetchNodeObject " << hash << ": Exception fetching from backend: " << e.what();
Rethrow();
}
switch (status)
{
case ok:
if (cache_)
{
if (nodeObject)
cache_->canonicalize_replace_client(hash, nodeObject);
else
{
auto notFound = NodeObject::createObject(hotDUMMY, {}, hash);
cache_->canonicalize_replace_client(hash, notFound);
if (notFound->getType() != hotDUMMY)
nodeObject = notFound;
}
}
break;
case notFound:
break;
case dataCorrupt:
JLOG(j_.fatal()) << "fetchNodeObject " << hash << ": nodestore data is corrupted";
break;
default:
JLOG(j_.warn()) << "fetchNodeObject " << hash << ": backend returns unknown result " << status;
break;
}
}
else
{
case ok:
case notFound:
break;
case dataCorrupt:
JLOG(j_.fatal()) << "fetchNodeObject " << hash << ": nodestore data is corrupted";
break;
default:
JLOG(j_.warn()) << "fetchNodeObject " << hash << ": backend returns unknown result " << status;
break;
JLOG(j_.trace()) << "fetchNodeObject " << hash << ": record found in cache";
if (nodeObject->getType() == hotDUMMY)
nodeObject.reset();
}
if (nodeObject)
@@ -64,29 +105,66 @@ DatabaseNodeImp::fetchNodeObject(uint256 const& hash, std::uint32_t, FetchReport
std::vector<std::shared_ptr<NodeObject>>
DatabaseNodeImp::fetchBatch(std::vector<uint256> const& hashes)
{
std::vector<std::shared_ptr<NodeObject>> results{hashes.size()};
using namespace std::chrono;
auto const before = steady_clock::now();
std::vector<uint256 const*> batch{hashes.size()};
std::unordered_map<uint256 const*, size_t> indexMap;
std::vector<uint256 const*> cacheMisses;
uint64_t hits = 0;
uint64_t fetches = 0;
for (size_t i = 0; i < hashes.size(); ++i)
{
auto const& hash = hashes[i];
batch.push_back(&hash);
}
std::vector<std::shared_ptr<NodeObject>> results{hashes.size()};
results = backend_->fetchBatch(batch).first;
for (size_t i = 0; i < results.size(); ++i)
{
if (!results[i])
// See if the object already exists in the cache
auto nObj = cache_ ? cache_->fetch(hash) : nullptr;
++fetches;
if (!nObj)
{
JLOG(j_.error()) << "fetchBatch - "
<< "record not found in db. hash = " << strHex(hashes[i]);
// Try the database
indexMap[&hash] = i;
cacheMisses.push_back(&hash);
}
else
{
results[i] = nObj->getType() == hotDUMMY ? nullptr : nObj;
// It was in the cache.
++hits;
}
}
JLOG(j_.debug()) << "fetchBatch - cache hits = " << (hashes.size() - cacheMisses.size())
<< " - cache misses = " << cacheMisses.size();
auto dbResults = backend_->fetchBatch(cacheMisses).first;
for (size_t i = 0; i < dbResults.size(); ++i)
{
auto nObj = std::move(dbResults[i]);
size_t index = indexMap[cacheMisses[i]];
auto const& hash = hashes[index];
if (nObj)
{
// Ensure all threads get the same object
if (cache_)
cache_->canonicalize_replace_client(hash, nObj);
}
else
{
JLOG(j_.error()) << "fetchBatch - "
<< "record not found in db or cache. hash = " << strHex(hash);
if (cache_)
{
auto notFound = NodeObject::createObject(hotDUMMY, {}, hash);
cache_->canonicalize_replace_client(hash, notFound);
if (notFound->getType() != hotDUMMY)
nObj = std::move(notFound);
}
}
results[index] = std::move(nObj);
}
auto fetchDurationUs = std::chrono::duration_cast<std::chrono::microseconds>(steady_clock::now() - before).count();
updateFetchMetrics(hashes.size(), 0, fetchDurationUs);
updateFetchMetrics(fetches, hits, fetchDurationUs);
return results;
}

View File

@@ -490,8 +490,19 @@ public:
Env env(*this, envconfig(onlineDelete));
/////////////////////////////////////////////////////////////
// Create NodeStore with two backends to allow online deletion of data.
// Normally, SHAMapStoreImp handles all these details.
// Create the backend. Normally, SHAMapStoreImp handles all these
// details
auto nscfg = env.app().config().section(ConfigSection::nodeDatabase());
// Provide default values:
if (!nscfg.exists("cache_size"))
nscfg.set(
"cache_size", std::to_string(env.app().config().getValueFor(SizedItem::treeCacheSize, std::nullopt)));
if (!nscfg.exists("cache_age"))
nscfg.set(
"cache_age", std::to_string(env.app().config().getValueFor(SizedItem::treeCacheAge, std::nullopt)));
NodeStoreScheduler scheduler(env.app().getJobQueue());
std::string const writableDb = "write";
@@ -499,8 +510,9 @@ public:
auto writableBackend = makeBackendRotating(env, scheduler, writableDb);
auto archiveBackend = makeBackendRotating(env, scheduler, archiveDb);
// Create NodeStore with two backends to allow online deletion of
// data
constexpr int readThreads = 4;
auto nscfg = env.app().config().section(ConfigSection::nodeDatabase());
auto dbr = std::make_unique<NodeStore::DatabaseRotatingImp>(
scheduler,
readThreads,

View File

@@ -130,6 +130,14 @@ std::unique_ptr<NodeStore::Database>
SHAMapStoreImp::makeNodeStore(int readThreads)
{
auto nscfg = app_.config().section(ConfigSection::nodeDatabase());
// Provide default values:
if (!nscfg.exists("cache_size"))
nscfg.set("cache_size", std::to_string(app_.config().getValueFor(SizedItem::treeCacheSize, std::nullopt)));
if (!nscfg.exists("cache_age"))
nscfg.set("cache_age", std::to_string(app_.config().getValueFor(SizedItem::treeCacheAge, std::nullopt)));
std::unique_ptr<NodeStore::Database> db;
if (deleteInterval_)
@@ -218,6 +226,8 @@ SHAMapStoreImp::run()
LedgerIndex lastRotated = state_db_.getState().lastRotated;
netOPs_ = &app_.getOPs();
ledgerMaster_ = &app_.getLedgerMaster();
fullBelowCache_ = &(*app_.getNodeFamily().getFullBelowCache());
treeNodeCache_ = &(*app_.getNodeFamily().getTreeNodeCache());
if (advisoryDelete_)
canDelete_ = state_db_.getCanDelete();
@@ -480,13 +490,16 @@ void
SHAMapStoreImp::clearCaches(LedgerIndex validatedSeq)
{
ledgerMaster_->clearLedgerCachePrior(validatedSeq);
fullBelowCache_->clear();
}
void
SHAMapStoreImp::freshenCaches()
{
freshenCache(*app_.getNodeFamily().getTreeNodeCache());
freshenCache(app_.getMasterTransaction().getCache());
if (freshenCache(*treeNodeCache_))
return;
if (freshenCache(app_.getMasterTransaction().getCache()))
return;
}
void

View File

@@ -94,6 +94,8 @@ private:
// as of run() or before
NetworkOPs* netOPs_ = nullptr;
LedgerMaster* ledgerMaster_ = nullptr;
FullBelowCache* fullBelowCache_ = nullptr;
TreeNodeCache* treeNodeCache_ = nullptr;
static constexpr auto nodeStoreName_ = "NodeStore";

View File

@@ -124,7 +124,11 @@ ValidatorSite::load(std::vector<std::string> const& siteURIs, std::lock_guard<st
{
try
{
sites_.emplace_back(uri);
// This is not super efficient, but it doesn't happen often.
bool found =
std::ranges::any_of(sites_, [&uri](auto const& site) { return site.loadedResource->uri == uri; });
if (!found)
sites_.emplace_back(uri);
}
catch (std::exception const& e)
{
@@ -183,6 +187,15 @@ ValidatorSite::stop()
void
ValidatorSite::setTimer(std::lock_guard<std::mutex> const& site_lock, std::lock_guard<std::mutex> const& state_lock)
{
if (!sites_.empty() && //
std::ranges::all_of(sites_, [](auto const& site) { return site.lastRefreshStatus.has_value(); }))
{
// If all of the sites have been handled at least once (including
// errors and timeouts), call missingSite, which will load the cache
// files for any lists that are still unavailable.
missingSite(site_lock);
}
auto next = std::min_element(
sites_.begin(), sites_.end(), [](Site const& a, Site const& b) { return a.nextRefresh < b.nextRefresh; });
@@ -285,12 +298,14 @@ ValidatorSite::onRequestTimeout(std::size_t siteIdx, error_code const& ec)
// processes a network error. Usually, this function runs first,
// but on extremely rare occasions, the response handler can run
// first, which will leave activeResource empty.
auto const& site = sites_[siteIdx];
auto& site = sites_[siteIdx];
if (site.activeResource)
JLOG(j_.warn()) << "Request for " << site.activeResource->uri << " took too long";
else
JLOG(j_.error()) << "Request took too long, but a response has "
"already been processed";
if (!site.lastRefreshStatus)
site.lastRefreshStatus.emplace(Site::Status{clock_type::now(), ListDisposition::invalid, "timeout"});
}
std::lock_guard lock_state{state_mutex_};