Compare commits

..

1 Commits

Author SHA1 Message Date
Bart
f586382622 perf: Improve IOPS when reading from and writing to NuDB and RocksDB 2026-03-17 17:56:59 -04:00
7 changed files with 306 additions and 47 deletions

View File

@@ -138,6 +138,9 @@ public:
/** Returns the number of file descriptors the backend expects to need. */
virtual int
fdRequired() const = 0;
/** The number of hardware threads to use for compression of a batch. */
static unsigned int const numHardwareThreads;
};
} // namespace NodeStore

View File

@@ -0,0 +1,18 @@
#include <xrpl/nodestore/Backend.h>
#include <algorithm>
#include <thread>
namespace xrpl {
namespace NodeStore {
// Number of hardware threads available for compressing a batch.
// std::thread::hardware_concurrency() may report 0 when the platform cannot
// determine the core count, so clamp the detected value into [1, 8]: always
// at least one worker, and never so many threads that they contend.
unsigned int const Backend::numHardwareThreads = []() {
    unsigned int const detected = std::thread::hardware_concurrency();
    return std::clamp(detected, 1u, 8u);
}();
} // namespace NodeStore
} // namespace xrpl

View File

@@ -7,15 +7,21 @@
#include <xrpl/nodestore/detail/EncodedBlob.h>
#include <xrpl/nodestore/detail/codec.h>
#include <boost/asio/post.hpp>
#include <boost/asio/thread_pool.hpp>
#include <boost/filesystem.hpp>
#include <nudb/nudb.hpp>
#include <atomic>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <exception>
#include <latch>
#include <memory>
#include <thread>
#include <vector>
namespace xrpl {
namespace NodeStore {
@@ -37,6 +43,7 @@ public:
nudb::store db_;
std::atomic<bool> deletePath_;
Scheduler& scheduler_;
boost::asio::thread_pool threadPool_;
NuDBBackend(
size_t keyBytes,
@@ -51,6 +58,7 @@ public:
, blockSize_(parseBlockSize(name_, keyValues, journal))
, deletePath_(false)
, scheduler_(scheduler)
, threadPool_(numHardwareThreads)
{
if (name_.empty())
Throw<std::runtime_error>("nodestore: Missing path in NuDB backend");
@@ -71,6 +79,7 @@ public:
, db_(context)
, deletePath_(false)
, scheduler_(scheduler)
, threadPool_(numHardwareThreads)
{
if (name_.empty())
Throw<std::runtime_error>("nodestore: Missing path in NuDB backend");
@@ -184,6 +193,7 @@ public:
Status status = ok;
pno->reset();
nudb::error_code ec;
db_.fetch(
hash.data(),
[&hash, pno, &status](void const* data, std::size_t size) {
@@ -199,6 +209,7 @@ public:
status = ok;
},
ec);
if (ec == nudb::error::key_not_found)
return notFound;
if (ec)
@@ -209,18 +220,62 @@ public:
std::pair<std::vector<std::shared_ptr<NodeObject>>, Status>
fetchBatch(std::vector<uint256> const& hashes) override
{
std::vector<std::shared_ptr<NodeObject>> results;
results.reserve(hashes.size());
for (auto const& h : hashes)
std::vector<std::shared_ptr<NodeObject>> results(hashes.size());
// Determine the number of threads to use for fetching data from the number of available
// cores and the size of the batch. We would like each thread to process at least 4 items;
// only the last thread may end up processing fewer.
auto const numThreads = std::min(
std::max(static_cast<unsigned int>(hashes.size()) / 4u, 1u), numHardwareThreads);
// If we need only one thread, just do it sequentially.
if (numThreads == 1u)
{
std::shared_ptr<NodeObject> nObj;
Status status = fetch(h, &nObj);
if (status != ok)
results.push_back({});
else
results.push_back(nObj);
for (size_t i = 0; i < hashes.size(); ++i)
{
std::shared_ptr<NodeObject> nObj;
if (fetch(hashes[i], &nObj) == ok)
results[i] = nObj;
}
return {results, ok};
}
// Use a latch to synchronize task completion.
std::latch taskCompletion(numThreads);
// Submit fetch tasks to the thread pool.
auto const itemsPerThread = (hashes.size() + numThreads - 1) / numThreads;
for (unsigned int t = 0; t < numThreads; ++t)
{
auto const startIdx = t * itemsPerThread;
XRPL_ASSERT(
startIdx < hashes.size(),
"xrpl::NuDBFactory::fetchBatch : startIdx < hashes.size()");
if (startIdx >= hashes.size())
{
taskCompletion.count_down();
continue;
}
auto const endIdx = std::min(startIdx + itemsPerThread, hashes.size());
auto task = [this, &hashes, &results, &taskCompletion, startIdx, endIdx]() {
// Fetch the items assigned to this task.
for (size_t i = startIdx; i < endIdx; ++i)
{
std::shared_ptr<NodeObject> nObj;
if (fetch(hashes[i], &nObj) == ok)
results[i] = nObj;
}
// Signal task completion.
taskCompletion.count_down();
};
boost::asio::post(threadPool_, std::move(task));
}
// Wait for all fetch tasks to complete.
taskCompletion.wait();
return {results, ok};
}
@@ -228,9 +283,11 @@ public:
do_insert(std::shared_ptr<NodeObject> const& no)
{
EncodedBlob e(no);
nudb::error_code ec;
nudb::detail::buffer bf;
auto const result = nodeobject_compress(e.getData(), e.getSize(), bf);
nudb::error_code ec;
db_.insert(e.getKey(), result.first, result.second, ec);
if (ec && ec != nudb::error::key_exists)
Throw<nudb::system_error>(ec);
@@ -242,7 +299,11 @@ public:
BatchWriteReport report{};
report.writeCount = 1;
auto const start = std::chrono::steady_clock::now();
++pendingWrites_;
do_insert(no);
--pendingWrites_;
report.elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::steady_clock::now() - start);
scheduler_.onBatchWrite(report);
@@ -254,8 +315,109 @@ public:
BatchWriteReport report{};
report.writeCount = batch.size();
auto const start = std::chrono::steady_clock::now();
for (auto const& e : batch)
do_insert(e);
pendingWrites_ += static_cast<int>(batch.size());
// Determine the number of threads to use for data compression from the number of available
// cores and the size of the batch. We would like each thread to process at least 4 items;
// only the last thread may end up processing fewer.
auto const numThreads = std::min(
std::max(static_cast<unsigned int>(batch.size()) / 4u, 1u), numHardwareThreads);
// If we need only one thread, just do it sequentially.
if (numThreads == 1u)
{
for (auto const& e : batch)
do_insert(e);
pendingWrites_ -= static_cast<int>(batch.size());
report.elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::steady_clock::now() - start);
scheduler_.onBatchWrite(report);
return;
}
// Helper struct that stores actual item data, not pointers, to avoid dangling references
// after EncodedBlob and buffer go out of scope in the thread.
struct CompressedData
{
std::vector<std::uint8_t> key;
std::vector<std::uint8_t> data;
std::exception_ptr eptr;
};
std::vector<CompressedData> compressed(batch.size());
// Use a latch to synchronize task completion.
std::latch taskCompletion(numThreads);
// Submit compression tasks to the thread pool.
auto const itemsPerThread = (batch.size() + numThreads - 1) / numThreads;
for (unsigned int t = 0; t < numThreads; ++t)
{
auto const startIdx = t * itemsPerThread;
XRPL_ASSERT(
startIdx < batch.size(), "xrpl::NuDBFactory::storeBatch : startIdx < batch.size()");
if (startIdx >= batch.size())
{
taskCompletion.count_down();
continue;
}
auto const endIdx = std::min(startIdx + itemsPerThread, batch.size());
auto task =
[&batch, &compressed, &taskCompletion, startIdx, endIdx, keyBytes = keyBytes_]() {
// Compress the items assigned to this task.
for (size_t i = startIdx; i < endIdx; ++i)
{
auto& item = compressed[i];
try
{
EncodedBlob e(batch[i]);
// Copy the key data to avoid dangling pointer.
auto const* keyPtr = static_cast<std::uint8_t const*>(e.getKey());
item.key.assign(keyPtr, keyPtr + keyBytes);
// Compress and copy the data to avoid dangling pointer.
nudb::detail::buffer bf;
auto const comp = nodeobject_compress(e.getData(), e.getSize(), bf);
auto const* dataPtr = static_cast<std::uint8_t const*>(comp.first);
item.data.assign(dataPtr, dataPtr + comp.second);
}
catch (...)
{
// Store the exception so it can be rethrown in the sequential phase
// below.
item.eptr = std::current_exception();
}
}
// Signal task completion.
taskCompletion.count_down();
};
boost::asio::post(threadPool_, std::move(task));
}
// Wait for all compression tasks to complete.
taskCompletion.wait();
// Insert the compressed data sequentially, since NuDB is designed as an append-only data
// store that only supports one writer.
for (auto const& item : compressed)
{
if (item.eptr)
{
std::rethrow_exception(item.eptr);
}
nudb::error_code ec;
db_.insert(item.key.data(), item.data.data(), item.data.size(), ec);
if (ec && ec != nudb::error::key_exists)
Throw<nudb::system_error>(ec);
}
pendingWrites_ -= static_cast<int>(batch.size());
report.elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::steady_clock::now() - start);
scheduler_.onBatchWrite(report);
@@ -272,7 +434,7 @@ public:
auto const dp = db_.dat_path();
auto const kp = db_.key_path();
auto const lp = db_.log_path();
// auto const appnum = db_.appnum();
nudb::error_code ec;
db_.close(ec);
if (ec)
@@ -306,7 +468,7 @@ public:
int
getWriteLoad() override
{
return 0;
return pendingWrites_.load();
}
void
@@ -341,6 +503,8 @@ public:
}
private:
std::atomic<int> pendingWrites_{0};
static std::size_t
parseBlockSize(std::string const& name, Section const& keyValues, beast::Journal journal)
{

View File

@@ -13,6 +13,7 @@
#include <atomic>
#include <memory>
#include <thread>
namespace xrpl {
namespace NodeStore {
@@ -181,6 +182,41 @@ public:
std::string("Unable to set RocksDB options: ") + s.ToString());
}
// Enable pipelined writes for better write concurrency.
m_options.enable_pipelined_write = true;
// Set background job parallelism for better compaction/flush performance to the number of
// hardware threads, unless the value is explicitly provided in the config. The default is
// 2 (see include/rocksdb/options.h in the Conan dependency directory), so don't use fewer
// than that.
if (auto v = get<unsigned int>(keyValues, "max_background_jobs", 0); v > 2)
{
m_options.max_background_jobs = v;
}
else if (v = numHardwareThreads; v > 2)
{
m_options.max_background_jobs = v;
}
// Set subcompactions for parallel compaction within a job to the number of hardware
// threads, unless the value is explicitly provided in the config. The default is 1 (see
// include/rocksdb/options.h in the Conan dependency directory), so don't use fewer
// than that if no value is explicitly provided.
if (auto v = get<unsigned int>(keyValues, "max_subcompactions", 0); v > 1)
{
m_options.max_subcompactions = v;
}
else if (v = numHardwareThreads / 2; v > 1)
{
m_options.max_subcompactions = v;
}
// Enable direct I/O by default unless explicitly disabled in the config. This bypasses the
// OS page cache for more predictable performance on SSDs.
m_options.use_direct_reads = get<bool>(keyValues, "use_direct_io", true);
m_options.use_direct_io_for_flush_and_compaction =
get<bool>(keyValues, "use_direct_io", true);
std::string s1, s2;
rocksdb::GetStringFromDBOptions(&s1, m_options, "; ");
rocksdb::GetStringFromColumnFamilyOptions(&s2, m_options, "; ");
@@ -253,23 +289,19 @@ public:
rocksdb::ReadOptions const options;
rocksdb::Slice const slice(std::bit_cast<char const*>(hash.data()), m_keyBytes);
std::string string;
rocksdb::Status getStatus = m_db->Get(options, slice, &string);
if (getStatus.ok())
{
DecodedBlob decoded(hash.data(), string.data(), string.size());
if (decoded.wasOk())
{
*pObject = decoded.createObject();
}
else
{
// Decoding failed, probably corrupted!
//
// Decoding failed, probably corrupted.
status = dataCorrupt;
}
}
@@ -286,7 +318,6 @@ public:
else
{
status = Status(customCode + unsafe_cast<int>(getStatus.code()));
JLOG(m_journal.error()) << getStatus.ToString();
}
}
@@ -297,16 +328,43 @@ public:
std::pair<std::vector<std::shared_ptr<NodeObject>>, Status>
fetchBatch(std::vector<uint256> const& hashes) override
{
std::vector<std::shared_ptr<NodeObject>> results;
results.reserve(hashes.size());
XRPL_ASSERT(m_db, "xrpl::NodeStore::RocksDBBackend::fetchBatch : non-null database");
if (hashes.empty())
return {{}, ok};
// Use MultiGet for parallel reads to allow RocksDB to fetch multiple keys concurrently,
// significantly improving throughput compared to sequential fetch() calls.
std::vector<rocksdb::Slice> keys;
keys.reserve(hashes.size());
for (auto const& h : hashes)
{
std::shared_ptr<NodeObject> nObj;
Status status = fetch(h, &nObj);
if (status != ok)
results.push_back({});
else
results.push_back(nObj);
keys.emplace_back(std::bit_cast<char const*>(h.data()), m_keyBytes);
}
rocksdb::ReadOptions options;
options.async_io = true; // Enable for better concurrency on supported platforms.
std::vector<std::string> values(hashes.size());
auto statuses = m_db->MultiGet(options, keys, &values);
std::vector<std::shared_ptr<NodeObject>> results(hashes.size());
for (auto i = 0; i < hashes.size(); ++i)
{
if (statuses[i].ok())
{
DecodedBlob decoded(hashes[i].data(), values[i].data(), values[i].size());
if (decoded.wasOk())
{
results[i] = decoded.createObject();
}
}
else if (!statuses[i].IsNotFound())
{
// Log other errors but continue processing.
JLOG(m_journal.warn()) << "fetchBatch: MultiGet error for key "
<< keys[i].ToString() << ": " << statuses[i].ToString();
}
}
return {results, ok};
@@ -321,10 +379,7 @@ public:
void
storeBatch(Batch const& batch) override
{
XRPL_ASSERT(
m_db,
"xrpl::NodeStore::RocksDBBackend::storeBatch : non-null "
"database");
XRPL_ASSERT(m_db, "xrpl::NodeStore::RocksDBBackend::storeBatch : non-null database");
rocksdb::WriteBatch wb;
for (auto const& e : batch)
@@ -336,7 +391,27 @@ public:
rocksdb::Slice(std::bit_cast<char const*>(encoded.getData()), encoded.getSize()));
}
rocksdb::WriteOptions const options;
// Configure WriteOptions for high throughput.
// Note: no_slowdown is intentionally NOT set here. When set to true, RocksDB returns an
// error instead of stalling when write buffers are full, which could cause write
// failures during high load. We prefer to accept brief stalls over dropped writes.
rocksdb::WriteOptions options;
// Setting `sync = false` improves write throughput significantly by allowing the OS to
// batch fsync operations, rather than forcing immediate disk synchronization on every
// write. The Write-Ahead Log (WAL) is still written on every operation, so database
// consistency is maintained across clean restarts.
//
// Note: On hard shutdown up to a few seconds of recent writes (since the last OS-initiated
// flush) may be lost from this node. However, since ledger data is replicated across
// the network, lost writes can be re-synced from peers during startup.
options.sync = false;
// Keep WAL enabled for crash recovery consistency.
options.disableWAL = false;
// Ensure RocksDB will not aggressively throttle the writes.
options.low_pri = false;
auto ret = m_db->Write(options, &wb);

View File

@@ -92,7 +92,7 @@ SetOracle::preclaim(PreclaimContext const& ctx)
return !v || *v == (*sle)[field];
};
std::int8_t adjustReserve = 0;
std::uint32_t adjustReserve = 0;
if (sle)
{
// update

View File

@@ -520,6 +520,13 @@ public:
srcParams.set("type", srcBackendType);
srcParams.set("path", node_db.path());
beast::temp_dir dest_db;
Section destParams;
destParams.set("type", destBackendType);
destParams.set("path", dest_db.path());
testcase("import into '" + destBackendType + "' from '" + srcBackendType + "'");
// Create a batch
auto batch = createPredictableBatch(numObjectsToTest, seedValue);
@@ -538,16 +545,9 @@ public:
Manager::instance().make_Database(megabytes(4), scheduler, 2, srcParams, journal_);
// Set up the destination database
beast::temp_dir dest_db;
Section destParams;
destParams.set("type", destBackendType);
destParams.set("path", dest_db.path());
std::unique_ptr<Database> dest =
Manager::instance().make_Database(megabytes(4), scheduler, 2, destParams, journal_);
testcase("import into '" + destBackendType + "' from '" + srcBackendType + "'");
// Do the import
dest->importDatabase(*src);

View File

@@ -218,12 +218,6 @@ doGetAggregatePrice(RPC::JsonContext& context)
return result;
}
// Get the ledger
std::shared_ptr<ReadView const> ledger;
result = RPC::lookupLedger(ledger, context);
if (!ledger)
return result; // LCOV_EXCL_LINE
// Collect the dataset into bimap keyed by lastUpdateTime and
// STAmount (Number is int64 and price is uint64)
Prices prices;
@@ -244,6 +238,11 @@ doGetAggregatePrice(RPC::JsonContext& context)
return result;
}
std::shared_ptr<ReadView const> ledger;
result = RPC::lookupLedger(ledger, context);
if (!ledger)
return result; // LCOV_EXCL_LINE
auto const sle = ledger->read(keylet::oracle(*account, *documentID));
iteratePriceData(context, sle, [&](STObject const& node) {
auto const& series = node.getFieldArray(sfPriceDataSeries);