Compare commits

..

1 Commits

Author SHA1 Message Date
Bart
f586382622 perf: Improve IOPS when reading from and writing to NuDB and RocksDB 2026-03-17 17:56:59 -04:00
7 changed files with 306 additions and 47 deletions

View File

@@ -138,6 +138,9 @@ public:
/** Returns the number of file descriptors the backend expects to need. */
virtual int
fdRequired() const = 0;
/** The number of hardware threads to use for compression of a batch. */
static unsigned int const numHardwareThreads;
};
} // namespace NodeStore

View File

@@ -0,0 +1,18 @@
#include <xrpl/nodestore/Backend.h>
#include <algorithm>
#include <thread>
namespace xrpl {
namespace NodeStore {
// Number of hardware threads available for compressing a batch.
// std::thread::hardware_concurrency() may report 0 when the platform cannot
// determine the core count, so clamp the detected value into [1, 8]: always
// at least one worker, and never so many threads that they contend.
unsigned int const Backend::numHardwareThreads = []() {
    unsigned int const detected = std::thread::hardware_concurrency();
    return std::clamp(detected, 1u, 8u);
}();
} // namespace NodeStore
} // namespace xrpl

View File

@@ -7,15 +7,21 @@
#include <xrpl/nodestore/detail/EncodedBlob.h>
#include <xrpl/nodestore/detail/codec.h>
#include <boost/asio/post.hpp>
#include <boost/asio/thread_pool.hpp>
#include <boost/filesystem.hpp>
#include <nudb/nudb.hpp>
#include <atomic>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <exception>
#include <latch>
#include <memory>
#include <thread>
#include <vector>
namespace xrpl {
namespace NodeStore {
@@ -37,6 +43,7 @@ public:
nudb::store db_;
std::atomic<bool> deletePath_;
Scheduler& scheduler_;
boost::asio::thread_pool threadPool_;
NuDBBackend(
size_t keyBytes,
@@ -51,6 +58,7 @@ public:
, blockSize_(parseBlockSize(name_, keyValues, journal))
, deletePath_(false)
, scheduler_(scheduler)
, threadPool_(numHardwareThreads)
{
if (name_.empty())
Throw<std::runtime_error>("nodestore: Missing path in NuDB backend");
@@ -71,6 +79,7 @@ public:
, db_(context)
, deletePath_(false)
, scheduler_(scheduler)
, threadPool_(numHardwareThreads)
{
if (name_.empty())
Throw<std::runtime_error>("nodestore: Missing path in NuDB backend");
@@ -184,6 +193,7 @@ public:
Status status = ok;
pno->reset();
nudb::error_code ec;
db_.fetch(
hash.data(),
[&hash, pno, &status](void const* data, std::size_t size) {
@@ -199,6 +209,7 @@ public:
status = ok;
},
ec);
if (ec == nudb::error::key_not_found)
return notFound;
if (ec)
@@ -209,18 +220,62 @@ public:
std::pair<std::vector<std::shared_ptr<NodeObject>>, Status>
fetchBatch(std::vector<uint256> const& hashes) override
{
std::vector<std::shared_ptr<NodeObject>> results;
results.reserve(hashes.size());
for (auto const& h : hashes)
std::vector<std::shared_ptr<NodeObject>> results(hashes.size());
// Determine the number of threads to use for fetching data from the number of available
// cores and the size of the batch. We would like each thread to process at least 4 items;
// only the last thread may end up processing fewer.
auto const numThreads = std::min(
std::max(static_cast<unsigned int>(hashes.size()) / 4u, 1u), numHardwareThreads);
// If we need only one thread, just do it sequentially.
if (numThreads == 1u)
{
std::shared_ptr<NodeObject> nObj;
Status status = fetch(h, &nObj);
if (status != ok)
results.push_back({});
else
results.push_back(nObj);
for (size_t i = 0; i < hashes.size(); ++i)
{
std::shared_ptr<NodeObject> nObj;
if (fetch(hashes[i], &nObj) == ok)
results[i] = nObj;
}
return {results, ok};
}
// Use a latch to synchronize task completion.
std::latch taskCompletion(numThreads);
// Submit fetch tasks to the thread pool.
auto const itemsPerThread = (hashes.size() + numThreads - 1) / numThreads;
for (unsigned int t = 0; t < numThreads; ++t)
{
auto const startIdx = t * itemsPerThread;
XRPL_ASSERT(
startIdx < hashes.size(),
"xrpl::NuDBFactory::fetchBatch : startIdx < hashes.size()");
if (startIdx >= hashes.size())
{
taskCompletion.count_down();
continue;
}
auto const endIdx = std::min(startIdx + itemsPerThread, hashes.size());
auto task = [this, &hashes, &results, &taskCompletion, startIdx, endIdx]() {
// Fetch the items assigned to this task.
for (size_t i = startIdx; i < endIdx; ++i)
{
std::shared_ptr<NodeObject> nObj;
if (fetch(hashes[i], &nObj) == ok)
results[i] = nObj;
}
// Signal task completion.
taskCompletion.count_down();
};
boost::asio::post(threadPool_, std::move(task));
}
// Wait for all fetch tasks to complete.
taskCompletion.wait();
return {results, ok};
}
@@ -228,9 +283,11 @@ public:
do_insert(std::shared_ptr<NodeObject> const& no)
{
EncodedBlob e(no);
nudb::error_code ec;
nudb::detail::buffer bf;
auto const result = nodeobject_compress(e.getData(), e.getSize(), bf);
nudb::error_code ec;
db_.insert(e.getKey(), result.first, result.second, ec);
if (ec && ec != nudb::error::key_exists)
Throw<nudb::system_error>(ec);
@@ -242,7 +299,11 @@ public:
BatchWriteReport report{};
report.writeCount = 1;
auto const start = std::chrono::steady_clock::now();
++pendingWrites_;
do_insert(no);
--pendingWrites_;
report.elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::steady_clock::now() - start);
scheduler_.onBatchWrite(report);
@@ -254,8 +315,109 @@ public:
BatchWriteReport report{};
report.writeCount = batch.size();
auto const start = std::chrono::steady_clock::now();
for (auto const& e : batch)
do_insert(e);
pendingWrites_ += static_cast<int>(batch.size());
// Determine the number of threads to use for data compression from the number of available
// cores and the size of the batch. We would like each thread to process at least 4 items;
// only the last thread may end up processing fewer.
auto const numThreads = std::min(
std::max(static_cast<unsigned int>(batch.size()) / 4u, 1u), numHardwareThreads);
// If we need only one thread, just do it sequentially.
if (numThreads == 1u)
{
for (auto const& e : batch)
do_insert(e);
pendingWrites_ -= static_cast<int>(batch.size());
report.elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::steady_clock::now() - start);
scheduler_.onBatchWrite(report);
return;
}
// Helper struct that stores actual item data, not pointers, to avoid dangling references
// after EncodedBlob and buffer go out of scope in the thread.
struct CompressedData
{
std::vector<std::uint8_t> key;
std::vector<std::uint8_t> data;
std::exception_ptr eptr;
};
std::vector<CompressedData> compressed(batch.size());
// Use a latch to synchronize task completion.
std::latch taskCompletion(numThreads);
// Submit compression tasks to the thread pool.
auto const itemsPerThread = (batch.size() + numThreads - 1) / numThreads;
for (unsigned int t = 0; t < numThreads; ++t)
{
auto const startIdx = t * itemsPerThread;
XRPL_ASSERT(
startIdx < batch.size(), "xrpl::NuDBFactory::storeBatch : startIdx < batch.size()");
if (startIdx >= batch.size())
{
taskCompletion.count_down();
continue;
}
auto const endIdx = std::min(startIdx + itemsPerThread, batch.size());
auto task =
[&batch, &compressed, &taskCompletion, startIdx, endIdx, keyBytes = keyBytes_]() {
// Compress the items assigned to this task.
for (size_t i = startIdx; i < endIdx; ++i)
{
auto& item = compressed[i];
try
{
EncodedBlob e(batch[i]);
// Copy the key data to avoid dangling pointer.
auto const* keyPtr = static_cast<std::uint8_t const*>(e.getKey());
item.key.assign(keyPtr, keyPtr + keyBytes);
// Compress and copy the data to avoid dangling pointer.
nudb::detail::buffer bf;
auto const comp = nodeobject_compress(e.getData(), e.getSize(), bf);
auto const* dataPtr = static_cast<std::uint8_t const*>(comp.first);
item.data.assign(dataPtr, dataPtr + comp.second);
}
catch (...)
{
// Store the exception so it can be rethrown in the sequential phase
// below.
item.eptr = std::current_exception();
}
}
// Signal task completion.
taskCompletion.count_down();
};
boost::asio::post(threadPool_, std::move(task));
}
// Wait for all compression tasks to complete.
taskCompletion.wait();
// Insert the compressed data sequentially, since NuDB is designed as an append-only data
// store that only supports one writer.
for (auto const& item : compressed)
{
if (item.eptr)
{
std::rethrow_exception(item.eptr);
}
nudb::error_code ec;
db_.insert(item.key.data(), item.data.data(), item.data.size(), ec);
if (ec && ec != nudb::error::key_exists)
Throw<nudb::system_error>(ec);
}
pendingWrites_ -= static_cast<int>(batch.size());
report.elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::steady_clock::now() - start);
scheduler_.onBatchWrite(report);
@@ -272,7 +434,7 @@ public:
auto const dp = db_.dat_path();
auto const kp = db_.key_path();
auto const lp = db_.log_path();
// auto const appnum = db_.appnum();
nudb::error_code ec;
db_.close(ec);
if (ec)
@@ -306,7 +468,7 @@ public:
int
getWriteLoad() override
{
return 0;
return pendingWrites_.load();
}
void
@@ -341,6 +503,8 @@ public:
}
private:
std::atomic<int> pendingWrites_{0};
static std::size_t
parseBlockSize(std::string const& name, Section const& keyValues, beast::Journal journal)
{

View File

@@ -13,6 +13,7 @@
#include <atomic>
#include <memory>
#include <thread>
namespace xrpl {
namespace NodeStore {
@@ -181,6 +182,41 @@ public:
std::string("Unable to set RocksDB options: ") + s.ToString());
}
// Enable pipelined writes for better write concurrency.
m_options.enable_pipelined_write = true;
// Set background job parallelism for better compaction/flush performance to the number of
// hardware threads, unless the value is explicitly provided in the config. The default is
// 2 (see include/rocksdb/options.h in the Conan dependency directory), so don't use fewer
// than that.
if (auto v = get<unsigned int>(keyValues, "max_background_jobs", 0); v > 2)
{
m_options.max_background_jobs = v;
}
else if (v = numHardwareThreads; v > 2)
{
m_options.max_background_jobs = v;
}
// Set subcompactions for parallel compaction within a job to the number of hardware
// threads, unless the value is explicitly provided in the config. The default is 1 (see
// include/rocksdb/options.h in the Conan dependency directory), so don't use fewer
// than that if no value is explicitly provided.
if (auto v = get<unsigned int>(keyValues, "max_subcompactions", 0); v > 1)
{
m_options.max_subcompactions = v;
}
else if (v = numHardwareThreads / 2; v > 1)
{
m_options.max_subcompactions = v;
}
// Enable direct I/O by default unless explicitly disabled in the config. This bypasses the
// OS page cache for more predictable performance on SSDs.
m_options.use_direct_reads = get<bool>(keyValues, "use_direct_io", true);
m_options.use_direct_io_for_flush_and_compaction =
get<bool>(keyValues, "use_direct_io", true);
std::string s1, s2;
rocksdb::GetStringFromDBOptions(&s1, m_options, "; ");
rocksdb::GetStringFromColumnFamilyOptions(&s2, m_options, "; ");
@@ -253,23 +289,19 @@ public:
rocksdb::ReadOptions const options;
rocksdb::Slice const slice(std::bit_cast<char const*>(hash.data()), m_keyBytes);
std::string string;
rocksdb::Status getStatus = m_db->Get(options, slice, &string);
if (getStatus.ok())
{
DecodedBlob decoded(hash.data(), string.data(), string.size());
if (decoded.wasOk())
{
*pObject = decoded.createObject();
}
else
{
// Decoding failed, probably corrupted!
//
// Decoding failed, probably corrupted.
status = dataCorrupt;
}
}
@@ -286,7 +318,6 @@ public:
else
{
status = Status(customCode + unsafe_cast<int>(getStatus.code()));
JLOG(m_journal.error()) << getStatus.ToString();
}
}
@@ -297,16 +328,43 @@ public:
std::pair<std::vector<std::shared_ptr<NodeObject>>, Status>
fetchBatch(std::vector<uint256> const& hashes) override
{
std::vector<std::shared_ptr<NodeObject>> results;
results.reserve(hashes.size());
XRPL_ASSERT(m_db, "xrpl::NodeStore::RocksDBBackend::fetchBatch : non-null database");
if (hashes.empty())
return {{}, ok};
// Use MultiGet for parallel reads to allow RocksDB to fetch multiple keys concurrently,
// significantly improving throughput compared to sequential fetch() calls.
std::vector<rocksdb::Slice> keys;
keys.reserve(hashes.size());
for (auto const& h : hashes)
{
std::shared_ptr<NodeObject> nObj;
Status status = fetch(h, &nObj);
if (status != ok)
results.push_back({});
else
results.push_back(nObj);
keys.emplace_back(std::bit_cast<char const*>(h.data()), m_keyBytes);
}
rocksdb::ReadOptions options;
options.async_io = true; // Enable for better concurrency on supported platforms.
std::vector<std::string> values(hashes.size());
auto statuses = m_db->MultiGet(options, keys, &values);
std::vector<std::shared_ptr<NodeObject>> results(hashes.size());
for (auto i = 0; i < hashes.size(); ++i)
{
if (statuses[i].ok())
{
DecodedBlob decoded(hashes[i].data(), values[i].data(), values[i].size());
if (decoded.wasOk())
{
results[i] = decoded.createObject();
}
}
else if (!statuses[i].IsNotFound())
{
// Log other errors but continue processing.
JLOG(m_journal.warn()) << "fetchBatch: MultiGet error for key "
<< keys[i].ToString() << ": " << statuses[i].ToString();
}
}
return {results, ok};
@@ -321,10 +379,7 @@ public:
void
storeBatch(Batch const& batch) override
{
XRPL_ASSERT(
m_db,
"xrpl::NodeStore::RocksDBBackend::storeBatch : non-null "
"database");
XRPL_ASSERT(m_db, "xrpl::NodeStore::RocksDBBackend::storeBatch : non-null database");
rocksdb::WriteBatch wb;
for (auto const& e : batch)
@@ -336,7 +391,27 @@ public:
rocksdb::Slice(std::bit_cast<char const*>(encoded.getData()), encoded.getSize()));
}
rocksdb::WriteOptions const options;
// Configure WriteOptions for high throughput.
// Note: no_slowdown is intentionally NOT set here. When set to true, RocksDB returns an
// error instead of stalling when write buffers are full, which could cause write
// failures during high load. We prefer to accept brief stalls over dropped writes.
rocksdb::WriteOptions options;
// Setting `sync = false` improves write throughput significantly by allowing the OS to
// batch fsync operations, rather than forcing immediate disk synchronization on every
// write. The Write-Ahead Log (WAL) is still written on every operation, so database
// consistency is maintained across clean restarts.
//
// Note: On hard shutdown up to a few seconds of recent writes (since the last OS-initiated
// flush) may be lost from this node. However, since ledger data is replicated across
// the network, lost writes can be re-synced from peers during startup.
options.sync = false;
// Keep WAL enabled for crash recovery consistency.
options.disableWAL = false;
// Ensure RocksDB will not aggressively throttle the writes.
options.low_pri = false;
auto ret = m_db->Write(options, &wb);

View File

@@ -92,7 +92,7 @@ SetOracle::preclaim(PreclaimContext const& ctx)
return !v || *v == (*sle)[field];
};
std::int8_t adjustReserve = 0;
std::uint32_t adjustReserve = 0;
if (sle)
{
// update

View File

@@ -520,6 +520,13 @@ public:
srcParams.set("type", srcBackendType);
srcParams.set("path", node_db.path());
beast::temp_dir dest_db;
Section destParams;
destParams.set("type", destBackendType);
destParams.set("path", dest_db.path());
testcase("import into '" + destBackendType + "' from '" + srcBackendType + "'");
// Create a batch
auto batch = createPredictableBatch(numObjectsToTest, seedValue);
@@ -538,16 +545,9 @@ public:
Manager::instance().make_Database(megabytes(4), scheduler, 2, srcParams, journal_);
// Set up the destination database
beast::temp_dir dest_db;
Section destParams;
destParams.set("type", destBackendType);
destParams.set("path", dest_db.path());
std::unique_ptr<Database> dest =
Manager::instance().make_Database(megabytes(4), scheduler, 2, destParams, journal_);
testcase("import into '" + destBackendType + "' from '" + srcBackendType + "'");
// Do the import
dest->importDatabase(*src);

View File

@@ -218,12 +218,6 @@ doGetAggregatePrice(RPC::JsonContext& context)
return result;
}
// Get the ledger
std::shared_ptr<ReadView const> ledger;
result = RPC::lookupLedger(ledger, context);
if (!ledger)
return result; // LCOV_EXCL_LINE
// Collect the dataset into bimap keyed by lastUpdateTime and
// STAmount (Number is int64 and price is uint64)
Prices prices;
@@ -244,6 +238,11 @@ doGetAggregatePrice(RPC::JsonContext& context)
return result;
}
std::shared_ptr<ReadView const> ledger;
result = RPC::lookupLedger(ledger, context);
if (!ledger)
return result; // LCOV_EXCL_LINE
auto const sle = ledger->read(keylet::oracle(*account, *documentID));
iteratePriceData(context, sle, [&](STObject const& node) {
auto const& series = node.getFieldArray(sfPriceDataSeries);