NuDB: Performance improvements (RIPD-793,796):

This introduces changes in nudb to improve speed, reduce database size, and enhance correctness. The most significant change is to store hashes rather than entire keys in the key file. The output of the hash function is reduced to 48 bits and stored directly in buckets. The API is also modified to introduce a Codec parameter allowing for compression and decompression to be supported in the implementation itself rather than in callers. The data file no longer contains a salt, as the salt is applicable only to the key and log files. This allows a data file to have multiple key files with different salt values. To distinguish physical files belonging to the same logical database, a new field UID is introduced. The UID is a 64-bit random value generated once on creation and stored in all three files. Buckets are zero-filled to the end of each block; this is a security measure to prevent unintended contents of memory from being stored to disk. NuDB offers the varint integer type; this is identical to the varint described by Google. * Add varint * Add Codec template argument * Add "api" convenience traits * Store hash in buckets * istream can throw short read errors * Support std::uint8_t format in streams * Make file classes part of the public interface * Remove buffers pessimization, replace with buffer * Consolidate creation utility functions to the same header * Zero fill unused areas of buckets on disk * More coverage and improvements to the recover test * Fix file read/write to loop until all bytes processed * Add verify_fast, faster verify for large databases. The database version number is incremented to 2; older databases can no longer be opened and should be deleted.
2025-12-06 17:27:55 +00:00 · 2015-02-03 07:46:24 -08:00
parent 62c5b5e570
commit e2a5535ed6
37 changed files with 2098 additions and 1300 deletions
--- a/src/ripple/nodestore/backend/NuDBFactory.cpp
+++ b/src/ripple/nodestore/backend/NuDBFactory.cpp
@@ -24,7 +24,10 @@
 #include <ripple/nodestore/impl/DecodedBlob.h>
 #include <ripple/nodestore/impl/EncodedBlob.h>
 #include <beast/nudb.h>
+#include <beast/nudb/detail/bucket.h> // remove asap
+#include <beast/nudb/identity_codec.h>
 #include <beast/nudb/visit.h>
+#include <beast/hash/xxhasher.h>
 #include <snappy.h>
 #include <boost/filesystem.hpp>
 #include <cassert>
@@ -61,10 +64,13 @@ public:
        currentType = typeTwo
    };

+    using api = beast::nudb::api<
+        beast::xxhasher, beast::nudb::identity_codec>;
+
    beast::Journal journal_;
    size_t const keyBytes_;
    std::string const name_;
-    beast::nudb::store db_;
+    api::store db_;
    std::atomic <bool> deletePath_;
    Scheduler& scheduler_;

@@ -85,7 +91,7 @@ public:
        auto const kp = (folder / "nudb.key").string ();
        auto const lp = (folder / "nudb.log").string ();
        using beast::nudb::make_salt;
-        beast::nudb::create (dp, kp, lp,
+        api::create (dp, kp, lp,
            currentType, make_salt(), keyBytes,
                beast::nudb::block_size(kp),
            0.50);
@@ -200,22 +206,24 @@ public:
    fetch1 (void const* key,
        std::shared_ptr <NodeObject>* pno)
    {
+        Status status;
        pno->reset();
-        std::size_t bytes;
-        std::unique_ptr <std::uint8_t[]> data;
        if (! db_.fetch (key,
-            [&data, &bytes](std::size_t n)
+            [key, pno, &status](void const* data, std::size_t size)
            {
-                bytes = n;
-                data.reset(new std::uint8_t[bytes]);
-                return data.get();
+                DecodedBlob decoded (key, data, size);
+                if (! decoded.wasOk ())
+                {
+                    status = dataCorrupt;
+                    return;
+                }
+                *pno = decoded.createObject();
+                status = ok;
            }))
+        {
            return notFound;
-        DecodedBlob decoded (key, data.get(), bytes);
-        if (! decoded.wasOk ())
-            return dataCorrupt;
-        *pno = decoded.createObject();
-        return ok;
+        }
+        return status;
    }

    void
@@ -236,31 +244,35 @@ public:
    fetch2 (void const* key,
        std::shared_ptr <NodeObject>* pno)
    {
+        Status status;
        pno->reset();
-        std::size_t actual;
-        std::unique_ptr <char[]> compressed;
        if (! db_.fetch (key,
-            [&](std::size_t n)
+            [&](void const* data, std::size_t size)
            {
-                actual = n;
-                compressed.reset(
-                    new char[n]);
-                return compressed.get();
+                std::size_t actual;
+                if (! snappy::GetUncompressedLength(
+                        (char const*)data, size, &actual))
+                {
+                    status = dataCorrupt;
+                    return;
+                }
+                std::unique_ptr <char[]> buf (new char[actual]);
+                snappy::RawUncompress (
+                    (char const*)data, size, buf.get());
+                DecodedBlob decoded (key, buf.get(), actual);
+                if (! decoded.wasOk ())
+                {
+                    status = dataCorrupt;
+                    return;
+                }
+                *pno = decoded.createObject();
+                status = ok;
            }))
+        {
            return notFound;
-        std::size_t size;
-        if (! snappy::GetUncompressedLength(
-                (char const*)compressed.get(),
-                    actual, &size))
-            return dataCorrupt;
-        std::unique_ptr <char[]> data (new char[size]);
-        snappy::RawUncompress (compressed.get(),
-            actual, data.get());
-        DecodedBlob decoded (key, data.get(), size);
-        if (! decoded.wasOk ())
-            return dataCorrupt;
-        *pno = decoded.createObject();
-        return ok;
+        }
+
+        return status;
    }

    void
@@ -342,7 +354,7 @@ public:
        auto const lp = db_.log_path();
        auto const appnum = db_.appnum();
        db_.close();
-        beast::nudb::visit (dp,
+        api::visit (dp,
            [&](
                void const* key, std::size_t key_bytes,
                void const* data, std::size_t size)
@@ -399,7 +411,7 @@ public:
        auto const kp = db_.key_path();
        auto const lp = db_.log_path();
        db_.close();
-        beast::nudb::verify (dp, kp);
+        api::verify (dp, kp);
        db_.open (dp, kp, lp,
            arena_alloc_size);
    }