mirror of
https://github.com/Xahau/xahaud.git
synced 2025-11-04 18:55:49 +00:00
Compare commits
4 Commits
memory-tes
...
snugdb
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9320f0fddc | ||
|
|
b0a7783dbc | ||
|
|
224e78ac81 | ||
|
|
fbfd8c1e0a |
@@ -542,6 +542,7 @@ target_sources (rippled PRIVATE
|
||||
src/ripple/nodestore/backend/NuDBFactory.cpp
|
||||
src/ripple/nodestore/backend/NullFactory.cpp
|
||||
src/ripple/nodestore/backend/RocksDBFactory.cpp
|
||||
src/ripple/nodestore/backend/SnugDBFactory.cpp
|
||||
src/ripple/nodestore/impl/BatchWriter.cpp
|
||||
src/ripple/nodestore/impl/Database.cpp
|
||||
src/ripple/nodestore/impl/DatabaseNodeImp.cpp
|
||||
|
||||
336
src/ripple/nodestore/backend/SnugDBFactory.cpp
Normal file
336
src/ripple/nodestore/backend/SnugDBFactory.cpp
Normal file
@@ -0,0 +1,336 @@
|
||||
//------------------------------------------------------------------------------
|
||||
/*
|
||||
This file is part of rippled: https://github.com/ripple/rippled
|
||||
Copyright (c) 2012, 2013 Ripple Labs Inc.
|
||||
|
||||
Permission to use, copy, modify, and/or distribute this software for any
|
||||
purpose with or without fee is hereby granted, provided that the above
|
||||
copyright notice and this permission notice appear in all copies.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||
ANY SPECIAL , DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
*/
|
||||
//==============================================================================
|
||||
|
||||
#include <ripple/basics/contract.h>
|
||||
#include <ripple/nodestore/Factory.h>
|
||||
#include <ripple/nodestore/Manager.h>
|
||||
#include <ripple/nodestore/impl/DecodedBlob.h>
|
||||
#include <ripple/nodestore/impl/EncodedBlob.h>
|
||||
#include <ripple/nodestore/impl/codec.h>
|
||||
#include <boost/filesystem.hpp>
|
||||
#include "snug.hpp"
|
||||
#include <cassert>
|
||||
#include <chrono>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <exception>
|
||||
#include <memory>
|
||||
|
||||
namespace ripple {
|
||||
namespace NodeStore {
|
||||
|
||||
class SnugDBBackend : public Backend
|
||||
{
|
||||
private:
|
||||
static constexpr uint64_t BUFFER_SIZE =
|
||||
256ULL * 1024ULL * 1024ULL; // 256 Mib read buffer per thread
|
||||
public:
|
||||
beast::Journal const j_;
|
||||
std::string const name_;
|
||||
std::unique_ptr<snug::SnugDB> db_;
|
||||
Scheduler& scheduler_;
|
||||
|
||||
SnugDBBackend(
|
||||
Section const& keyValues,
|
||||
Scheduler& scheduler,
|
||||
beast::Journal journal)
|
||||
: j_(journal), name_(get(keyValues, "path")), scheduler_(scheduler)
|
||||
{
|
||||
if (name_.empty())
|
||||
throw std::runtime_error(
|
||||
"nodestore: Missing path in SnugDB backend");
|
||||
}
|
||||
|
||||
~SnugDBBackend() override
|
||||
{
|
||||
try
|
||||
{
|
||||
// close can throw and we don't want the destructor to throw.
|
||||
db_ = nullptr;
|
||||
}
|
||||
catch (std::exception const& e)
|
||||
{
|
||||
JLOG(j_.warn()) << "SnugDB threw on destruction: " << e.what();
|
||||
// Don't allow exceptions to propagate out of destructors.
|
||||
}
|
||||
}
|
||||
|
||||
std::string
|
||||
getName() override
|
||||
{
|
||||
return name_;
|
||||
}
|
||||
|
||||
void
|
||||
open(bool createIfMissing, uint64_t appType, uint64_t uid, uint64_t salt)
|
||||
override
|
||||
{
|
||||
if (db_)
|
||||
{
|
||||
assert(false);
|
||||
JLOG(j_.error()) << "database is already open";
|
||||
return;
|
||||
}
|
||||
|
||||
std::string path = name_ + "/" + std::to_string(uid) + "-" +
|
||||
std::to_string(appType) + "-" + std::to_string(salt);
|
||||
|
||||
boost::filesystem::create_directories(path);
|
||||
db_ = std::make_unique<snug::SnugDB>(path);
|
||||
}
|
||||
|
||||
bool
|
||||
isOpen() override
|
||||
{
|
||||
return db_ != nullptr;
|
||||
}
|
||||
|
||||
void
|
||||
open(bool createIfMissing) override
|
||||
{
|
||||
open(createIfMissing, 0, 0, 0);
|
||||
}
|
||||
|
||||
void
|
||||
close() override
|
||||
{
|
||||
db_ = nullptr;
|
||||
}
|
||||
|
||||
Status
|
||||
fetch(void const* key, std::shared_ptr<NodeObject>* pno) override
|
||||
{
|
||||
if (!db_)
|
||||
return backendError;
|
||||
|
||||
pno->reset();
|
||||
|
||||
static thread_local std::unique_ptr<uint8_t[]> thread_buffer =
|
||||
std::make_unique<uint8_t[]>(BUFFER_SIZE);
|
||||
|
||||
uint8_t* ptr = &(thread_buffer[0]);
|
||||
|
||||
uint64_t len = BUFFER_SIZE;
|
||||
int result = db_->read_entry(
|
||||
static_cast<uint8_t*>(const_cast<void*>(key)), ptr, &len);
|
||||
|
||||
if (0)
|
||||
{
|
||||
std::stringstream ss;
|
||||
const unsigned char* bytes = static_cast<const unsigned char*>(key);
|
||||
for (int i = 0; i < 32; ++i)
|
||||
{
|
||||
ss << std::setfill('0') << std::setw(2) << std::hex
|
||||
<< static_cast<int>(bytes[i]);
|
||||
}
|
||||
std::string key_hex = ss.str();
|
||||
|
||||
// Print the result using printf
|
||||
printf(
|
||||
"snug fetch: len=%zu result=%zu key=%s\n",
|
||||
len,
|
||||
result,
|
||||
key_hex.c_str());
|
||||
}
|
||||
|
||||
if (result == 1)
|
||||
return notFound;
|
||||
|
||||
if (result == 0)
|
||||
{
|
||||
DecodedBlob decoded(key, ptr, len);
|
||||
if (!decoded.wasOk())
|
||||
return dataCorrupt;
|
||||
|
||||
*pno = decoded.createObject();
|
||||
return ok;
|
||||
}
|
||||
|
||||
return backendError;
|
||||
}
|
||||
|
||||
std::pair<std::vector<std::shared_ptr<NodeObject>>, Status>
|
||||
fetchBatch(std::vector<uint256 const*> const& hashes) override
|
||||
{
|
||||
std::vector<std::shared_ptr<NodeObject>> results;
|
||||
results.reserve(hashes.size());
|
||||
for (auto const& h : hashes)
|
||||
{
|
||||
std::shared_ptr<NodeObject> nObj;
|
||||
Status status = fetch(h->begin(), &nObj);
|
||||
if (status != ok)
|
||||
results.push_back({});
|
||||
else
|
||||
results.push_back(nObj);
|
||||
}
|
||||
|
||||
return {results, ok};
|
||||
}
|
||||
|
||||
void
|
||||
do_insert(std::shared_ptr<NodeObject> const& no)
|
||||
{
|
||||
EncodedBlob e(no);
|
||||
|
||||
if (0)
|
||||
{
|
||||
std::stringstream ss;
|
||||
const unsigned char* bytes = static_cast<const unsigned char*>(
|
||||
const_cast<void*>(e.getKey()));
|
||||
for (int i = 0; i < 32; ++i)
|
||||
ss << std::setfill('0') << std::setw(2) << std::hex
|
||||
<< static_cast<int>(bytes[i]);
|
||||
std::string key_hex = ss.str();
|
||||
|
||||
std::cout << "snugdb write: len=" << e.getSize()
|
||||
<< ", key=" << key_hex << "\n";
|
||||
}
|
||||
int out = db_->write_entry(
|
||||
static_cast<uint8_t*>(const_cast<void*>(e.getKey())),
|
||||
static_cast<uint8_t*>(const_cast<void*>(e.getData())),
|
||||
e.getSize());
|
||||
if (out != 0)
|
||||
throw std::runtime_error(
|
||||
"SnugDB could not write entry. Disk full? error" +
|
||||
std::to_string(out));
|
||||
}
|
||||
|
||||
void
|
||||
store(std::shared_ptr<NodeObject> const& no) override
|
||||
{
|
||||
BatchWriteReport report;
|
||||
report.writeCount = 1;
|
||||
auto const start = std::chrono::steady_clock::now();
|
||||
do_insert(no);
|
||||
report.elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(
|
||||
std::chrono::steady_clock::now() - start);
|
||||
scheduler_.onBatchWrite(report);
|
||||
}
|
||||
|
||||
void
|
||||
storeBatch(Batch const& batch) override
|
||||
{
|
||||
BatchWriteReport report;
|
||||
report.writeCount = batch.size();
|
||||
auto const start = std::chrono::steady_clock::now();
|
||||
for (auto const& e : batch)
|
||||
do_insert(e);
|
||||
report.elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(
|
||||
std::chrono::steady_clock::now() - start);
|
||||
scheduler_.onBatchWrite(report);
|
||||
}
|
||||
|
||||
void
|
||||
sync() override
|
||||
{
|
||||
}
|
||||
|
||||
void
|
||||
for_each(std::function<void(std::shared_ptr<NodeObject>)> f) override
|
||||
{
|
||||
db_->visit_all(
|
||||
[](uint8_t* key, uint8_t* data, uint64_t len, void* fp) -> void {
|
||||
DecodedBlob decoded(key, data, len);
|
||||
if (!decoded.wasOk())
|
||||
{
|
||||
throw std::runtime_error(
|
||||
"Missing or corrupted data in snugdb");
|
||||
return;
|
||||
}
|
||||
|
||||
std::function<void(std::shared_ptr<NodeObject>)> f =
|
||||
*(reinterpret_cast<
|
||||
std::function<void(std::shared_ptr<NodeObject>)>*>(fp));
|
||||
f(decoded.createObject());
|
||||
},
|
||||
reinterpret_cast<void*>(&f));
|
||||
}
|
||||
|
||||
int
|
||||
getWriteLoad() override
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
void
|
||||
setDeletePath() override
|
||||
{
|
||||
}
|
||||
|
||||
void
|
||||
verify() override
|
||||
{
|
||||
}
|
||||
|
||||
int
|
||||
fdRequired() const override
|
||||
{
|
||||
return 3;
|
||||
}
|
||||
};
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
// Factory that makes the "SnugDB" backend type selectable from the
// nodestore configuration. Registration with the Manager happens at
// static-initialization time through the file-scope instance below.
class SnugDBFactory : public Factory
{
public:
    SnugDBFactory()
    {
        Manager::instance().insert(*this);
    }

    ~SnugDBFactory() override
    {
        Manager::instance().erase(*this);
    }

    // Name matched (case-insensitively by the Manager) against the
    // configured backend "type".
    std::string
    getName() const override
    {
        return "SnugDB";
    }

    // keyBytes and burstSize are part of the Factory interface but are not
    // used by this backend; SnugDB reads only "path" from keyValues.
    std::unique_ptr<Backend>
    createInstance(
        size_t keyBytes,
        Section const& keyValues,
        std::size_t burstSize,
        Scheduler& scheduler,
        beast::Journal journal) override
    {
        return std::make_unique<SnugDBBackend>(keyValues, scheduler, journal);
    }

    // Overload taking a nudb::context; the context is ignored since SnugDB
    // does not use NuDB.
    std::unique_ptr<Backend>
    createInstance(
        size_t keyBytes,
        Section const& keyValues,
        std::size_t burstSize,
        Scheduler& scheduler,
        nudb::context& context,
        beast::Journal journal) override
    {
        return std::make_unique<SnugDBBackend>(keyValues, scheduler, journal);
    }
};
|
||||
|
||||
// File-scope instance: its constructor registers the factory with the
// Manager during static initialization, and its destructor deregisters it.
static SnugDBFactory snugDBFactory;
|
||||
|
||||
} // namespace NodeStore
|
||||
} // namespace ripple
|
||||
741
src/ripple/nodestore/backend/snug.hpp
Normal file
741
src/ripple/nodestore/backend/snug.hpp
Normal file
@@ -0,0 +1,741 @@
|
||||
#pragma once

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

#include <algorithm>
#include <array>
#include <cerrno>
#include <memory>
#include <mutex>
#include <shared_mutex>
#include <stdexcept>
#include <string>
#include <vector>
|
||||
|
||||
#define MMAPFLAGS (PROT_READ | PROT_WRITE | MAP_NORESERVE), MAP_SHARED
|
||||
namespace snug {
|
||||
|
||||
// qsort comparator for 1 KiB bucket slots whose first 32 bytes are the key.
// Orders slots by DESCENDING key so that all-zero (empty / deleted) slots
// sink to the end of the bucket. Keys are compared as four native-endian
// uint64_t words: this yields a consistent total order (which is all the
// bucket scan requires), though it is not byte-wise memcmp order on
// little-endian hosts.
//
// Declared `inline` because this definition lives in a header that may be
// included from multiple translation units; without it, each inclusion
// would emit an external definition and violate the one-definition rule.
inline int
compare_entries_reverse(const void* a, const void* b)
{
    const uint64_t* a_key = static_cast<const uint64_t*>(a);
    const uint64_t* b_key = static_cast<const uint64_t*>(b);

    // Unrolled comparison of 4 uint64_t values (4 * 8 = 32 bytes)
    if (b_key[0] > a_key[0])
        return 1;
    if (b_key[0] < a_key[0])
        return -1;

    if (b_key[1] > a_key[1])
        return 1;
    if (b_key[1] < a_key[1])
        return -1;

    if (b_key[2] > a_key[2])
        return 1;
    if (b_key[2] < a_key[2])
        return -1;

    if (b_key[3] > a_key[3])
        return 1;
    if (b_key[3] < a_key[3])
        return -1;

    return 0;  // Keys are equal
}
|
||||
|
||||
// SnugDB: a minimal embedded store for 32-byte keys, built directly on
// sparse mmap()ed files.
//
// Layout (as implemented below):
//  - Data lives in files "snug.0", "snug.1", ... under `path`, each a
//    sparse 256 GiB file (SNUGSIZE) divided into 2^20 buckets
//    (BUCKET_COUNT) of 256 KiB: 256 slots x 1024 bytes.
//  - A slot is: key (bytes 0-31), flags (bytes 32-39: high 32 bits = first
//    overflow block offset, low 32 bits = value length), then up to 984
//    bytes of inline value (bytes 40-1023).
//  - Values longer than 984 bytes spill into "snug.big", a sparse 10 TiB
//    (BIGSIZE) file of chained 32 KiB blocks.
//  - The bucket index is the top 20 bits of the key (see OFFSET). When a
//    bucket is full in every mapped file, write_entry allocates a new
//    snug.N file and retries.
//
// Concurrency: one shared_mutex per bucket index (the same index guards
// that bucket across all data files), a mutex for the big-file allocator,
// and a mutex serializing the addition of new data files.
class SnugDB
{
private:
    static constexpr uint64_t SNUGSIZE =
        256ull * 1024ull * 1024ull * 1024ull;  // 256 GiB
    static constexpr uint64_t BIGSIZE =
        10ull * 1024ull * 1024ull * 1024ull * 1024ull;  // 10 TiB

    // 2^20 buckets: addressed by key[0], key[1] and the high nibble of
    // key[2] (see the OFFSET macro below).
    static constexpr size_t BUCKET_COUNT = 1048576;

    // One reader/writer lock per bucket index, shared across data files.
    std::unique_ptr<std::shared_mutex[]> mutexes =
        std::make_unique<std::shared_mutex[]>(BUCKET_COUNT);

    // each file snug.0 snug.1 ... is mmaped and the pointer stored here;
    // hard limit of 1024 data files.
    uint8_t* mapped_files[1024];
    uint64_t mapped_files_count{0};

    // mmap of "snug.big": the overflow file of chained 32 KiB (32768-byte)
    // blocks used for values larger than the 984-byte inline area.
    uint8_t* big_file;

    std::mutex big_file_mutex;  // locked when incrementing the "next new block"
                                // pointer or touching the free list

    // only used when adding a new file
    std::mutex mapped_files_count_mutex;

    std::string const path;

    // Creates `fn` as a sparse file of `size` + 1 bytes by seeking past the
    // end and writing a single byte.
    // 0 = success
    // 1 = could not open
    // 2 = could not seek
    // 3 = could not write at end of file
    int
    alloc_file(char const* fn, uint64_t size)
    {
        int fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC, 0644);
        if (fd < 0)
            return 1;

        // must be a multiple of bufsize

        if (lseek(fd, size, SEEK_SET) == -1)
        {
            close(fd);
            unlink(fn);
            return 2;
        }

        if (write(fd, "", 1) != 1)
        {
            close(fd);
            unlink(fn);
            return 3;
        }

        close(fd);
        return 0;
    }

    // 0 = file exists and is right size (size + 1, matching alloc_file's
    // trailing byte); 1 = missing; 2 = wrong size.
    int
    check_file(char const* fn, uint64_t size)
    {
        struct stat st;
        int file_exists = (stat(fn, &st) == 0);

        if (!file_exists)
            return 1;

        if (st.st_size != size + 1)
            return 2;

        return 0;
    }

// Byte offset of a key's bucket within a data file: 20 bits of bucket
// index (byte0:8, byte1:8, low nibble of byte2:4 — callers pass
// key[2] >> 4, i.e. the key's high nibble) shifted by 18 (256 KiB bucket).
#define OFFSET(byte0, byte1, byte2) \
    (((((uint64_t)(byte0 & 0xFFU)) << 12) + \
      (((uint64_t)(byte1 & 0xFFU)) << 4) + ((uint64_t)(byte2 & 0xFU))) \
     << 18)

// check if 32 bytes are 0, which they will be for a zero entry
#define IS_ZERO_ENTRY(x) \
    (*((uint64_t*)((x) + 0)) == 0 && *((uint64_t*)((x) + 8)) == 0 && \
     *((uint64_t*)((x) + 16)) == 0 && *((uint64_t*)((x) + 24)) == 0)

// true when the 32-byte key at x equals the 32-byte key at y
#define IS_ENTRY(x, y) \
    (*((uint64_t*)((x) + 0)) == *((uint64_t*)((y) + 0)) && \
     *((uint64_t*)((x) + 8)) == *((uint64_t*)((y) + 8)) && \
     *((uint64_t*)((x) + 16)) == *((uint64_t*)((y) + 16)) && \
     *((uint64_t*)((x) + 24)) == *((uint64_t*)((y) + 24)))

// copy a 32-byte key from y to x and store the 8-byte flags word after it
#define WRITE_KEY(x /* dst */, y /* src */, flags) \
    { \
        *((uint64_t*)((x) + 0)) = *((uint64_t*)((y) + 0)); \
        *((uint64_t*)((x) + 8)) = *((uint64_t*)((y) + 8)); \
        *((uint64_t*)((x) + 16)) = *((uint64_t*)((y) + 16)); \
        *((uint64_t*)((x) + 24)) = *((uint64_t*)((y) + 24)); \
        *((uint64_t*)((x) + 32)) = flags; \
    }

    // if an entry exceeds 984 bytes then the overflow is written
    // into the snug.big file in a linked list of 32kib blocks
    // the first of those blocks is a control block

    // Pops a block from the free list, or bumps the high-water mark when
    // the free list is empty. Returns the block's byte offset within
    // big_file, or 0 on exhaustion (offset 0 is the control block, so it
    // can double as the failure sentinel).
    uint64_t
    get_big_block()
    {
        std::unique_lock<std::mutex> lock(big_file_mutex);

        uint64_t free_blocks = *((uint64_t*)(big_file + 8));
        if (free_blocks == 0)
        {
            // no free blocks, allocate a new one
            uint64_t next_block = *((uint64_t*)big_file);
            // special edge case, first block ever allocated:
            if (!next_block)
                next_block += 32768;

            *((uint64_t*)(big_file)) = next_block + 32768;

            if (next_block + 32768 > BIGSIZE)
                return 0;

            return next_block;
        }

        // grab the nth one
        uint8_t* offset = big_file + 16 + 8 * (free_blocks - 1);

        // decrement free block counter
        *(uint64_t*)(big_file + 8) -= 1;

        return *((uint64_t*)offset);
    }

    // Walks a block chain starting at next_block, pushing each block onto
    // the control block's free list and clearing its next-pointer.
    // NOTE(review): the free list is capped at 4095 entries — blocks past
    // the cap have their chain abandoned (leaked) by the early break; and
    // big_ptr below is computed but never used.
    void
    unalloc_blocks(uint64_t next_block)
    {
        if (next_block != 0)
        {
            // scope the lock only if called with non-zero nextblock
            std::unique_lock<std::mutex> lock(big_file_mutex);
            do
            {
                uint64_t free_blocks = *((uint64_t*)(big_file + 8));

                if (free_blocks >= 4095)
                    break;

                uint8_t* offset = big_file + 16 + 8 * free_blocks;

                *((uint64_t*)offset) = next_block;

                *((uint64_t*)(big_file + 8)) += 1;

                uint8_t* big_ptr = big_file + next_block;
                uint64_t previous = next_block;
                next_block = *((uint64_t*)(big_file + next_block));

                // clear the pointer on the old block
                *((uint64_t*)(big_file + previous)) = 0;
            } while (next_block != 0);
        }
    }

    /*
     * First big entry is control block:
     * 0 - 7: The next free new block (high-water mark)
     * 8 - 15: The number of free blocks below
     * 16 - 23 [... repeating]: The next free unused block
     */
    /*
     * Big entry format:
     * 0 - 7: next block in chain, if any.
     * 8 - 32767: payload
     */

    // Writes `len` bytes of overflow payload into a chain of big blocks,
    // reusing an existing chain when next_block is non-zero and allocating
    // as needed. Surplus old blocks are returned to the free list.
    // return 0 = failure (big file exhausted)
    // > 0 = first block in the chain
    uint64_t
    write_big_entry_internal(uint8_t* data, ssize_t len, uint64_t next_block)
    {
        uint64_t first_block = 0;

        uint64_t* last_block_ptr = 0;
        do
        {
            // if next_block is populated we follow an existing pathway
            // otherwise allocate a new block now

            if (!next_block)
                next_block = get_big_block();

            if (!next_block)
                return 0;

            if (!first_block)
                first_block = next_block;

            // link the previous block in the chain to this one
            if (last_block_ptr)
                *last_block_ptr = next_block;

            uint8_t* big_ptr = big_file + next_block;

            // copy to the block (32760 = 32768 minus the 8-byte next ptr)
            ssize_t to_write = len > 32760 ? 32760 : len;
            memcpy(big_ptr + 8, data, to_write);

            data += to_write;
            len -= to_write;

            next_block = *((uint64_t*)big_ptr);
            last_block_ptr = (uint64_t*)big_ptr;
        } while (len > 0);

        // if there's a dangling chain we'll unallocate it
        if (next_block != 0)
            unalloc_blocks(next_block);

        return first_block;
    }

    /*
     * Entry format:
     * 0 - 31: the 32 byte key
     * 32 - 39: flags (high 4 bytes are flags, low 4 are size)
     * 40 - 1023: data (up to 984 bytes)
     */
    // Inserts, overwrites, or (len == 0) deletes the entry for `key` in the
    // data file mapped at `data`. Takes the bucket's write lock and keeps
    // the bucket sorted (descending, zero slots last) after every change.
    // 0 = success
    // 1 = bucket full
    // 2 = big blocks full
    int
    write_entry_internal(
        uint8_t* data,
        uint8_t* key,
        uint8_t* val,
        uint32_t len)
    {
        // find the entry
        uint64_t offset = OFFSET(key[0], key[1], (key[2] >> 4));

        // lock the bucket for writing
        std::unique_lock<std::shared_mutex> lock(mutexes[offset >> 18]);

        uint8_t* start = data + offset;
        for (int i = 0; i < 256 * 1024; i += 1024)
        {
            bool const found = IS_ENTRY(start + i, key);
            // keep scanning until we hit the matching key or the first
            // empty slot (zero slots sort to the end, so empty means
            // "not present in this bucket")
            if (!found && !IS_ZERO_ENTRY(start + i))
                continue;

            // special edge case: the key doesn't exist and they're trying to
            // delete it
            if (!found && len == 0)
                return 0;

            // read flags
            uint64_t flags = *((uint64_t*)(start + i + 32));

            // big entries are tricky
            bool const old_big = (flags >> 32) != 0;
            bool const new_big = len > 984;

            if (new_big)
            {
                // write_big_entry_internal(uint8_t* data, ssize_t len, uint64_t
                // next_block)
                uint64_t first_block = write_big_entry_internal(
                    val + 984, len - 984, (old_big ? (flags >> 32) : 0));

                if (first_block == 0)  // error state
                {
                    if (old_big)
                        unalloc_blocks(flags >> 32);

                    return 2;
                }

                flags = (first_block << 32) + len;
            }
            else if (old_big)  // big blocks exist but new value is small
            {
                // unallocate the old chain
                unalloc_blocks(flags >> 32);
            }

            if (!new_big)
                flags = len;

            if (len == 0)
            {
                // deletion requests are written as zero keys
                memset(start + i, 0, 1024);
            }
            else
            {
                /// write entry
                WRITE_KEY(start + i, key, flags);
                memcpy(start + i + 40, val, (len > 984 ? 984 : len));
            }

            // sort the bucket backwards so 0's appear at the end
            qsort(start, 256, 1024, compare_entries_reverse);

            return 0;
        }

        /// file (bucket) full
        return 1;
    }

    // Reads the entry for `key` from the data file mapped at `data`,
    // following any big-block chain. Takes the bucket's shared lock.
    // out_len carries the length of the output buffer when calling and is
    // replaced with the length of the data found when returning.
    // 0 = found, 1 = not found, 2 = buffer too small, 3 = broken chain.
    int
    read_entry_internal(
        uint8_t* data,
        uint8_t* key,
        uint8_t* val_out,
        uint64_t* out_len)
    {
        uint64_t buf_len = *out_len;

        // find the entry
        uint64_t offset = OFFSET(key[0], key[1], (key[2] >> 4));
        uint8_t* start = data + offset;

        // lock the bucket for reading
        std::shared_lock<std::shared_mutex> lock(mutexes[offset >> 18]);

        for (int i = 0; i < 256 * 1024; i += 1024)
        {
            // zero slots mark the end of the bucket's live entries
            if (IS_ZERO_ENTRY(start + i))
                return 1;

            if (!IS_ENTRY(start + i, key))
                continue;

            // read out the value

            uint64_t flags = *((uint64_t*)(start + i + 32));

            uint32_t size = flags & 0xFFFFFFFFUL;
            uint64_t next_block = flags >> 32;

            if (size > buf_len)
                return 2;

            *out_len = size;

            // inline portion first (up to 984 bytes)
            size_t to_read = size > 984 ? 984 : size;
            memcpy(val_out, start + i + 40, to_read);

            val_out += to_read;
            size -= to_read;

            // big block read logic
            while (size > 0)
            {
                // follow big block pointers
                if (!next_block)
                {
                    printf("End while size=%d\n", size);
                    return 3;
                }

                uint8_t* big_ptr = big_file + next_block;
                to_read = size > 32760 ? 32760 : size;
                memcpy(val_out, big_ptr + 8, to_read);

                val_out += to_read;
                size -= to_read;

                next_block = *((uint64_t*)big_ptr);
            }

            return 0;
        }

        return 1;
    }

    // Validates `path`, discovers/creates the snug.N data files and the
    // snug.big overflow file, and mmaps them all. Throws std::runtime_error
    // on any failure.
    void
    setup()
    {
        struct stat path_stat;

        if (stat(path.c_str(), &path_stat) != 0)
            throw std::runtime_error(
                "Error checking path: " + path + " - " +
                std::string(strerror(errno)));

        if (!S_ISDIR(path_stat.st_mode))
            throw std::runtime_error("Path is not a directory: " + path);

        if (access(path.c_str(), R_OK | W_OK | X_OK) != 0)
            throw std::runtime_error(
                "Insufficient permissions for path: " + path);

        // Search for existing snug files sequentially
        std::vector<std::string> snug_files;
        for (int file_index = 0; file_index < 1024; ++file_index)
        {
            std::string filename = "snug." + std::to_string(file_index);
            std::string full_path = path + "/" + filename;

            if (access(full_path.c_str(), F_OK) != 0)
                break;

            snug_files.push_back(filename);
        }

        // If no files found, create snug.0
        if (snug_files.empty())
        {
            std::string new_file = path + "/snug.0";
            int result = alloc_file(new_file.c_str(), SNUGSIZE);
            if (result != 0)
                throw std::runtime_error(
                    "Failed to create initial file: " + new_file);
            snug_files.push_back("snug.0");
        }

        // Memory map all files
        for (const auto& file : snug_files)
        {
            std::string full_path = path + "/" + file;
            if (check_file(full_path.c_str(), SNUGSIZE) != 0)
                throw std::runtime_error("File was the wrong size: " + file);

            int fd = open(full_path.c_str(), O_RDWR);
            if (fd == -1)
                throw std::runtime_error("Unable to open file: " + full_path);

            struct stat file_stat;
            if (fstat(fd, &file_stat) == -1)
            {
                close(fd);
                throw std::runtime_error(
                    "Unable to get file stats: " + full_path);
            }

            void* mapped = mmap(nullptr, file_stat.st_size, MMAPFLAGS, fd, 0);
            close(fd);  // Can close fd after mmap

            if (mapped == MAP_FAILED)
                throw std::runtime_error("Unable to mmap file: " + full_path);

            mapped_files[mapped_files_count++] = static_cast<uint8_t*>(mapped);
        }

        // create and map snug.big overflow file
        {
            std::string new_file = path + "/snug.big";
            if (check_file(new_file.c_str(), BIGSIZE) != 0)
            {
                int result = alloc_file(new_file.c_str(), BIGSIZE);
                if (result != 0)
                    throw std::runtime_error(
                        "Failed to create initial file: " + new_file);
            }

            int fd = open(new_file.c_str(), O_RDWR);
            if (fd == -1)
                throw std::runtime_error("Unable to open file: " + new_file);

            struct stat file_stat;
            if (fstat(fd, &file_stat) == -1)
            {
                close(fd);
                throw std::runtime_error(
                    "Unable to get file stats: " + new_file);
            }

            void* mapped = mmap(nullptr, file_stat.st_size, MMAPFLAGS, fd, 0);
            close(fd);  // Can close fd after mmap

            if (mapped == MAP_FAILED)
                throw std::runtime_error("Unable to mmap file: " + new_file);

            big_file = static_cast<uint8_t*>(mapped);
        }
    }

public:
    // Opens (or initializes) the database rooted at path_. Throws on any
    // filesystem or mmap failure; see setup().
    SnugDB(std::string path_) : path(path_)
    {
        setup();
    }

    ~SnugDB()
    {
        // Unmap all files in destructor
        // RH TODO: consider lock here
        for (int i = 0; i < mapped_files_count; ++i)
            munmap(mapped_files[i], SNUGSIZE);

        // unmap the big file
        munmap(big_file, BIGSIZE);
    }

    // Writes (or deletes, when len == 0) the value for a 32-byte key.
    // Tries each mapped data file in order; if the key's bucket is full in
    // all of them, allocates and maps a new snug.N file and retries.
    // Returns 0 on success; 2 = big file exhausted; 1/2/3 = open/fstat/mmap
    // failure for a new data file; 11-13 = alloc_file error + 10.
    int
    write_entry(uint8_t* key, uint8_t* val, ssize_t len)
    {
        for (size_t i = 0; i < mapped_files_count; ++i)
        {
            int result = write_entry_internal(mapped_files[i], key, val, len);
            if (result == 0)
                return 0;

            if (result != 1)  // only bucket full falls through
                return result;
        }

        // All existing files are full, allocate a new one
        {
            // acquire the mutex
            const std::lock_guard<std::mutex> lock(mapped_files_count_mutex);

            std::string new_file =
                path + "/snug." + std::to_string(mapped_files_count);
            int alloc_result = alloc_file(new_file.c_str(), SNUGSIZE);
            if (alloc_result != 0)
                return alloc_result +
                    10;  // Return error code from alloc_file if it fails (+10)

            int fd = open(new_file.c_str(), O_RDWR);
            if (fd == -1)
                return 1;  // Return 1 for open failure

            struct stat file_stat;
            if (fstat(fd, &file_stat) == -1)
            {
                close(fd);
                return 2;  // Return 2 for fstat failure
            }

            void* mapped = mmap(
                nullptr,
                file_stat.st_size,
                PROT_READ | PROT_WRITE,
                MAP_SHARED,
                fd,
                0);
            close(fd);  // Can close fd after mmap

            if (mapped == MAP_FAILED)
                return 3;  // Return 3 for mmap failure

            // add the new file to the map, and increment the counter
            mapped_files[mapped_files_count] = static_cast<uint8_t*>(mapped);

            // this is the last possible thing done: readers scan by count,
            // so the pointer must be published before the count grows
            mapped_files_count++;
        }

        // finally write the entry
        // RH TODO: consider adding a recursion guard here
        return write_entry(key, val, len);
    }

    // Reads the value for a 32-byte key, searching each data file in turn.
    // *out_len_orig is the buffer capacity on entry and the value size on a
    // successful return. Returns 0 = found, 1 = not found, 2 = buffer too
    // small.
    int
    read_entry(uint8_t* key, uint8_t* val_out, uint64_t* out_len_orig)
    {
        for (size_t i = 0; i < mapped_files_count; ++i)
        {
            uint64_t out_len = *out_len_orig;

            int result =
                read_entry_internal(mapped_files[i], key, val_out, &out_len);

            if (result == 0)
            {
                *out_len_orig = out_len;
                return 0;  // Entry found and read successfully
            }

            if (result == 2)
                return 2;  // Output buffer too small
        }

        // Entry not found in any file
        return 1;
    }

    // Visits every live entry across all data files, bucket by bucket, in
    // ascending memcmp key order within each bucket. Big entries are
    // reassembled into a temporary buffer before the callback is invoked
    // with (key, data, length, opaque).
    void
    visit_all(
        void (*f)(uint8_t*, uint8_t*, uint64_t, void* /*opaque caller val*/),
        void* opaque)
    {
        // to visit all we only need to check snug.0 to begin with
        // we go to the first bucket
        // if we find no entries there we go to the next bucket
        // if we find entries there then we need to count them,
        // if we find 256 entries there then we go to snug.1 and so on until we
        // run out we merge sort the entries into a list for the visit

        for (uint64_t bucket = 0; bucket < BUCKET_COUNT; ++bucket)
        {
            // acquire the bucket lock
            std::shared_lock<std::shared_mutex> lock(mutexes[bucket]);

            // check the bucket: a zero flags word on the first slot means
            // the bucket has no live entries in snug.0 (and buckets only
            // spill to later files once full here)
            uint8_t* ptr = mapped_files[0] + (bucket << 18);

            if (*((uint64_t*)(ptr + 32)) == 0)
                continue;

            // if (IS_ZERO_ENTRY(ptr))
            //     continue;

            // live bucket, collect entries
            std::vector<uint8_t*> entries;
            {
                // need to acquire the mutex to prevent a race condition
                // where a new file is being added while we're searching
                const std::lock_guard<std::mutex> lock(
                    mapped_files_count_mutex);

                // preallocate worst case scenario, RIP memory
                entries.reserve(mapped_files_count * 256);

                for (int i = 0; i < mapped_files_count; ++i)
                {
                    uint8_t* ptr = mapped_files[i] + (bucket << 18);
                    for (int entry_count = 0;
                         !IS_ZERO_ENTRY(ptr) && entry_count < 256;
                         ++entry_count, ptr += 1024)
                        entries.push_back(ptr);
                }
            }

            if (entries.empty())
                continue;

            // sort the entries by key (ascending byte order)
            std::sort(
                entries.begin(),
                entries.end(),
                [](const uint8_t* a, const uint8_t* b) {
                    return memcmp(a, b, 32) < 0;
                });

            for (auto e : entries)
            {
                // visitation
                uint8_t* entry = &e[0];
                uint64_t flags = *((uint64_t*)(entry + 32));
                uint64_t next_block = flags >> 32;
                uint64_t size = flags & 0xFFFFFFFFULL;

                // small values are passed straight out of the mapped slot
                if (size <= 984)
                {
                    f(entry, entry + 40, size, opaque);
                    continue;
                }

                // copy big entry to a buffer
                std::unique_ptr<uint8_t[]> copybuf =
                    std::make_unique<uint8_t[]>(size);

                uint8_t* data = &(copybuf[0]);
                memcpy(data, entry + 40, 984);

                data += 984;
                size -= 984;

                // big block read logic
                while (size > 0)
                {
                    // follow big block pointers
                    if (!next_block)
                    {
                        printf("End while size=%lu\n", size);
                        return;
                    }

                    uint8_t* big_ptr = big_file + next_block;
                    uint64_t to_read = size > 32760 ? 32760 : size;
                    memcpy(data, big_ptr + 8, to_read);

                    data += to_read;
                    size -= to_read;

                    next_block = *((uint64_t*)big_ptr);
                }

                // NOTE(review): `data` points at the END of the copied
                // buffer here, not its start — presumably the callback was
                // meant to receive &copybuf[0]; confirm against callers.
                f(entry, data, (flags & 0xFFFFFFFFULL), opaque);
            }
        }
    }
};
|
||||
|
||||
} // namespace snug
|
||||
Reference in New Issue
Block a user