// // Copyright (c) 2015-2016 Vinnie Falco (vinnie dot falco at gmail dot com) // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) // #ifndef NUDB_IMPL_BASIC_STORE_IPP #define NUDB_IMPL_BASIC_STORE_IPP #include #include #include #include #include #ifndef NUDB_DEBUG_LOG #define NUDB_DEBUG_LOG 0 #endif #if NUDB_DEBUG_LOG #include #include #endif namespace nudb { template basic_store::state:: state(File&& df_, File&& kf_, File&& lf_, path_type const& dp_, path_type const& kp_, path_type const& lp_, detail::key_file_header const& kh_) : df(std::move(df_)) , kf(std::move(kf_)) , lf(std::move(lf_)) , dp(dp_) , kp(kp_) , lp(lp_) , hasher(kh_.salt) , p0(kh_.key_size, "p0") , p1(kh_.key_size, "p1") , c1(kh_.key_size, kh_.block_size, "c1") , kh(kh_) { static_assert(is_File::value, "File requirements not met"); } //------------------------------------------------------------------------------ template basic_store:: ~basic_store() { error_code ec; // We call close here to make sure data is intact // if an exception destroys the basic_store, but callers // should always call close manually to receive the // error code. close(ec); } template path_type const& basic_store:: dat_path() const { BOOST_ASSERT(is_open()); return s_->dp; } template path_type const& basic_store:: key_path() const { BOOST_ASSERT(is_open()); return s_->kp; } template path_type const& basic_store:: log_path() const { BOOST_ASSERT(is_open()); return s_->lp; } template std::uint64_t basic_store:: appnum() const { BOOST_ASSERT(is_open()); return s_->kh.appnum; } template std::size_t basic_store:: key_size() const { BOOST_ASSERT(is_open()); return s_->kh.key_size; } template std::size_t basic_store:: block_size() const { BOOST_ASSERT(is_open()); return s_->kh.block_size; } template template void basic_store:: open( path_type const& dat_path, path_type const& key_path, path_type const& log_path, error_code& ec, Args&&... args) { static_assert(is_Hasher::value, "Hasher requirements not met"); using namespace detail; BOOST_ASSERT(! is_open()); ec_ = {}; ecb_.store(false); recover( dat_path, key_path, log_path, ec, args...); if(ec) return; File df(args...); File kf(args...); File lf(args...); df.open(file_mode::append, dat_path, ec); if(ec) return; kf.open(file_mode::write, key_path, ec); if(ec) return; lf.create(file_mode::append, log_path, ec); if(ec) return; // VFALCO TODO Erase empty log file if this // function subsequently fails. dat_file_header dh; read(df, dh, ec); if(ec) return; verify(dh, ec); if(ec) return; key_file_header kh; read(kf, kh, ec); if(ec) return; verify(kh, ec); if(ec) return; verify(dh, kh, ec); if(ec) return; boost::optional s; s.emplace(std::move(df), std::move(kf), std::move(lf), dat_path, key_path, log_path, kh); thresh_ = std::max(65536UL, kh.load_factor * kh.capacity); frac_ = thresh_ / 2; buckets_ = kh.buckets; modulus_ = ceil_pow2(kh.buckets); // VFALCO TODO This could be better if(buckets_ < 1) { ec = error::short_key_file; return; } dataWriteSize_ = 32 * nudb::block_size(dat_path); logWriteSize_ = 32 * nudb::block_size(log_path); s_.emplace(std::move(*s)); open_ = true; t_ = std::thread(&basic_store::run, this); } template void basic_store:: close(error_code& ec) { if(open_) { open_ = false; cv_.notify_all(); t_.join(); if(ecb_) { ec = ec_; return; } s_->lf.close(); state s{std::move(*s_)}; File::erase(s.lp, ec_); if(ec_) ec = ec_; } } template template void basic_store:: fetch( void const* key, Callback && callback, error_code& ec) { using namespace detail; BOOST_ASSERT(is_open()); if(ecb_) { ec = ec_; return; } auto const h = hash(key, s_->kh.key_size, s_->hasher); shared_lock_type m{m_}; { auto iter = s_->p1.find(key); if(iter == s_->p1.end()) { iter = s_->p0.find(key); if(iter == s_->p0.end()) goto cont; } callback(iter->first.data, iter->first.size); return; } cont: auto const n = bucket_index(h, buckets_, modulus_); auto const iter = s_->c1.find(n); if(iter != s_->c1.end()) return fetch(h, key, iter->second, callback, ec); genlock g{g_}; m.unlock(); buffer buf{s_->kh.block_size}; // b constructs from uninitialized buf bucket b{s_->kh.block_size, buf.get()}; b.read(s_->kf, (n + 1) * b.block_size(), ec); if(ec) return; fetch(h, key, b, callback, ec); } template void basic_store:: insert( void const* key, void const* data, nsize_t size, error_code& ec) { using namespace detail; using namespace std::chrono; BOOST_ASSERT(is_open()); if(ecb_) { ec = ec_; return; } // Data Record BOOST_ASSERT(size > 0); // zero disallowed BOOST_ASSERT(size <= field::max); // too large auto const h = hash(key, s_->kh.key_size, s_->hasher); std::lock_guard u{u_}; { shared_lock_type m{m_}; if(s_->p1.find(key) != s_->p1.end() || s_->p0.find(key) != s_->p0.end()) { ec = error::key_exists; return; } auto const n = bucket_index(h, buckets_, modulus_); auto const iter = s_->c1.find(n); if(iter != s_->c1.end()) { auto const found = exists( h, key, &m, iter->second, ec); if(ec) return; if(found) { ec = error::key_exists; return; } // m is now unlocked } else { // VFALCO Audit for concurrency genlock g{g_}; m.unlock(); buffer buf; buf.reserve(s_->kh.block_size); bucket b{s_->kh.block_size, buf.get()}; b.read(s_->kf, static_cast(n + 1) * s_->kh.block_size, ec); if(ec) return; auto const found = exists(h, key, nullptr, b, ec); if(ec) return; if(found) { ec = error::key_exists; return; } } } // Perform insert unique_lock_type m{m_}; s_->p1.insert(h, key, data, size); auto const now = clock_type::now(); auto const elapsed = duration_cast>( now > s_->when ? now - s_->when : clock_type::duration{1}); auto const work = s_->p1.data_size() + 3 * s_->p1.size() * s_->kh.block_size; auto const rate = static_cast( std::ceil(work / elapsed.count())); auto const sleep = s_->rate && rate > s_->rate; m.unlock(); if(sleep) std::this_thread::sleep_for(milliseconds{25}); } // Fetch key in loaded bucket b or its spills. // template template void basic_store:: fetch( detail::nhash_t h, void const* key, detail::bucket b, Callback&& callback, error_code& ec) { using namespace detail; buffer buf0; buffer buf1; for(;;) { for(auto i = b.lower_bound(h); i < b.size(); ++i) { auto const item = b[i]; if(item.hash != h) break; // Data Record auto const len = s_->kh.key_size + // Key item.size; // Value buf0.reserve(len); s_->df.read(item.offset + field::size, // Size buf0.get(), len, ec); if(ec) return; if(std::memcmp(buf0.get(), key, s_->kh.key_size) == 0) { callback( buf0.get() + s_->kh.key_size, item.size); return; } } auto const spill = b.spill(); if(! spill) break; buf1.reserve(s_->kh.block_size); b = bucket(s_->kh.block_size, buf1.get()); b.read(s_->df, spill, ec); if(ec) return; } ec = error::key_not_found; } // Returns `true` if the key exists // lock is unlocked after the first bucket processed // template bool basic_store:: exists( detail::nhash_t h, void const* key, detail::shared_lock_type* lock, detail::bucket b, error_code& ec) { using namespace detail; buffer buf{s_->kh.key_size + s_->kh.block_size}; void* pk = buf.get(); void* pb = buf.get() + s_->kh.key_size; for(;;) { for(auto i = b.lower_bound(h); i < b.size(); ++i) { auto const item = b[i]; if(item.hash != h) break; // Data Record s_->df.read(item.offset + field::size, // Size pk, s_->kh.key_size, ec); // Key if(ec) return false; if(std::memcmp(pk, key, s_->kh.key_size) == 0) return true; } auto spill = b.spill(); if(lock && lock->owns_lock()) lock->unlock(); if(! spill) break; b = bucket(s_->kh.block_size, pb); b.read(s_->df, spill, ec); if(ec) return false; } return false; } // Split the bucket in b1 to b2 // b1 must be loaded // tmp is used as a temporary buffer // splits are written but not the new buckets // template void basic_store:: split( detail::bucket& b1, detail::bucket& b2, detail::bucket& tmp, nbuck_t n1, nbuck_t n2, nbuck_t buckets, nbuck_t modulus, detail::bulk_writer& w, error_code& ec) { using namespace detail; // Trivial case: split empty bucket if(b1.empty()) return; // Split for(std::size_t i = 0; i < b1.size();) { auto const e = b1[i]; auto const n = bucket_index(e.hash, buckets, modulus); (void)n1; (void)n2; BOOST_ASSERT(n==n1 || n==n2); if(n == n2) { b2.insert(e.offset, e.size, e.hash); b1.erase(i); } else { ++i; } } noff_t spill = b1.spill(); if(spill) { b1.spill(0); do { // If any part of the spill record is // in the write buffer then flush first if(spill + bucket_size(s_->kh.capacity) > w.offset() - w.size()) { w.flush(ec); if(ec) return; } tmp.read(s_->df, spill, ec); if(ec) return; for(std::size_t i = 0; i < tmp.size(); ++i) { auto const e = tmp[i]; auto const n = bucket_index( e.hash, buckets, modulus); BOOST_ASSERT(n==n1 || n==n2); if(n == n2) { maybe_spill(b2, w, ec); if(ec) return; b2.insert(e.offset, e.size, e.hash); } else { maybe_spill(b1, w, ec); if(ec) return; b1.insert(e.offset, e.size, e.hash); } } spill = tmp.spill(); } while(spill); } } template detail::bucket basic_store:: load( nbuck_t n, detail::cache& c1, detail::cache& c0, void* buf, error_code& ec) { using namespace detail; auto iter = c1.find(n); if(iter != c1.end()) return iter->second; iter = c0.find(n); if(iter != c0.end()) return c1.insert(n, iter->second)->second; bucket tmp{s_->kh.block_size, buf}; tmp.read(s_->kf, static_cast(n + 1) * s_->kh.block_size, ec); if(ec) return {}; c0.insert(n, tmp); return c1.insert(n, tmp)->second; } template void basic_store:: commit(detail::unique_lock_type& m, std::size_t& work, error_code& ec) { using namespace detail; BOOST_ASSERT(m.owns_lock()); BOOST_ASSERT(! s_->p1.empty()); swap(s_->p0, s_->p1); m.unlock(); work = s_->p0.data_size(); cache c0(s_->kh.key_size, s_->kh.block_size, "c0"); cache c1(s_->kh.key_size, s_->kh.block_size, "c1"); // 0.63212 ~= 1 - 1/e { auto const size = static_cast( std::ceil(0.63212 * s_->p0.size())); c0.reserve(size); c1.reserve(size); } buffer buf1{s_->kh.block_size}; buffer buf2{s_->kh.block_size}; bucket tmp{s_->kh.block_size, buf1.get()}; // Prepare rollback information log_file_header lh; lh.version = currentVersion; // Version lh.uid = s_->kh.uid; // UID lh.appnum = s_->kh.appnum; // Appnum lh.key_size = s_->kh.key_size; // Key Size lh.salt = s_->kh.salt; // Salt lh.pepper = pepper(lh.salt); // Pepper lh.block_size = s_->kh.block_size; // Block Size lh.key_file_size = s_->kf.size(ec); // Key File Size if(ec) return; lh.dat_file_size = s_->df.size(ec); // Data File Size if(ec) return; write(s_->lf, lh, ec); if(ec) return; // Checkpoint s_->lf.sync(ec); if(ec) return; // Append data and spills to data file auto modulus = modulus_; auto buckets = buckets_; { // Bulk write to avoid write amplification auto const size = s_->df.size(ec); if(ec) return; bulk_writer w{s_->df, size, dataWriteSize_}; // Write inserted data to the data file for(auto& e : s_->p0) { // VFALCO This could be UB since other // threads are reading other data members // of this object in memory e.second = w.offset(); auto os = w.prepare(value_size( e.first.size, s_->kh.key_size), ec); if(ec) return; // Data Record write(os, e.first.size); // Size write(os, e.first.key, s_->kh.key_size); // Key write(os, e.first.data, e.first.size); // Data } // Do inserts, splits, and build view // of original and modified buckets for(auto const e : s_->p0) { // VFALCO Should this be >= or > ? if((frac_ += 65536) >= thresh_) { // split frac_ -= thresh_; if(buckets == modulus) modulus *= 2; auto const n1 = buckets - (modulus / 2); auto const n2 = buckets++; auto b1 = load(n1, c1, c0, buf2.get(), ec); if(ec) return; auto b2 = c1.create(n2); // If split spills, the writer is // flushed which can amplify writes. split(b1, b2, tmp, n1, n2, buckets, modulus, w, ec); if(ec) return; } // Insert auto const n = bucket_index( e.first.hash, buckets, modulus); auto b = load(n, c1, c0, buf2.get(), ec); if(ec) return; // This can amplify writes if it spills. maybe_spill(b, w, ec); if(ec) return; b.insert(e.second, e.first.size, e.first.hash); } w.flush(ec); if(ec) return; } work += s_->kh.block_size * (2 * c0.size() + c1.size()); // Give readers a view of the new buckets. // This might be slightly better than the old // view since there could be fewer spills. m.lock(); swap(c1, s_->c1); s_->p0.clear(); buckets_ = buckets; modulus_ = modulus; g_.start(); m.unlock(); // Write clean buckets to log file { auto const size = s_->lf.size(ec); if(ec) return; bulk_writer w{s_->lf, size, logWriteSize_}; for(auto const e : c0) { // Log Record auto os = w.prepare( field::size + // Index e.second.actual_size(), ec); // Bucket if(ec) return; // Log Record write(os, e.first); // Index e.second.write(os); // Bucket } c0.clear(); w.flush(ec); if(ec) return; s_->lf.sync(ec); if(ec) return; } g_.finish(); // Write new buckets to key file for(auto const e : s_->c1) { e.second.write(s_->kf, (e.first + 1) * s_->kh.block_size, ec); if(ec) return; } // Finalize the commit s_->df.sync(ec); if(ec) return; s_->kf.sync(ec); if(ec) return; s_->lf.trunc(0, ec); if(ec) return; s_->lf.sync(ec); if(ec) return; // Cache is no longer needed, all fetches will go straight // to disk again. Do this after the sync, otherwise readers // might get blocked longer due to the extra I/O. m.lock(); s_->c1.clear(); } template void basic_store:: run() { using namespace std::chrono; using namespace detail; #if NUDB_DEBUG_LOG beast::unit_test::dstream dout{std::cout}; #endif for(;;) { unique_lock_type m{m_}; if(! s_->p1.empty()) { std::size_t work; commit(m, work, ec_); if(ec_) { ecb_.store(true); return; } BOOST_ASSERT(m.owns_lock()); auto const now = clock_type::now(); auto const elapsed = duration_cast>( now > s_->when ? now - s_->when : clock_type::duration{1}); s_->rate = static_cast( std::ceil(work / elapsed.count())); #if NUDB_DEBUG_LOG dout << "work=" << work << ", time=" << elapsed.count() << ", rate=" << s_->rate << "\n"; #endif } s_->p1.periodic_activity(); cv_.wait_until(m, s_->when + seconds{1}, [this]{ return ! open_; }); if(! open_) break; s_->when = clock_type::now(); } { unique_lock_type m{m_}; std::size_t work; if(! s_->p1.empty()) commit(m, work, ec_); } if(ec_) { ecb_.store(true); return; } } } // nudb #endif