mirror of
https://github.com/XRPLF/rippled.git
synced 2025-11-20 02:55:50 +00:00
Squashed 'src/hyperleveldb/' changes from ac2ae30..25511b7
25511b7 Merge branch 'master' of github.com:rescrv/HyperLevelDB into hyperdb ed01020 Make "source" universal 3784d92 Ignore the static file 507319b Don't package with snappy 3e2cc8b Tolerate -fno-rtti 4dcdd6e Drop revision down to 1.0.dev 2542163 Drop all but the latest kept for garbage reasons 9c270b7 Update .gitignore 5331878 Add upack script adc2a7a Explicitly add -lpthread for Ubuntu 7b57bbd Strip NULL chars passed to LiveBackup e3b87e7 Add write-buffer-size option to benchmark 2f11087 Followup to snappy support with -DSNAPPY af503da Improve efficiency of ReplayIterator; fix a bug 33c1f0c Add snappy support ce1cacf Fix a race in ReplayIterator 5c4679b Fix a bug in the replay_iterator ca332bd Fix sort algorithm used for compaction boundaries. d9ec544 make checK b83a9cd Fix a deadlock in the ReplayIterator dtor 273547b Fix a double-delete in ReplayIterator 3377c7a Add "all" to set of special timestamps 387f43a Timestamp comparison and validation. f9a6eb1 make distcheck 9a4d0b7 Add a ReplayIterator. 1d53869 Conditionally enable read-driven compaction. f6fa561 16% end-to-end performance improvement from the skiplist 28ffd32 Merge remote-tracking branch 'upstream/master' a58de73 Revert "Remove read-driven compactions." e19fc0c Fix upstream issue 200 748539c LevelDB 1.13 78b7812 Add install instructions to README e47a48e Make benchmark dir variable 820a096 Update distributed files. 486ca7f Live backup of LevelDB instances 6579884 Put a reference counter on log_/logfile_ 3075253 Update internal benchmark. 2a6b0bd Make the Version a parameter of PickCompaction 5bd76dc Release leveldb 1.12 git-subtree-dir: src/hyperleveldb git-subtree-split: 25511b7a9101b0bafb57349d2194ba80ccbf7bc3
This commit is contained in:
47
.gitignore
vendored
47
.gitignore
vendored
@@ -7,26 +7,27 @@
|
||||
*.o
|
||||
*_test
|
||||
# specific files
|
||||
aclocal.m4
|
||||
autom4te.cache/
|
||||
benchmark
|
||||
config.guess
|
||||
config.h
|
||||
config.h.in
|
||||
config.log
|
||||
config.status
|
||||
config.sub
|
||||
configure
|
||||
db_bench
|
||||
depcomp
|
||||
install-sh
|
||||
leveldbutil
|
||||
libhyperleveldb.pc
|
||||
libtool
|
||||
ltmain.sh
|
||||
m4/
|
||||
Makefile
|
||||
Makefile.in
|
||||
Makefile.old
|
||||
missing
|
||||
stamp-h1
|
||||
/aclocal.m4
|
||||
/autom4te.cache/
|
||||
/benchmark
|
||||
/config.guess
|
||||
/config.h
|
||||
/config.h.in
|
||||
/config.log
|
||||
/config.status
|
||||
/config.sub
|
||||
/configure
|
||||
/db_bench
|
||||
/depcomp
|
||||
/hyperleveldb.upack
|
||||
/install-sh
|
||||
/leveldbutil
|
||||
/libhyperleveldb.pc
|
||||
/libtool
|
||||
/ltmain.sh
|
||||
/m4/
|
||||
/Makefile
|
||||
/Makefile.in
|
||||
/Makefile.old
|
||||
/missing
|
||||
/stamp-h1
|
||||
|
||||
3
AUTHORS
3
AUTHORS
@@ -7,5 +7,8 @@ Google Inc.
|
||||
Jeffrey Dean <jeff@google.com>
|
||||
Sanjay Ghemawat <sanjay@google.com>
|
||||
|
||||
# Partial list of contributors:
|
||||
Kevin Regan <kevin.d.regan@gmail.com>
|
||||
|
||||
# HyperLevelDB authors:
|
||||
Robert Escriva <robert@hyperdex.org>
|
||||
|
||||
26
Makefile.am
26
Makefile.am
@@ -26,8 +26,8 @@
|
||||
## POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
ACLOCAL_AMFLAGS = -I m4 ${ACLOCAL_FLAGS}
|
||||
AM_CFLAGS = -DLEVELDB_PLATFORM_POSIX -fno-builtin-memcmp -fno-builtin-memmove
|
||||
AM_CXXFLAGS = -DLEVELDB_PLATFORM_POSIX -fno-builtin-memcmp -fno-builtin-memmove
|
||||
AM_CFLAGS = -DLEVELDB_PLATFORM_POSIX $(SNAPPY_FLAGS) -fno-builtin-memcmp -fno-builtin-memmove
|
||||
AM_CXXFLAGS = -DLEVELDB_PLATFORM_POSIX $(SNAPPY_FLAGS) -fno-builtin-memcmp -fno-builtin-memmove
|
||||
AM_MAKEFLAGS = --no-print-directory
|
||||
|
||||
pkgconfigdir = $(libdir)/pkgconfig
|
||||
@@ -35,11 +35,19 @@ pkgconfig_DATA = libhyperleveldb.pc
|
||||
|
||||
EXTRA_DIST =
|
||||
EXTRA_DIST += AUTHORS
|
||||
EXTRA_DIST += doc/benchmark.html
|
||||
EXTRA_DIST += doc/doc.css
|
||||
EXTRA_DIST += doc/impl.html
|
||||
EXTRA_DIST += doc/index.html
|
||||
EXTRA_DIST += doc/log_format.txt
|
||||
EXTRA_DIST += doc/table_format.txt
|
||||
EXTRA_DIST += helpers/memenv/memenv.cc
|
||||
EXTRA_DIST += helpers/memenv/memenv_test.cc
|
||||
EXTRA_DIST += LICENSE
|
||||
EXTRA_DIST += NEWS
|
||||
EXTRA_DIST += port/README
|
||||
EXTRA_DIST += README
|
||||
EXTRA_DIST += TODO
|
||||
EXTRA_DIST += port/README
|
||||
|
||||
nobase_include_HEADERS =
|
||||
nobase_include_HEADERS += hyperleveldb/cache.h
|
||||
@@ -51,6 +59,7 @@ nobase_include_HEADERS += hyperleveldb/filter_policy.h
|
||||
nobase_include_HEADERS += hyperleveldb/iterator.h
|
||||
nobase_include_HEADERS += hyperleveldb/options.h
|
||||
nobase_include_HEADERS += hyperleveldb/slice.h
|
||||
nobase_include_HEADERS += hyperleveldb/replay_iterator.h
|
||||
nobase_include_HEADERS += hyperleveldb/status.h
|
||||
nobase_include_HEADERS += hyperleveldb/table_builder.h
|
||||
nobase_include_HEADERS += hyperleveldb/table.h
|
||||
@@ -66,6 +75,7 @@ noinst_HEADERS += db/log_reader.h
|
||||
noinst_HEADERS += db/log_writer.h
|
||||
noinst_HEADERS += db/memtable.h
|
||||
noinst_HEADERS += db/skiplist.h
|
||||
noinst_HEADERS += db/replay_iterator.h
|
||||
noinst_HEADERS += db/snapshot.h
|
||||
noinst_HEADERS += db/table_cache.h
|
||||
noinst_HEADERS += db/version_edit.h
|
||||
@@ -110,6 +120,7 @@ libhyperleveldb_la_SOURCES += db/log_reader.cc
|
||||
libhyperleveldb_la_SOURCES += db/log_writer.cc
|
||||
libhyperleveldb_la_SOURCES += db/memtable.cc
|
||||
libhyperleveldb_la_SOURCES += db/repair.cc
|
||||
libhyperleveldb_la_SOURCES += db/replay_iterator.cc
|
||||
libhyperleveldb_la_SOURCES += db/table_cache.cc
|
||||
libhyperleveldb_la_SOURCES += db/version_edit.cc
|
||||
libhyperleveldb_la_SOURCES += db/version_set.cc
|
||||
@@ -138,6 +149,7 @@ libhyperleveldb_la_SOURCES += util/logging.cc
|
||||
libhyperleveldb_la_SOURCES += util/options.cc
|
||||
libhyperleveldb_la_SOURCES += util/status.cc
|
||||
libhyperleveldb_la_SOURCES += port/port_posix.cc
|
||||
libhyperleveldb_la_LIBADD = $(SNAPPY_LIBS) -lpthread
|
||||
libhyperleveldb_la_LDFLAGS = -pthread
|
||||
|
||||
TESTUTIL = util/testutil.cc
|
||||
@@ -153,6 +165,7 @@ EXTRA_PROGRAMS += db_bench_sqlite3
|
||||
EXTRA_PROGRAMS += db_bench_tree_db
|
||||
|
||||
check_PROGRAMS =
|
||||
check_PROGRAMS += autocompact_test
|
||||
check_PROGRAMS += arena_test
|
||||
check_PROGRAMS += bloom_test
|
||||
check_PROGRAMS += c_test
|
||||
@@ -172,6 +185,7 @@ check_PROGRAMS += version_edit_test
|
||||
check_PROGRAMS += version_set_test
|
||||
check_PROGRAMS += write_batch_test
|
||||
check_PROGRAMS += issue178_test
|
||||
check_PROGRAMS += issue200_test
|
||||
|
||||
TESTS = $(check_PROGRAMS)
|
||||
|
||||
@@ -191,6 +205,9 @@ db_bench_tree_db_LDADD = -lkyotocabinet
|
||||
leveldbutil_SOURCES = db/leveldb_main.cc
|
||||
leveldbutil_LDADD = libhyperleveldb.la -lpthread
|
||||
|
||||
autocompact_test_SOURCES = db/autocompact_test.cc $(TESTHARNESS)
|
||||
autocompact_test_LDADD = libhyperleveldb.la -lpthread
|
||||
|
||||
arena_test_SOURCES = util/arena_test.cc $(TESTHARNESS)
|
||||
arena_test_LDADD = libhyperleveldb.la -lpthread
|
||||
|
||||
@@ -247,3 +264,6 @@ write_batch_test_LDADD = libhyperleveldb.la -lpthread
|
||||
|
||||
issue178_test_SOURCES = issues/issue178_test.cc $(TESTHARNESS)
|
||||
issue178_test_LDADD = libhyperleveldb.la -lpthread
|
||||
|
||||
issue200_test_SOURCES = issues/issue200_test.cc $(TESTHARNESS)
|
||||
issue200_test_LDADD = libhyperleveldb.la -lpthread
|
||||
|
||||
11
README
11
README
@@ -49,3 +49,14 @@ include/env.h
|
||||
include/table.h
|
||||
include/table_builder.h
|
||||
Lower-level modules that most clients probably won't use directly
|
||||
|
||||
Install
|
||||
=======
|
||||
|
||||
Get up and running quickly:
|
||||
|
||||
$ autoreconf -i
|
||||
$ ./configure
|
||||
$ make
|
||||
# make install
|
||||
# ldconfig
|
||||
|
||||
138
benchmark.cc
138
benchmark.cc
@@ -42,7 +42,8 @@
|
||||
#include <po6/threads/thread.h>
|
||||
|
||||
// e
|
||||
#include <e/guard.h>
|
||||
#include <e/popt.h>
|
||||
#include <e/time.h>
|
||||
|
||||
// armnod
|
||||
#include <armnod.h>
|
||||
@@ -50,17 +51,9 @@
|
||||
// numbers
|
||||
#include <numbers.h>
|
||||
|
||||
static long _done = 0;
|
||||
static long _number = 1000000;
|
||||
static long _threads = 1;
|
||||
|
||||
static struct poptOption _popts[] = {
|
||||
{"number", 'n', POPT_ARG_LONG, &_number, 0,
|
||||
"perform N operations against the database (default: 1000000)", "N"},
|
||||
{"threads", 't', POPT_ARG_LONG, &_threads, 0,
|
||||
"run the test with T concurrent threads (default: 1)", "T"},
|
||||
POPT_TABLEEND
|
||||
};
|
||||
static void
|
||||
backup_thread(leveldb::DB*,
|
||||
numbers::throughput_latency_logger* tll);
|
||||
|
||||
static void
|
||||
worker_thread(leveldb::DB*,
|
||||
@@ -68,50 +61,55 @@ worker_thread(leveldb::DB*,
|
||||
const armnod::argparser& k,
|
||||
const armnod::argparser& v);
|
||||
|
||||
static long _done = 0;
|
||||
static long _number = 1000000;
|
||||
static long _threads = 1;
|
||||
static long _backup = 0;
|
||||
static long _write_buf = 64ULL * 1024ULL * 1024ULL;
|
||||
static const char* _output = "benchmark.log";
|
||||
static const char* _dir = ".";
|
||||
|
||||
int
|
||||
main(int argc, const char* argv[])
|
||||
{
|
||||
e::argparser ap;
|
||||
ap.autohelp();
|
||||
ap.arg().name('n', "number")
|
||||
.description("perform N operations against the database (default: 1000000)")
|
||||
.metavar("N")
|
||||
.as_long(&_number);
|
||||
ap.arg().name('t', "threads")
|
||||
.description("run the test with T concurrent threads (default: 1)")
|
||||
.metavar("T")
|
||||
.as_long(&_threads);
|
||||
ap.arg().name('o', "output")
|
||||
.description("output file for benchmark results (default: benchmark.log)")
|
||||
.as_string(&_output);
|
||||
ap.arg().name('d', "db-dir")
|
||||
.description("directory for leveldb storage (default: .)")
|
||||
.as_string(&_dir);
|
||||
ap.arg().name('w', "write-buffer")
|
||||
.description("write buffer size (default: 64MB)")
|
||||
.as_long(&_write_buf);
|
||||
ap.arg().name('b', "backup")
|
||||
.description("perform a live backup every N seconds (default: 0 (no backup))")
|
||||
.as_long(&_backup);
|
||||
armnod::argparser key_parser("key-");
|
||||
armnod::argparser value_parser("value-");
|
||||
std::vector<poptOption> popts;
|
||||
poptOption s[] = {POPT_AUTOHELP {NULL, 0, POPT_ARG_INCLUDE_TABLE, _popts, 0, "Benchmark:", NULL}, POPT_TABLEEND};
|
||||
popts.push_back(s[0]);
|
||||
popts.push_back(s[1]);
|
||||
popts.push_back(key_parser.options("Key Generation:"));
|
||||
popts.push_back(value_parser.options("Value Generation:"));
|
||||
popts.push_back(s[2]);
|
||||
poptContext poptcon;
|
||||
poptcon = poptGetContext(NULL, argc, argv, &popts.front(), POPT_CONTEXT_POSIXMEHARDER);
|
||||
e::guard g = e::makeguard(poptFreeContext, poptcon); g.use_variable();
|
||||
poptSetOtherOptionHelp(poptcon, "[OPTIONS]");
|
||||
ap.add("Key Generation:", key_parser.parser());
|
||||
ap.add("Value Generation:", value_parser.parser());
|
||||
|
||||
int rc;
|
||||
|
||||
while ((rc = poptGetNextOpt(poptcon)) != -1)
|
||||
if (!ap.parse(argc, argv))
|
||||
{
|
||||
switch (rc)
|
||||
{
|
||||
case POPT_ERROR_NOARG:
|
||||
case POPT_ERROR_BADOPT:
|
||||
case POPT_ERROR_BADNUMBER:
|
||||
case POPT_ERROR_OVERFLOW:
|
||||
std::cerr << poptStrerror(rc) << " " << poptBadOption(poptcon, 0) << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
case POPT_ERROR_OPTSTOODEEP:
|
||||
case POPT_ERROR_BADQUOTE:
|
||||
case POPT_ERROR_ERRNO:
|
||||
default:
|
||||
std::cerr << "logic error in argument parsing" << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
leveldb::Options opts;
|
||||
opts.create_if_missing = true;
|
||||
opts.write_buffer_size = 64ULL * 1024ULL * 1024ULL;
|
||||
opts.write_buffer_size = write_buf;
|
||||
opts.filter_policy = leveldb::NewBloomFilterPolicy(10);
|
||||
leveldb::DB* db;
|
||||
leveldb::Status st = leveldb::DB::Open(opts, "tmp", &db);
|
||||
leveldb::Status st = leveldb::DB::Open(opts, _dir, &db);
|
||||
|
||||
if (!st.ok())
|
||||
{
|
||||
@@ -121,7 +119,7 @@ main(int argc, const char* argv[])
|
||||
|
||||
numbers::throughput_latency_logger tll;
|
||||
|
||||
if (!tll.open("benchmark.log"))
|
||||
if (!tll.open(_output))
|
||||
{
|
||||
std::cerr << "could not open log: " << strerror(errno) << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
@@ -130,6 +128,13 @@ main(int argc, const char* argv[])
|
||||
typedef std::tr1::shared_ptr<po6::threads::thread> thread_ptr;
|
||||
std::vector<thread_ptr> threads;
|
||||
|
||||
if (_backup > 0)
|
||||
{
|
||||
thread_ptr t(new po6::threads::thread(std::tr1::bind(backup_thread, db, &tll)));
|
||||
threads.push_back(t);
|
||||
t->start();
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < _threads; ++i)
|
||||
{
|
||||
thread_ptr t(new po6::threads::thread(std::tr1::bind(worker_thread, db, &tll, key_parser, value_parser)));
|
||||
@@ -175,6 +180,47 @@ get_random()
|
||||
return ret;
|
||||
}
|
||||
|
||||
#define BILLION (1000ULL * 1000ULL * 1000ULL)
|
||||
|
||||
void
|
||||
backup_thread(leveldb::DB* db,
|
||||
numbers::throughput_latency_logger* tll)
|
||||
{
|
||||
uint64_t target = e::time() / BILLION;
|
||||
target += _backup;
|
||||
uint64_t idx = 0;
|
||||
numbers::throughput_latency_logger::thread_state ts;
|
||||
tll->initialize_thread(&ts);
|
||||
|
||||
while (__sync_fetch_and_add(&_done, 0) < _number)
|
||||
{
|
||||
uint64_t now = e::time() / BILLION;
|
||||
|
||||
if (now < target)
|
||||
{
|
||||
timespec ts;
|
||||
ts.tv_sec = 0;
|
||||
ts.tv_nsec = 250ULL * 1000ULL * 1000ULL;
|
||||
nanosleep(&ts, NULL);
|
||||
}
|
||||
else
|
||||
{
|
||||
target = now + _backup;
|
||||
char buf[32];
|
||||
snprintf(buf, 32, "%05lu", idx);
|
||||
buf[31] = '\0';
|
||||
leveldb::Slice name(buf);
|
||||
leveldb::Status st;
|
||||
|
||||
tll->start(&ts, 4);
|
||||
st = db->LiveBackup(name);
|
||||
tll->finish(&ts);
|
||||
assert(st.ok());
|
||||
++idx;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
worker_thread(leveldb::DB* db,
|
||||
numbers::throughput_latency_logger* tll,
|
||||
@@ -196,13 +242,15 @@ worker_thread(leveldb::DB* db,
|
||||
// issue a "get"
|
||||
std::string tmp;
|
||||
leveldb::ReadOptions ropts;
|
||||
tll->start(&ts, 1);
|
||||
leveldb::Status rst = db->Get(ropts, leveldb::Slice(k.data(), k.size()), &tmp);
|
||||
tll->finish(&ts);
|
||||
assert(rst.ok() || rst.IsNotFound());
|
||||
|
||||
// issue a "put"
|
||||
leveldb::WriteOptions wopts;
|
||||
wopts.sync = false;
|
||||
tll->start(&ts, 0);
|
||||
tll->start(&ts, 2);
|
||||
leveldb::Status wst = db->Put(wopts, leveldb::Slice(k.data(), k.size()), leveldb::Slice(v.data(), v.size()));
|
||||
tll->finish(&ts);
|
||||
assert(wst.ok());
|
||||
|
||||
38
configure.ac
38
configure.ac
@@ -30,7 +30,7 @@
|
||||
# Process this file with autoconf to produce a configure script.
|
||||
|
||||
AC_PREREQ([2.62])
|
||||
AC_INIT([hyperleveldb], [2.0.dev], [robert@hyperdex.org])
|
||||
AC_INIT([hyperleveldb], [1.0.dev], [robert@hyperdex.org])
|
||||
AM_INIT_AUTOMAKE([foreign subdir-objects dist-bzip2])
|
||||
m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
|
||||
LT_PREREQ([2.2])
|
||||
@@ -58,6 +58,40 @@ AC_FUNC_MMAP
|
||||
AC_CHECK_FUNCS([alarm clock_gettime mach_absolute_time ftruncate memmove mkdir munmap rmdir socket])
|
||||
|
||||
# Optional components
|
||||
snappy_detect_hdr=yes
|
||||
snappy_detect_lib=yes
|
||||
AC_CHECK_LIB([snappy], [snappy_compress], [], [snappy_detect_hdr=no])
|
||||
AC_CHECK_HEADER([snappy.h],,[snappy_detect_lib=no])
|
||||
AC_ARG_ENABLE([snappy], [AS_HELP_STRING([--enable-snappy],
|
||||
[build with Snappy @<:@default: auto@:>@])],
|
||||
[snappy=${enableval}], [snappy=no])
|
||||
if test x"${snappy}" = xyes; then
|
||||
if test x"${snappy_detect_hdr}" != xyes; then
|
||||
AC_MSG_ERROR([
|
||||
-------------------------------------------------
|
||||
LevelDB configured with the Snappy library.
|
||||
libsnappy.so not found
|
||||
Please install Snappy to continue.
|
||||
-------------------------------------------------])
|
||||
fi
|
||||
if test x"${snappy_detect_lib}" != xyes; then
|
||||
AC_MSG_ERROR([
|
||||
-------------------------------------------------
|
||||
LevelDB configured with the Snappy library.
|
||||
snappy.h not found
|
||||
Please install Snappy to continue.
|
||||
-------------------------------------------------])
|
||||
fi
|
||||
fi
|
||||
if test x"${snappy_detect_hdr}" = xyes -a x"${snappy_detect_lib}" = xyes; then
|
||||
SNAPPY_FLAGS=-DSNAPPY
|
||||
SNAPPY_LIBS=-lsnappy
|
||||
else
|
||||
SNAPPY_FLAGS=
|
||||
SNAPPY_LIBS=
|
||||
fi
|
||||
AC_SUBST(SNAPPY_FLAGS)
|
||||
AC_SUBST(SNAPPY_LIBS)
|
||||
|
||||
AC_CONFIG_FILES([Makefile libhyperleveldb.pc])
|
||||
AC_CONFIG_FILES([Makefile libhyperleveldb.pc hyperleveldb.upack])
|
||||
AC_OUTPUT
|
||||
|
||||
123
db/autocompact_test.cc
Normal file
123
db/autocompact_test.cc
Normal file
@@ -0,0 +1,123 @@
|
||||
// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "hyperleveldb/db.h"
|
||||
#include "db/db_impl.h"
|
||||
#include "hyperleveldb/cache.h"
|
||||
#include "util/testharness.h"
|
||||
#include "util/testutil.h"
|
||||
|
||||
namespace leveldb {
|
||||
|
||||
class AutoCompactTest {
|
||||
public:
|
||||
std::string dbname_;
|
||||
Cache* tiny_cache_;
|
||||
Options options_;
|
||||
DB* db_;
|
||||
|
||||
AutoCompactTest() {
|
||||
dbname_ = test::TmpDir() + "/autocompact_test";
|
||||
tiny_cache_ = NewLRUCache(100);
|
||||
options_.block_cache = tiny_cache_;
|
||||
DestroyDB(dbname_, options_);
|
||||
options_.create_if_missing = true;
|
||||
options_.compression = kNoCompression;
|
||||
ASSERT_OK(DB::Open(options_, dbname_, &db_));
|
||||
}
|
||||
|
||||
~AutoCompactTest() {
|
||||
delete db_;
|
||||
DestroyDB(dbname_, Options());
|
||||
delete tiny_cache_;
|
||||
}
|
||||
|
||||
std::string Key(int i) {
|
||||
char buf[100];
|
||||
snprintf(buf, sizeof(buf), "key%06d", i);
|
||||
return std::string(buf);
|
||||
}
|
||||
|
||||
uint64_t Size(const Slice& start, const Slice& limit) {
|
||||
Range r(start, limit);
|
||||
uint64_t size;
|
||||
db_->GetApproximateSizes(&r, 1, &size);
|
||||
return size;
|
||||
}
|
||||
|
||||
void DoReads(int n);
|
||||
};
|
||||
|
||||
static const int kValueSize = 200 * 1024;
|
||||
static const int kTotalSize = 100 * 1024 * 1024;
|
||||
static const int kCount = kTotalSize / kValueSize;
|
||||
|
||||
// Read through the first n keys repeatedly and check that they get
|
||||
// compacted (verified by checking the size of the key space).
|
||||
void AutoCompactTest::DoReads(int n) {
|
||||
std::string value(kValueSize, 'x');
|
||||
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
|
||||
|
||||
// Fill database
|
||||
for (int i = 0; i < kCount; i++) {
|
||||
ASSERT_OK(db_->Put(WriteOptions(), Key(i), value));
|
||||
}
|
||||
ASSERT_OK(dbi->TEST_CompactMemTable());
|
||||
|
||||
// Delete everything
|
||||
for (int i = 0; i < kCount; i++) {
|
||||
ASSERT_OK(db_->Delete(WriteOptions(), Key(i)));
|
||||
}
|
||||
ASSERT_OK(dbi->TEST_CompactMemTable());
|
||||
|
||||
// Get initial measurement of the space we will be reading.
|
||||
const int64_t initial_size = Size(Key(0), Key(n));
|
||||
const int64_t initial_other_size = Size(Key(n), Key(kCount));
|
||||
|
||||
// Read until size drops significantly.
|
||||
std::string limit_key = Key(n);
|
||||
for (int read = 0; true; read++) {
|
||||
ASSERT_LT(read, 100) << "Taking too long to compact";
|
||||
Iterator* iter = db_->NewIterator(ReadOptions());
|
||||
for (iter->SeekToFirst();
|
||||
iter->Valid() && iter->key().ToString() < limit_key;
|
||||
iter->Next()) {
|
||||
// Drop data
|
||||
}
|
||||
delete iter;
|
||||
// Wait a little bit to allow any triggered compactions to complete.
|
||||
Env::Default()->SleepForMicroseconds(1000000);
|
||||
uint64_t size = Size(Key(0), Key(n));
|
||||
fprintf(stderr, "iter %3d => %7.3f MB [other %7.3f MB]\n",
|
||||
read+1, size/1048576.0, Size(Key(n), Key(kCount))/1048576.0);
|
||||
if (size <= initial_size/10) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Verify that the size of the key space not touched by the reads
|
||||
// is pretty much unchanged.
|
||||
const int64_t final_other_size = Size(Key(n), Key(kCount));
|
||||
ASSERT_LE(final_other_size, initial_other_size + 1048576);
|
||||
ASSERT_GE(final_other_size, initial_other_size/5 - 1048576);
|
||||
}
|
||||
|
||||
TEST(AutoCompactTest, ReadAll) {
|
||||
DoReads(kCount);
|
||||
}
|
||||
|
||||
// HyperLevelDB's ratio-driven compactions always compact everything here. The
|
||||
// reads trigger the compaction, but then the system decides it is more
|
||||
// effiicient to just collect everything, emptying the db completely.
|
||||
#if 0
|
||||
TEST(AutoCompactTest, ReadHalf) {
|
||||
DoReads(kCount/2);
|
||||
}
|
||||
#endif
|
||||
|
||||
} // namespace leveldb
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
return leveldb::test::RunAllTests();
|
||||
}
|
||||
@@ -35,6 +35,7 @@ class CorruptionTest {
|
||||
CorruptionTest() {
|
||||
tiny_cache_ = NewLRUCache(100);
|
||||
options_.env = &env_;
|
||||
options_.block_cache = tiny_cache_;
|
||||
dbname_ = test::TmpDir() + "/db_test";
|
||||
DestroyDB(dbname_, options_);
|
||||
|
||||
@@ -50,17 +51,14 @@ class CorruptionTest {
|
||||
delete tiny_cache_;
|
||||
}
|
||||
|
||||
Status TryReopen(Options* options = NULL) {
|
||||
Status TryReopen() {
|
||||
delete db_;
|
||||
db_ = NULL;
|
||||
Options opt = (options ? *options : options_);
|
||||
opt.env = &env_;
|
||||
opt.block_cache = tiny_cache_;
|
||||
return DB::Open(opt, dbname_, &db_);
|
||||
return DB::Open(options_, dbname_, &db_);
|
||||
}
|
||||
|
||||
void Reopen(Options* options = NULL) {
|
||||
ASSERT_OK(TryReopen(options));
|
||||
void Reopen() {
|
||||
ASSERT_OK(TryReopen());
|
||||
}
|
||||
|
||||
void RepairDB() {
|
||||
@@ -92,6 +90,10 @@ class CorruptionTest {
|
||||
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
||||
uint64_t key;
|
||||
Slice in(iter->key());
|
||||
if (in == "" || in == "~") {
|
||||
// Ignore boundary keys.
|
||||
continue;
|
||||
}
|
||||
if (!ConsumeDecimalNumber(&in, &key) ||
|
||||
!in.empty() ||
|
||||
key < next_expected) {
|
||||
@@ -233,7 +235,7 @@ TEST(CorruptionTest, TableFile) {
|
||||
dbi->TEST_CompactRange(1, NULL, NULL);
|
||||
|
||||
Corrupt(kTableFile, 100, 1);
|
||||
Check(99, 99);
|
||||
Check(90, 99);
|
||||
}
|
||||
|
||||
TEST(CorruptionTest, TableFileIndexData) {
|
||||
@@ -299,7 +301,7 @@ TEST(CorruptionTest, CompactionInputError) {
|
||||
ASSERT_EQ(1, Property("leveldb.num-files-at-level" + NumberToString(last)));
|
||||
|
||||
Corrupt(kTableFile, 100, 1);
|
||||
Check(9, 9);
|
||||
Check(5, 9);
|
||||
|
||||
// Force compactions by writing lots of values
|
||||
Build(10000);
|
||||
@@ -307,33 +309,23 @@ TEST(CorruptionTest, CompactionInputError) {
|
||||
}
|
||||
|
||||
TEST(CorruptionTest, CompactionInputErrorParanoid) {
|
||||
Options options;
|
||||
options.paranoid_checks = true;
|
||||
options.write_buffer_size = 1048576;
|
||||
Reopen(&options);
|
||||
options_.paranoid_checks = true;
|
||||
options_.write_buffer_size = 512 << 10;
|
||||
Reopen();
|
||||
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
|
||||
|
||||
// Fill levels >= 1 so memtable compaction outputs to level 1
|
||||
for (int level = 1; level < config::kNumLevels; level++) {
|
||||
dbi->Put(WriteOptions(), "", "begin");
|
||||
dbi->Put(WriteOptions(), "~", "end");
|
||||
// Make multiple inputs so we need to compact.
|
||||
for (int i = 0; i < 2; i++) {
|
||||
Build(10);
|
||||
dbi->TEST_CompactMemTable();
|
||||
Corrupt(kTableFile, 100, 1);
|
||||
env_.SleepForMicroseconds(100000);
|
||||
}
|
||||
dbi->CompactRange(NULL, NULL);
|
||||
|
||||
Build(10);
|
||||
dbi->TEST_CompactMemTable();
|
||||
env_.SleepForMicroseconds(1000000);
|
||||
ASSERT_EQ(1, Property("leveldb.num-files-at-level0"));
|
||||
|
||||
Corrupt(kTableFile, 100, 1);
|
||||
Check(9, 9);
|
||||
|
||||
// Write must eventually fail because of corrupted table
|
||||
Status s;
|
||||
// Write must fail because of corrupted table
|
||||
std::string tmp1, tmp2;
|
||||
for (int i = 0; i < 1000000 && s.ok(); i++) {
|
||||
s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2));
|
||||
}
|
||||
Status s = db_->Put(WriteOptions(), Key(5, &tmp1), Value(5, &tmp2));
|
||||
ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db";
|
||||
}
|
||||
|
||||
|
||||
389
db/db_impl.cc
389
db/db_impl.cc
@@ -5,11 +5,13 @@
|
||||
#include "db_impl.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <vector>
|
||||
|
||||
#include "builder.h"
|
||||
#include "db_iter.h"
|
||||
#include "dbformat.h"
|
||||
@@ -17,11 +19,13 @@
|
||||
#include "log_reader.h"
|
||||
#include "log_writer.h"
|
||||
#include "memtable.h"
|
||||
#include "replay_iterator.h"
|
||||
#include "table_cache.h"
|
||||
#include "version_set.h"
|
||||
#include "write_batch_internal.h"
|
||||
#include "../hyperleveldb/db.h"
|
||||
#include "../hyperleveldb/env.h"
|
||||
#include "../hyperleveldb/replay_iterator.h"
|
||||
#include "../hyperleveldb/status.h"
|
||||
#include "../hyperleveldb/table.h"
|
||||
#include "../hyperleveldb/table_builder.h"
|
||||
@@ -35,6 +39,8 @@
|
||||
|
||||
namespace hyperleveldb {
|
||||
|
||||
const int kStraightReads = 50;
|
||||
|
||||
const int kNumNonTableCacheFiles = 10;
|
||||
|
||||
struct DBImpl::CompactionState {
|
||||
@@ -102,22 +108,23 @@ Options SanitizeOptions(const std::string& dbname,
|
||||
return result;
|
||||
}
|
||||
|
||||
DBImpl::DBImpl(const Options& options, const std::string& dbname)
|
||||
: env_(options.env),
|
||||
internal_comparator_(options.comparator),
|
||||
internal_filter_policy_(options.filter_policy),
|
||||
options_(SanitizeOptions(
|
||||
dbname, &internal_comparator_, &internal_filter_policy_, options)),
|
||||
owns_info_log_(options_.info_log != options.info_log),
|
||||
owns_cache_(options_.block_cache != options.block_cache),
|
||||
DBImpl::DBImpl(const Options& raw_options, const std::string& dbname)
|
||||
: env_(raw_options.env),
|
||||
internal_comparator_(raw_options.comparator),
|
||||
internal_filter_policy_(raw_options.filter_policy),
|
||||
options_(SanitizeOptions(dbname, &internal_comparator_,
|
||||
&internal_filter_policy_, raw_options)),
|
||||
owns_info_log_(options_.info_log != raw_options.info_log),
|
||||
owns_cache_(options_.block_cache != raw_options.block_cache),
|
||||
dbname_(dbname),
|
||||
db_lock_(NULL),
|
||||
shutting_down_(NULL),
|
||||
mem_(new MemTable(internal_comparator_)),
|
||||
imm_(NULL),
|
||||
logfile_(NULL),
|
||||
logfile_(),
|
||||
logfile_number_(0),
|
||||
log_(NULL),
|
||||
log_(),
|
||||
seed_(0),
|
||||
writers_lower_(0),
|
||||
writers_upper_(0),
|
||||
bg_fg_cv_(&mutex_),
|
||||
@@ -130,17 +137,24 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname)
|
||||
bg_log_cv_(&mutex_),
|
||||
bg_log_occupied_(false),
|
||||
manual_compaction_(NULL),
|
||||
manual_garbage_cutoff_(raw_options.manual_garbage_collection ?
|
||||
SequenceNumber(0) : kMaxSequenceNumber),
|
||||
straight_reads_(0),
|
||||
backup_cv_(&mutex_),
|
||||
backup_in_progress_(),
|
||||
backup_deferred_delete_(),
|
||||
consecutive_compaction_errors_(0) {
|
||||
mutex_.Lock();
|
||||
mem_->Ref();
|
||||
has_imm_.Release_Store(NULL);
|
||||
backup_in_progress_.Release_Store(NULL);
|
||||
env_->StartThread(&DBImpl::CompactMemTableWrapper, this);
|
||||
env_->StartThread(&DBImpl::CompactOptimisticWrapper, this);
|
||||
env_->StartThread(&DBImpl::CompactLevelWrapper, this);
|
||||
num_bg_threads_ = 3;
|
||||
|
||||
// Reserve ten files or so for other uses and give the rest to TableCache.
|
||||
const int table_cache_size = options.max_open_files - kNumNonTableCacheFiles;
|
||||
const int table_cache_size = options_.max_open_files - kNumNonTableCacheFiles;
|
||||
table_cache_ = new TableCache(dbname_, &options_, table_cache_size);
|
||||
versions_ = new VersionSet(dbname_, &options_, table_cache_,
|
||||
&internal_comparator_);
|
||||
@@ -170,8 +184,8 @@ DBImpl::~DBImpl() {
|
||||
delete versions_;
|
||||
if (mem_ != NULL) mem_->Unref();
|
||||
if (imm_ != NULL) imm_->Unref();
|
||||
delete log_;
|
||||
delete logfile_;
|
||||
log_.reset();
|
||||
logfile_.reset();
|
||||
delete table_cache_;
|
||||
|
||||
if (owns_info_log_) {
|
||||
@@ -224,6 +238,16 @@ void DBImpl::MaybeIgnoreError(Status* s) const {
|
||||
}
|
||||
|
||||
void DBImpl::DeleteObsoleteFiles() {
|
||||
// Defer if there's background activity
|
||||
mutex_.AssertHeld();
|
||||
if (backup_in_progress_.Acquire_Load() != NULL) {
|
||||
backup_deferred_delete_ = true;
|
||||
return;
|
||||
}
|
||||
|
||||
// If you ever release mutex_ in this function, you'll need to do more work in
|
||||
// LiveBackup
|
||||
|
||||
// Make a set of all of the live files
|
||||
std::set<uint64_t> live = pending_outputs_;
|
||||
versions_->AddLiveFiles(&live);
|
||||
@@ -651,7 +675,7 @@ void DBImpl::CompactLevelThread() {
|
||||
while (!shutting_down_.Acquire_Load()) {
|
||||
while (!shutting_down_.Acquire_Load() &&
|
||||
manual_compaction_ == NULL &&
|
||||
!versions_->NeedsCompaction(levels_locked_)) {
|
||||
!versions_->NeedsCompaction(levels_locked_, straight_reads_ > kStraightReads)) {
|
||||
bg_compaction_cv_.Wait();
|
||||
}
|
||||
if (shutting_down_.Acquire_Load()) {
|
||||
@@ -709,9 +733,9 @@ Status DBImpl::BackgroundCompaction() {
|
||||
(m->end ? m->end->DebugString().c_str() : "(end)"),
|
||||
(m->done ? "(end)" : manual_end.DebugString().c_str()));
|
||||
} else {
|
||||
int level = versions_->PickCompactionLevel(levels_locked_);
|
||||
int level = versions_->PickCompactionLevel(levels_locked_, straight_reads_ > kStraightReads);
|
||||
if (level != config::kNumLevels) {
|
||||
c = versions_->PickCompaction(level);
|
||||
c = versions_->PickCompaction(versions_->current(), level);
|
||||
}
|
||||
if (c) {
|
||||
assert(!levels_locked_[c->level() + 0]);
|
||||
@@ -831,7 +855,7 @@ Status DBImpl::OptimisticCompaction() {
|
||||
if (levels_locked_[level] || levels_locked_[level + 1]) {
|
||||
continue;
|
||||
}
|
||||
Compaction* tmp = versions_->PickCompaction(level);
|
||||
Compaction* tmp = versions_->PickCompaction(versions_->current(), level);
|
||||
if (tmp && tmp->IsTrivialMove()) {
|
||||
if (c) {
|
||||
delete c;
|
||||
@@ -1055,6 +1079,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
|
||||
std::string current_user_key;
|
||||
bool has_current_user_key = false;
|
||||
SequenceNumber last_sequence_for_key = kMaxSequenceNumber;
|
||||
uint64_t i = 0;
|
||||
for (; input->Valid() && !shutting_down_.Acquire_Load(); ) {
|
||||
Slice key = input->key();
|
||||
// Handle key/value, add to state, etc.
|
||||
@@ -1074,6 +1099,9 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
|
||||
last_sequence_for_key = kMaxSequenceNumber;
|
||||
}
|
||||
|
||||
// Just remember that last_sequence_for_key is decreasing over time, and
|
||||
// all of this makes sense.
|
||||
|
||||
if (last_sequence_for_key <= compact->smallest_snapshot) {
|
||||
// Hidden by an newer entry for same user key
|
||||
drop = true; // (A)
|
||||
@@ -1090,6 +1118,15 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
|
||||
drop = true;
|
||||
}
|
||||
|
||||
// If we're going to drop this key, and there was no previous version of
|
||||
// this key, and it was written at or after the garbage cutoff, we keep
|
||||
// it.
|
||||
if (drop &&
|
||||
last_sequence_for_key == kMaxSequenceNumber &&
|
||||
ikey.sequence >= manual_garbage_cutoff_) {
|
||||
drop = false;
|
||||
}
|
||||
|
||||
last_sequence_for_key = ikey.sequence;
|
||||
}
|
||||
|
||||
@@ -1174,10 +1211,14 @@ static void CleanupIteratorState(void* arg1, void* arg2) {
|
||||
}
|
||||
} // namespace
|
||||
|
||||
Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
|
||||
SequenceNumber* latest_snapshot) {
|
||||
Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, uint64_t number,
|
||||
SequenceNumber* latest_snapshot,
|
||||
uint32_t* seed, bool external_sync) {
|
||||
IterState* cleanup = new IterState;
|
||||
mutex_.Lock();
|
||||
if (!external_sync) {
|
||||
mutex_.Lock();
|
||||
}
|
||||
++straight_reads_;
|
||||
*latest_snapshot = versions_->LastSequence();
|
||||
|
||||
// Collect together all needed child iterators
|
||||
@@ -1188,7 +1229,7 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
|
||||
list.push_back(imm_->NewIterator());
|
||||
imm_->Ref();
|
||||
}
|
||||
versions_->current()->AddIterators(options, &list);
|
||||
versions_->current()->AddSomeIterators(options, number, &list);
|
||||
Iterator* internal_iter =
|
||||
NewMergingIterator(&internal_comparator_, &list[0], list.size());
|
||||
versions_->current()->Ref();
|
||||
@@ -1199,13 +1240,17 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
|
||||
cleanup->version = versions_->current();
|
||||
internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, NULL);
|
||||
|
||||
mutex_.Unlock();
|
||||
*seed = ++seed_;
|
||||
if (!external_sync) {
|
||||
mutex_.Unlock();
|
||||
}
|
||||
return internal_iter;
|
||||
}
|
||||
|
||||
Iterator* DBImpl::TEST_NewInternalIterator() {
|
||||
SequenceNumber ignored;
|
||||
return NewInternalIterator(ReadOptions(), &ignored);
|
||||
uint32_t ignored_seed;
|
||||
return NewInternalIterator(ReadOptions(), 0, &ignored, &ignored_seed, false);
|
||||
}
|
||||
|
||||
int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() {
|
||||
@@ -1251,6 +1296,10 @@ Status DBImpl::Get(const ReadOptions& options,
|
||||
mutex_.Lock();
|
||||
}
|
||||
|
||||
if (have_stat_update && current->UpdateStats(stats)) {
|
||||
bg_compaction_cv_.Signal();
|
||||
}
|
||||
++straight_reads_;
|
||||
mem->Unref();
|
||||
if (imm != NULL) imm->Unref();
|
||||
current->Unref();
|
||||
@@ -1259,12 +1308,157 @@ Status DBImpl::Get(const ReadOptions& options,
|
||||
|
||||
Iterator* DBImpl::NewIterator(const ReadOptions& options) {
|
||||
SequenceNumber latest_snapshot;
|
||||
Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot);
|
||||
uint32_t seed;
|
||||
Iterator* iter = NewInternalIterator(options, 0, &latest_snapshot, &seed, false);
|
||||
return NewDBIterator(
|
||||
&dbname_, env_, user_comparator(), internal_iter,
|
||||
this, user_comparator(), iter,
|
||||
(options.snapshot != NULL
|
||||
? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
|
||||
: latest_snapshot));
|
||||
: latest_snapshot),
|
||||
seed);
|
||||
}
|
||||
|
||||
void DBImpl::GetReplayTimestamp(std::string* timestamp) {
|
||||
uint64_t file = 0;
|
||||
uint64_t seqno = 0;
|
||||
|
||||
{
|
||||
MutexLock l(&mutex_);
|
||||
file = versions_->NewFileNumber();
|
||||
versions_->ReuseFileNumber(file);
|
||||
seqno = versions_->LastSequence();
|
||||
}
|
||||
|
||||
timestamp->clear();
|
||||
PutVarint64(timestamp, file);
|
||||
PutVarint64(timestamp, seqno);
|
||||
}
|
||||
|
||||
void DBImpl::AllowGarbageCollectBeforeTimestamp(const std::string& timestamp) {
|
||||
Slice ts_slice(timestamp);
|
||||
uint64_t file = 0;
|
||||
uint64_t seqno = 0;
|
||||
|
||||
if (timestamp == "all") {
|
||||
// keep zeroes
|
||||
} else if (timestamp == "now") {
|
||||
MutexLock l(&mutex_);
|
||||
seqno = versions_->LastSequence();
|
||||
if (manual_garbage_cutoff_ < seqno) {
|
||||
manual_garbage_cutoff_ = seqno;
|
||||
}
|
||||
} else if (GetVarint64(&ts_slice, &file) &&
|
||||
GetVarint64(&ts_slice, &seqno)) {
|
||||
MutexLock l(&mutex_);
|
||||
if (manual_garbage_cutoff_ < seqno) {
|
||||
manual_garbage_cutoff_ = seqno;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool DBImpl::ValidateTimestamp(const std::string& ts) {
|
||||
uint64_t file = 0;
|
||||
uint64_t seqno = 0;
|
||||
Slice ts_slice(ts);
|
||||
return ts == "all" || ts == "now" ||
|
||||
(GetVarint64(&ts_slice, &file) &&
|
||||
GetVarint64(&ts_slice, &seqno));
|
||||
}
|
||||
|
||||
int DBImpl::CompareTimestamps(const std::string& lhs, const std::string& rhs) {
|
||||
uint64_t now = 0;
|
||||
uint64_t lhs_seqno = 0;
|
||||
uint64_t rhs_seqno = 0;
|
||||
uint64_t tmp;
|
||||
if (lhs == "now" || rhs == "now") {
|
||||
MutexLock l(&mutex_);
|
||||
now = versions_->LastSequence();
|
||||
}
|
||||
if (lhs == "all") {
|
||||
lhs_seqno = 0;
|
||||
} else if (lhs == "now") {
|
||||
lhs_seqno = now;
|
||||
} else {
|
||||
Slice lhs_slice(lhs);
|
||||
GetVarint64(&lhs_slice, &tmp);
|
||||
GetVarint64(&lhs_slice, &lhs_seqno);
|
||||
}
|
||||
if (rhs == "all") {
|
||||
rhs_seqno = 0;
|
||||
} else if (rhs == "now") {
|
||||
rhs_seqno = now;
|
||||
} else {
|
||||
Slice rhs_slice(rhs);
|
||||
GetVarint64(&rhs_slice, &tmp);
|
||||
GetVarint64(&rhs_slice, &rhs_seqno);
|
||||
}
|
||||
|
||||
if (lhs_seqno < rhs_seqno) {
|
||||
return -1;
|
||||
} else if (lhs_seqno > rhs_seqno) {
|
||||
return 1;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
Status DBImpl::GetReplayIterator(const std::string& timestamp,
|
||||
ReplayIterator** iter) {
|
||||
*iter = NULL;
|
||||
Slice ts_slice(timestamp);
|
||||
uint64_t file = 0;
|
||||
uint64_t seqno = 0;
|
||||
|
||||
if (timestamp == "all") {
|
||||
seqno = 0;
|
||||
} else if (timestamp == "now") {
|
||||
MutexLock l(&mutex_);
|
||||
file = versions_->NewFileNumber();
|
||||
versions_->ReuseFileNumber(file);
|
||||
seqno = versions_->LastSequence();
|
||||
} else if (!GetVarint64(&ts_slice, &file) ||
|
||||
!GetVarint64(&ts_slice, &seqno)) {
|
||||
return Status::InvalidArgument("Timestamp is not valid");
|
||||
}
|
||||
|
||||
ReadOptions options;
|
||||
SequenceNumber latest_snapshot;
|
||||
uint32_t seed;
|
||||
MutexLock l(&mutex_);
|
||||
Iterator* internal_iter = NewInternalIterator(options, file, &latest_snapshot, &seed, true);
|
||||
internal_iter->SeekToFirst();
|
||||
ReplayIteratorImpl* iterimpl;
|
||||
iterimpl = new ReplayIteratorImpl(
|
||||
this, &mutex_, user_comparator(), internal_iter, mem_, SequenceNumber(seqno));
|
||||
*iter = iterimpl;
|
||||
replay_iters_.push_back(iterimpl);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
void DBImpl::ReleaseReplayIterator(ReplayIterator* _iter) {
|
||||
MutexLock l(&mutex_);
|
||||
ReplayIteratorImpl* iter = reinterpret_cast<ReplayIteratorImpl*>(_iter);
|
||||
for (std::list<ReplayIteratorImpl*>::iterator it = replay_iters_.begin();
|
||||
it != replay_iters_.end(); ++it) {
|
||||
if (*it == iter) {
|
||||
iter->cleanup(); // calls delete
|
||||
replay_iters_.erase(it);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void DBImpl::RecordReadSample(Slice key) {
|
||||
MutexLock l(&mutex_);
|
||||
++straight_reads_;
|
||||
if (versions_->current()->RecordReadSample(key)) {
|
||||
bg_compaction_cv_.Signal();
|
||||
}
|
||||
}
|
||||
|
||||
SequenceNumber DBImpl::LastSequence() {
|
||||
MutexLock l(&mutex_);
|
||||
return versions_->LastSequence();
|
||||
}
|
||||
|
||||
const Snapshot* DBImpl::GetSnapshot() {
|
||||
@@ -1294,11 +1488,11 @@ struct DBImpl::Writer {
|
||||
Writer* next;
|
||||
uint64_t start_sequence;
|
||||
uint64_t end_sequence;
|
||||
WritableFile* logfile;
|
||||
log::Writer* log;
|
||||
std::tr1::shared_ptr<WritableFile> logfile;
|
||||
std::tr1::shared_ptr<log::Writer> log;
|
||||
MemTable* mem;
|
||||
WritableFile* old_logfile;
|
||||
log::Writer* old_log;
|
||||
std::tr1::shared_ptr<WritableFile> old_logfile;
|
||||
std::tr1::shared_ptr<log::Writer> old_log;
|
||||
|
||||
explicit Writer()
|
||||
: mtx(),
|
||||
@@ -1307,11 +1501,11 @@ struct DBImpl::Writer {
|
||||
next(NULL),
|
||||
start_sequence(0),
|
||||
end_sequence(0),
|
||||
logfile(NULL),
|
||||
log(NULL),
|
||||
logfile(),
|
||||
log(),
|
||||
mem(NULL),
|
||||
old_logfile(NULL),
|
||||
old_log(NULL) {
|
||||
old_logfile(),
|
||||
old_log() {
|
||||
}
|
||||
~Writer() throw () {
|
||||
}
|
||||
@@ -1345,10 +1539,12 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
|
||||
Status DBImpl::SequenceWriteBegin(Writer* w, WriteBatch* updates) {
|
||||
Status s;
|
||||
MutexLock l(&mutex_);
|
||||
straight_reads_ = 0;
|
||||
bool force = updates == NULL;
|
||||
bool allow_delay = !force;
|
||||
w->old_log = NULL;
|
||||
w->old_logfile = NULL;
|
||||
bool enqueue_mem = false;
|
||||
w->old_log.reset();
|
||||
w->old_logfile.reset();
|
||||
|
||||
while (true) {
|
||||
if (!bg_error_.ok()) {
|
||||
@@ -1380,14 +1576,15 @@ Status DBImpl::SequenceWriteBegin(Writer* w, WriteBatch* updates) {
|
||||
}
|
||||
w->old_log = log_;
|
||||
w->old_logfile = logfile_;
|
||||
logfile_ = lfile;
|
||||
logfile_.reset(lfile);
|
||||
logfile_number_ = new_log_number;
|
||||
log_ = new log::Writer(lfile);
|
||||
log_.reset(new log::Writer(lfile));
|
||||
imm_ = mem_;
|
||||
has_imm_.Release_Store(imm_);
|
||||
mem_ = new MemTable(internal_comparator_);
|
||||
mem_->Ref();
|
||||
force = false; // Do not force another compaction if have room
|
||||
enqueue_mem = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -1405,6 +1602,13 @@ Status DBImpl::SequenceWriteBegin(Writer* w, WriteBatch* updates) {
|
||||
w->mem->Ref();
|
||||
}
|
||||
|
||||
if (enqueue_mem) {
|
||||
for (std::list<ReplayIteratorImpl*>::iterator it = replay_iters_.begin();
|
||||
it != replay_iters_.end(); ++it) {
|
||||
(*it)->enqueue(mem_, w->start_sequence);
|
||||
}
|
||||
}
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
@@ -1429,8 +1633,8 @@ void DBImpl::SequenceWriteEnd(Writer* w) {
|
||||
// must do in order: log, logfile
|
||||
if (w->old_log) {
|
||||
assert(w->old_logfile);
|
||||
delete w->old_log;
|
||||
delete w->old_logfile;
|
||||
w->old_log.reset();
|
||||
w->old_logfile.reset();
|
||||
bg_memtable_cv_.Signal();
|
||||
}
|
||||
|
||||
@@ -1520,6 +1724,107 @@ void DBImpl::GetApproximateSizes(
|
||||
}
|
||||
}
|
||||
|
||||
Status DBImpl::LiveBackup(const Slice& _name) {
|
||||
Slice name = _name;
|
||||
size_t name_sz = 0;
|
||||
|
||||
for (; name_sz < name.size() && name.data()[name_sz] != '\0'; ++name_sz)
|
||||
;
|
||||
|
||||
name = Slice(name.data(), name_sz);
|
||||
std::set<uint64_t> live;
|
||||
uint64_t ticket = __sync_add_and_fetch(&writers_upper_, 1);
|
||||
|
||||
while (__sync_fetch_and_add(&writers_lower_, 0) < ticket)
|
||||
;
|
||||
|
||||
{
|
||||
MutexLock l(&mutex_);
|
||||
versions_->SetLastSequence(ticket);
|
||||
while (backup_in_progress_.Acquire_Load() != NULL) {
|
||||
backup_cv_.Wait();
|
||||
}
|
||||
backup_in_progress_.Release_Store(this);
|
||||
while (bg_log_occupied_) {
|
||||
bg_log_cv_.Wait();
|
||||
}
|
||||
bg_log_occupied_ = true;
|
||||
// note that this logic assumes that DeleteObsoleteFiles never releases
|
||||
// mutex_, so that once we release at this brace, we'll guarantee that it
|
||||
// will see backup_in_progress_. If you change DeleteObsoleteFiles to
|
||||
// release mutex_, you'll need to add some sort of synchronization in place
|
||||
// of this text block.
|
||||
versions_->AddLiveFiles(&live);
|
||||
__sync_fetch_and_add(&writers_lower_, 1);
|
||||
}
|
||||
|
||||
Status s;
|
||||
std::vector<std::string> filenames;
|
||||
s = env_->GetChildren(dbname_, &filenames);
|
||||
std::string backup_dir = dbname_ + "/backup-" + name.ToString() + "/";
|
||||
|
||||
if (s.ok()) {
|
||||
s = env_->CreateDir(backup_dir);
|
||||
}
|
||||
|
||||
uint64_t number;
|
||||
FileType type;
|
||||
|
||||
for (size_t i = 0; i < filenames.size(); i++) {
|
||||
if (!s.ok()) {
|
||||
continue;
|
||||
}
|
||||
if (ParseFileName(filenames[i], &number, &type)) {
|
||||
std::string src = dbname_ + "/" + filenames[i];
|
||||
std::string target = backup_dir + "/" + filenames[i];
|
||||
switch (type) {
|
||||
case kLogFile:
|
||||
case kDescriptorFile:
|
||||
case kCurrentFile:
|
||||
case kInfoLogFile:
|
||||
s = env_->CopyFile(src, target);
|
||||
break;
|
||||
case kTableFile:
|
||||
// If it's a file referenced by a version, we have logged that version
|
||||
// and applied it. Our MANIFEST will reflect that, and the file
|
||||
// number assigned to new files will be greater or equal, ensuring
|
||||
// that they aren't overwritten. Any file not in "live" either exists
|
||||
// past the current manifest (output of ongoing compaction) or so far
|
||||
// in the past we don't care (we're going to delete it at the end of
|
||||
// this backup). I'd rather play safe than sorry.
|
||||
//
|
||||
// Under no circumstances should you collapse this to a single
|
||||
// LinkFile without the conditional as it has implications for backups
|
||||
// that share hardlinks. Opening an older backup that has files
|
||||
// hardlinked with newer backups will overwrite "immutable" files in
|
||||
// the newer backups because they aren't in our manifest, and we do an
|
||||
// open/write rather than a creat/rename. We avoid linking these
|
||||
// files.
|
||||
if (live.find(number) != live.end()) {
|
||||
s = env_->LinkFile(src, target);
|
||||
}
|
||||
break;
|
||||
case kTempFile:
|
||||
case kDBLockFile:
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
MutexLock l(&mutex_);
|
||||
backup_in_progress_.Release_Store(NULL);
|
||||
if (s.ok() && backup_deferred_delete_) {
|
||||
DeleteObsoleteFiles();
|
||||
}
|
||||
backup_deferred_delete_ = false;
|
||||
bg_log_occupied_ = false;
|
||||
bg_log_cv_.Signal();
|
||||
backup_cv_.Signal();
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
// Default implementations of convenience methods that subclasses of DB
|
||||
// can call if they wish
|
||||
Status DB::Put(const WriteOptions& opt, const Slice& key, const Slice& value) {
|
||||
@@ -1551,9 +1856,9 @@ Status DB::Open(const Options& options, const std::string& dbname,
|
||||
&lfile);
|
||||
if (s.ok()) {
|
||||
edit.SetLogNumber(new_log_number);
|
||||
impl->logfile_ = lfile;
|
||||
impl->logfile_.reset(lfile);
|
||||
impl->logfile_number_ = new_log_number;
|
||||
impl->log_ = new log::Writer(lfile);
|
||||
impl->log_.reset(new log::Writer(lfile));
|
||||
s = impl->versions_->LogAndApply(&edit, &impl->mutex_, &impl->bg_log_cv_, &impl->bg_log_occupied_);
|
||||
}
|
||||
if (s.ok()) {
|
||||
|
||||
45
db/db_impl.h
45
db/db_impl.h
@@ -6,10 +6,14 @@
|
||||
#define STORAGE_HYPERLEVELDB_DB_DB_IMPL_H_
|
||||
|
||||
#include <deque>
|
||||
#include <list>
|
||||
#include <set>
|
||||
#include <tr1/memory>
|
||||
|
||||
#include "dbformat.h"
|
||||
#include "log_writer.h"
|
||||
#include "snapshot.h"
|
||||
#include "replay_iterator.h"
|
||||
#include "../hyperleveldb/db.h"
|
||||
#include "../hyperleveldb/env.h"
|
||||
#include "../port/port.h"
|
||||
@@ -36,11 +40,19 @@ class DBImpl : public DB {
|
||||
const Slice& key,
|
||||
std::string* value);
|
||||
virtual Iterator* NewIterator(const ReadOptions&);
|
||||
virtual void GetReplayTimestamp(std::string* timestamp);
|
||||
virtual void AllowGarbageCollectBeforeTimestamp(const std::string& timestamp);
|
||||
virtual bool ValidateTimestamp(const std::string& timestamp);
|
||||
virtual int CompareTimestamps(const std::string& lhs, const std::string& rhs);
|
||||
virtual Status GetReplayIterator(const std::string& timestamp,
|
||||
ReplayIterator** iter);
|
||||
virtual void ReleaseReplayIterator(ReplayIterator* iter);
|
||||
virtual const Snapshot* GetSnapshot();
|
||||
virtual void ReleaseSnapshot(const Snapshot* snapshot);
|
||||
virtual bool GetProperty(const Slice& property, std::string* value);
|
||||
virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes);
|
||||
virtual void CompactRange(const Slice* begin, const Slice* end);
|
||||
virtual Status LiveBackup(const Slice& name);
|
||||
|
||||
// Extra methods (for testing) that are not in the public DB interface
|
||||
|
||||
@@ -59,13 +71,23 @@ class DBImpl : public DB {
|
||||
// file at a level >= 1.
|
||||
int64_t TEST_MaxNextLevelOverlappingBytes();
|
||||
|
||||
// Record a sample of bytes read at the specified internal key.
|
||||
// Samples are taken approximately once every config::kReadBytesPeriod
|
||||
// bytes.
|
||||
void RecordReadSample(Slice key);
|
||||
|
||||
// Peek at the last sequence;
|
||||
// REQURES: mutex_ not held
|
||||
SequenceNumber LastSequence();
|
||||
|
||||
private:
|
||||
friend class DB;
|
||||
struct CompactionState;
|
||||
struct Writer;
|
||||
|
||||
Iterator* NewInternalIterator(const ReadOptions&,
|
||||
SequenceNumber* latest_snapshot);
|
||||
Iterator* NewInternalIterator(const ReadOptions&, uint64_t number,
|
||||
SequenceNumber* latest_snapshot,
|
||||
uint32_t* seed, bool external_sync);
|
||||
|
||||
Status NewDB();
|
||||
|
||||
@@ -139,9 +161,10 @@ class DBImpl : public DB {
|
||||
MemTable* mem_;
|
||||
MemTable* imm_; // Memtable being compacted
|
||||
port::AtomicPointer has_imm_; // So bg thread can detect non-NULL imm_
|
||||
WritableFile* logfile_;
|
||||
std::tr1::shared_ptr<WritableFile> logfile_;
|
||||
uint64_t logfile_number_;
|
||||
log::Writer* log_;
|
||||
std::tr1::shared_ptr<log::Writer> log_;
|
||||
uint32_t seed_; // For sampling.
|
||||
|
||||
// Synchronize writers
|
||||
uint64_t __attribute__ ((aligned (8))) writers_lower_;
|
||||
@@ -179,8 +202,22 @@ class DBImpl : public DB {
|
||||
};
|
||||
ManualCompaction* manual_compaction_;
|
||||
|
||||
// Where have we pinned tombstones?
|
||||
SequenceNumber manual_garbage_cutoff_;
|
||||
|
||||
// replay iterators
|
||||
std::list<ReplayIteratorImpl*> replay_iters_;
|
||||
|
||||
// how many reads have we done in a row, uninterrupted by writes
|
||||
uint64_t straight_reads_;
|
||||
|
||||
VersionSet* versions_;
|
||||
|
||||
// Information for ongoing backup processes
|
||||
port::CondVar backup_cv_;
|
||||
port::AtomicPointer backup_in_progress_; // non-NULL in progress
|
||||
bool backup_deferred_delete_; // DeleteObsoleteFiles delayed by backup; protect with mutex_
|
||||
|
||||
// Have we encountered a background error in paranoid mode?
|
||||
Status bg_error_;
|
||||
int consecutive_compaction_errors_;
|
||||
|
||||
@@ -5,12 +5,14 @@
|
||||
#include "db_iter.h"
|
||||
|
||||
#include "filename.h"
|
||||
#include "db_impl.h"
|
||||
#include "dbformat.h"
|
||||
#include "../hyperleveldb/env.h"
|
||||
#include "../hyperleveldb/iterator.h"
|
||||
#include "../port/port.h"
|
||||
#include "../util/logging.h"
|
||||
#include "../util/mutexlock.h"
|
||||
#include "../util/random.h"
|
||||
|
||||
namespace hyperleveldb {
|
||||
|
||||
@@ -46,15 +48,16 @@ class DBIter: public Iterator {
|
||||
kReverse
|
||||
};
|
||||
|
||||
DBIter(const std::string* dbname, Env* env,
|
||||
const Comparator* cmp, Iterator* iter, SequenceNumber s)
|
||||
: dbname_(dbname),
|
||||
env_(env),
|
||||
DBIter(DBImpl* db, const Comparator* cmp, Iterator* iter, SequenceNumber s,
|
||||
uint32_t seed)
|
||||
: db_(db),
|
||||
user_comparator_(cmp),
|
||||
iter_(iter),
|
||||
sequence_(s),
|
||||
direction_(kForward),
|
||||
valid_(false) {
|
||||
valid_(false),
|
||||
rnd_(seed),
|
||||
bytes_counter_(RandomPeriod()) {
|
||||
}
|
||||
virtual ~DBIter() {
|
||||
delete iter_;
|
||||
@@ -100,8 +103,12 @@ class DBIter: public Iterator {
|
||||
}
|
||||
}
|
||||
|
||||
const std::string* const dbname_;
|
||||
Env* const env_;
|
||||
// Pick next gap with average value of config::kReadBytesPeriod.
|
||||
ssize_t RandomPeriod() {
|
||||
return rnd_.Uniform(2*config::kReadBytesPeriod);
|
||||
}
|
||||
|
||||
DBImpl* db_;
|
||||
const Comparator* const user_comparator_;
|
||||
Iterator* const iter_;
|
||||
SequenceNumber const sequence_;
|
||||
@@ -112,13 +119,23 @@ class DBIter: public Iterator {
|
||||
Direction direction_;
|
||||
bool valid_;
|
||||
|
||||
Random rnd_;
|
||||
ssize_t bytes_counter_;
|
||||
|
||||
// No copying allowed
|
||||
DBIter(const DBIter&);
|
||||
void operator=(const DBIter&);
|
||||
};
|
||||
|
||||
inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
|
||||
if (!ParseInternalKey(iter_->key(), ikey)) {
|
||||
Slice k = iter_->key();
|
||||
ssize_t n = k.size() + iter_->value().size();
|
||||
bytes_counter_ -= n;
|
||||
while (bytes_counter_ < 0) {
|
||||
bytes_counter_ += RandomPeriod();
|
||||
db_->RecordReadSample(k);
|
||||
}
|
||||
if (!ParseInternalKey(k, ikey)) {
|
||||
status_ = Status::Corruption("corrupted internal key in DBIter");
|
||||
return false;
|
||||
} else {
|
||||
@@ -129,6 +146,9 @@ inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
|
||||
void DBIter::Next() {
|
||||
assert(valid_);
|
||||
|
||||
// Temporarily use saved_key_ as storage for key to skip.
|
||||
std::string* skip = &saved_key_;
|
||||
|
||||
if (direction_ == kReverse) { // Switch directions?
|
||||
direction_ = kForward;
|
||||
// iter_ is pointing just before the entries for this->key(),
|
||||
@@ -144,11 +164,13 @@ void DBIter::Next() {
|
||||
saved_key_.clear();
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
// if we did not reverse direction, save the current key as the one which we
|
||||
// must advance past. If we did reverse direction, it's already been set,
|
||||
// so we don't need to do it. This conditional solves upstream issue #200.
|
||||
SaveKey(ExtractUserKey(iter_->key()), skip);
|
||||
}
|
||||
|
||||
// Temporarily use saved_key_ as storage for key to skip.
|
||||
std::string* skip = &saved_key_;
|
||||
SaveKey(ExtractUserKey(iter_->key()), skip);
|
||||
FindNextUserEntry(true, skip);
|
||||
}
|
||||
|
||||
@@ -288,12 +310,12 @@ void DBIter::SeekToLast() {
|
||||
} // anonymous namespace
|
||||
|
||||
Iterator* NewDBIterator(
|
||||
const std::string* dbname,
|
||||
Env* env,
|
||||
DBImpl* db,
|
||||
const Comparator* user_key_comparator,
|
||||
Iterator* internal_iter,
|
||||
const SequenceNumber& sequence) {
|
||||
return new DBIter(dbname, env, user_key_comparator, internal_iter, sequence);
|
||||
SequenceNumber sequence,
|
||||
uint32_t seed) {
|
||||
return new DBIter(db, user_key_comparator, internal_iter, sequence, seed);
|
||||
}
|
||||
|
||||
} // namespace hyperleveldb
|
||||
|
||||
@@ -11,15 +11,17 @@
|
||||
|
||||
namespace hyperleveldb {
|
||||
|
||||
class DBImpl;
|
||||
|
||||
// Return a new iterator that converts internal keys (yielded by
|
||||
// "*internal_iter") that were live at the specified "sequence" number
|
||||
// into appropriate user keys.
|
||||
extern Iterator* NewDBIterator(
|
||||
const std::string* dbname,
|
||||
Env* env,
|
||||
DBImpl* db,
|
||||
const Comparator* user_key_comparator,
|
||||
Iterator* internal_iter,
|
||||
const SequenceNumber& sequence);
|
||||
SequenceNumber sequence,
|
||||
uint32_t seed);
|
||||
|
||||
} // namespace hyperleveldb
|
||||
|
||||
|
||||
125
db/db_test.cc
125
db/db_test.cc
@@ -614,6 +614,47 @@ TEST(DBTest, GetPicksCorrectFile) {
|
||||
} while (ChangeOptions());
|
||||
}
|
||||
|
||||
#if 0
|
||||
TEST(DBTest, GetEncountersEmptyLevel) {
|
||||
do {
|
||||
// Arrange for the following to happen:
|
||||
// * sstable A in level 0
|
||||
// * nothing in level 1
|
||||
// * sstable B in level 2
|
||||
// Then do enough Get() calls to arrange for an automatic compaction
|
||||
// of sstable A. A bug would cause the compaction to be marked as
|
||||
// occuring at level 1 (instead of the correct level 0).
|
||||
|
||||
// Step 1: First place sstables in levels 0 and 2
|
||||
int compaction_count = 0;
|
||||
while (NumTableFilesAtLevel(0) == 0 ||
|
||||
NumTableFilesAtLevel(2) == 0) {
|
||||
ASSERT_LE(compaction_count, 100) << "could not fill levels 0 and 2";
|
||||
compaction_count++;
|
||||
Put("a", "begin");
|
||||
Put("z", "end");
|
||||
dbfull()->TEST_CompactMemTable();
|
||||
}
|
||||
|
||||
// Step 2: clear level 1 if necessary.
|
||||
dbfull()->TEST_CompactRange(1, NULL, NULL);
|
||||
ASSERT_EQ(NumTableFilesAtLevel(0), 1);
|
||||
ASSERT_EQ(NumTableFilesAtLevel(1), 0);
|
||||
ASSERT_EQ(NumTableFilesAtLevel(2), 1);
|
||||
|
||||
// Step 3: read a bunch of times
|
||||
for (int i = 0; i < 1000; i++) {
|
||||
ASSERT_EQ("NOT_FOUND", Get("missing"));
|
||||
}
|
||||
|
||||
// Step 4: Wait for compaction to finish
|
||||
env_->SleepForMicroseconds(1000000);
|
||||
|
||||
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
|
||||
} while (ChangeOptions());
|
||||
}
|
||||
#endif
|
||||
|
||||
TEST(DBTest, IterEmpty) {
|
||||
Iterator* iter = db_->NewIterator(ReadOptions());
|
||||
|
||||
@@ -1793,6 +1834,23 @@ class ModelDB: public DB {
|
||||
return new ModelIter(snapshot_state, false);
|
||||
}
|
||||
}
|
||||
virtual void GetReplayTimestamp(std::string* timestamp) {
|
||||
}
|
||||
virtual void AllowGarbageCollectBeforeTimestamp(const std::string& timestamp) {
|
||||
}
|
||||
virtual bool ValidateTimestamp(const std::string&) {
|
||||
return false;
|
||||
}
|
||||
virtual int CompareTimestamps(const std::string&, const std::string&) {
|
||||
return 0;
|
||||
}
|
||||
virtual Status GetReplayIterator(const std::string& timestamp,
|
||||
ReplayIterator** iter) {
|
||||
*iter = NULL;
|
||||
return Status::OK();
|
||||
}
|
||||
virtual void ReleaseReplayIterator(ReplayIterator* iter) {
|
||||
}
|
||||
virtual const Snapshot* GetSnapshot() {
|
||||
ModelSnapshot* snapshot = new ModelSnapshot;
|
||||
snapshot->map_ = map_;
|
||||
@@ -1828,6 +1886,8 @@ class ModelDB: public DB {
|
||||
}
|
||||
virtual void CompactRange(const Slice* start, const Slice* end) {
|
||||
}
|
||||
virtual Status LiveBackup(const Slice& name) {
|
||||
}
|
||||
|
||||
private:
|
||||
class ModelIter: public Iterator {
|
||||
@@ -1990,6 +2050,71 @@ TEST(DBTest, Randomized) {
|
||||
} while (ChangeOptions());
|
||||
}
|
||||
|
||||
TEST(DBTest, Replay) {
|
||||
std::string ts;
|
||||
db_->GetReplayTimestamp(&ts);
|
||||
ASSERT_OK(Put("key", "v0"));
|
||||
ASSERT_OK(Put("key", "v1"));
|
||||
ASSERT_OK(Put("key", "v2"));
|
||||
ASSERT_OK(Put("key", "v3"));
|
||||
ASSERT_OK(Put("key", "v4"));
|
||||
ASSERT_OK(Put("key", "v5"));
|
||||
ASSERT_OK(Put("key", "v6"));
|
||||
ASSERT_OK(Put("key", "v7"));
|
||||
ASSERT_OK(Put("key", "v8"));
|
||||
ASSERT_OK(Put("key", "v9"));
|
||||
|
||||
// get the iterator
|
||||
ReplayIterator* iter = NULL;
|
||||
ASSERT_OK(db_->GetReplayIterator(ts, &iter));
|
||||
|
||||
// Iterate over what was there to start with
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_TRUE(iter->HasValue());
|
||||
ASSERT_EQ("key", iter->key().ToString());
|
||||
ASSERT_EQ("v9", iter->value().ToString());
|
||||
iter->Next();
|
||||
// The implementation is allowed to return things twice.
|
||||
// This is a case where it will.
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_TRUE(iter->HasValue());
|
||||
ASSERT_EQ("key", iter->key().ToString());
|
||||
ASSERT_EQ("v9", iter->value().ToString());
|
||||
iter->Next();
|
||||
// Now it's no longer valid.
|
||||
ASSERT_TRUE(!iter->Valid());
|
||||
ASSERT_TRUE(!iter->Valid());
|
||||
|
||||
// Add another and iterate some more
|
||||
ASSERT_OK(Put("key", "v10"));
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_TRUE(iter->HasValue());
|
||||
ASSERT_EQ("key", iter->key().ToString());
|
||||
ASSERT_EQ("v10", iter->value().ToString());
|
||||
iter->Next();
|
||||
ASSERT_TRUE(!iter->Valid());
|
||||
|
||||
// Dump the memtable
|
||||
dbfull()->TEST_CompactMemTable();
|
||||
|
||||
// Write into the new MemTable and iterate some more
|
||||
ASSERT_OK(Put("key", "v11"));
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_TRUE(iter->HasValue());
|
||||
ASSERT_EQ("key", iter->key().ToString());
|
||||
ASSERT_EQ("v11", iter->value().ToString());
|
||||
iter->Next();
|
||||
ASSERT_TRUE(!iter->Valid());
|
||||
|
||||
// What does it do on delete?
|
||||
ASSERT_OK(Delete("key"));
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_TRUE(!iter->HasValue());
|
||||
ASSERT_EQ("key", iter->key().ToString());
|
||||
iter->Next();
|
||||
ASSERT_TRUE(!iter->Valid());
|
||||
}
|
||||
|
||||
std::string MakeKey(unsigned int num) {
|
||||
char buf[30];
|
||||
snprintf(buf, sizeof(buf), "%016u", num);
|
||||
|
||||
@@ -40,6 +40,9 @@ static const int kL0_StopWritesTrigger = 12;
|
||||
// space if the same key space is being repeatedly overwritten.
|
||||
static const int kMaxMemCompactLevel = 2;
|
||||
|
||||
// Approximate gap in bytes between samples of data read during iteration.
|
||||
static const int kReadBytesPeriod = 1048576;
|
||||
|
||||
} // namespace config
|
||||
|
||||
class InternalKey;
|
||||
|
||||
@@ -19,6 +19,10 @@ static Slice GetLengthPrefixedSlice(const char* data) {
|
||||
return Slice(p, len);
|
||||
}
|
||||
|
||||
static Slice GetLengthPrefixedSlice(std::pair<uint64_t, const char*> tk) {
|
||||
return GetLengthPrefixedSlice(tk.second);
|
||||
}
|
||||
|
||||
MemTable::MemTable(const InternalKeyComparator& cmp)
|
||||
: comparator_(cmp),
|
||||
refs_(0),
|
||||
@@ -34,11 +38,16 @@ size_t MemTable::ApproximateMemoryUsage() {
|
||||
return arena_.MemoryUsage();
|
||||
}
|
||||
|
||||
int MemTable::KeyComparator::operator()(const char* aptr, const char* bptr)
|
||||
int MemTable::KeyComparator::operator()(TableKey ak, TableKey bk)
|
||||
const {
|
||||
if (ak.first < bk.first) {
|
||||
return -1;
|
||||
} else if (ak.first > bk.first) {
|
||||
return 1;
|
||||
}
|
||||
// Internal keys are encoded as length-prefixed strings.
|
||||
Slice a = GetLengthPrefixedSlice(aptr);
|
||||
Slice b = GetLengthPrefixedSlice(bptr);
|
||||
Slice a = GetLengthPrefixedSlice(ak);
|
||||
Slice b = GetLengthPrefixedSlice(bk);
|
||||
return comparator.Compare(a, b);
|
||||
}
|
||||
|
||||
@@ -54,10 +63,15 @@ static const char* EncodeKey(std::string* scratch, const Slice& target) {
|
||||
|
||||
class MemTableIterator: public Iterator {
|
||||
public:
|
||||
explicit MemTableIterator(MemTable::Table* table) : iter_(table) { }
|
||||
explicit MemTableIterator(MemTable::Table* table,
|
||||
MemTable::KeyComparator* cmp)
|
||||
: iter_(table), comparator_(cmp) { }
|
||||
|
||||
virtual bool Valid() const { return iter_.Valid(); }
|
||||
virtual void Seek(const Slice& k) { iter_.Seek(EncodeKey(&tmp_, k)); }
|
||||
virtual void Seek(const Slice& k) {
|
||||
uint64_t keynum = comparator_->comparator.user_comparator()->KeyNum(Slice(k.data(), k.size() - 8));
|
||||
iter_.Seek(std::make_pair(keynum, EncodeKey(&tmp_, k)));
|
||||
}
|
||||
virtual void SeekToFirst() { iter_.SeekToFirst(); }
|
||||
virtual void SeekToLast() { iter_.SeekToLast(); }
|
||||
virtual void Next() { iter_.Next(); }
|
||||
@@ -72,6 +86,7 @@ class MemTableIterator: public Iterator {
|
||||
|
||||
private:
|
||||
MemTable::Table::Iterator iter_;
|
||||
MemTable::KeyComparator* comparator_;
|
||||
std::string tmp_; // For passing to EncodeKey
|
||||
|
||||
// No copying allowed
|
||||
@@ -80,7 +95,7 @@ class MemTableIterator: public Iterator {
|
||||
};
|
||||
|
||||
Iterator* MemTable::NewIterator() {
|
||||
return new MemTableIterator(&table_);
|
||||
return new MemTableIterator(&table_, &comparator_);
|
||||
}
|
||||
|
||||
void MemTable::Add(SequenceNumber s, ValueType type,
|
||||
@@ -112,18 +127,22 @@ void MemTable::Add(SequenceNumber s, ValueType type,
|
||||
p = EncodeVarint32(p, val_size);
|
||||
memcpy(p, value.data(), val_size);
|
||||
assert((p + val_size) - buf == encoded_len);
|
||||
Table::InsertHint ih(&table_, buf);
|
||||
uint64_t keynum = comparator_.comparator.user_comparator()->KeyNum(key);
|
||||
TableKey tk(keynum, buf);
|
||||
Table::InsertHint ih(&table_, tk);
|
||||
|
||||
{
|
||||
MutexLock l(&mtx_);
|
||||
table_.InsertWithHint(&ih, buf);
|
||||
table_.InsertWithHint(&ih, tk);
|
||||
}
|
||||
}
|
||||
|
||||
bool MemTable::Get(const LookupKey& key, std::string* value, Status* s) {
|
||||
Slice memkey = key.memtable_key();
|
||||
Table::Iterator iter(&table_);
|
||||
iter.Seek(memkey.data());
|
||||
uint64_t keynum = comparator_.comparator.user_comparator()->KeyNum(key.user_key());
|
||||
TableKey tk(keynum, memkey.data());
|
||||
iter.Seek(tk);
|
||||
if (iter.Valid()) {
|
||||
// entry format is:
|
||||
// klength varint32
|
||||
@@ -134,10 +153,11 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s) {
|
||||
// Check that it belongs to same user key. We do not check the
|
||||
// sequence number since the Seek() call above should have skipped
|
||||
// all entries with overly large sequence numbers.
|
||||
const char* entry = iter.key();
|
||||
const char* entry = iter.key().second;
|
||||
uint32_t key_length;
|
||||
const char* key_ptr = GetVarint32Ptr(entry, entry+5, &key_length);
|
||||
if (comparator_.comparator.user_comparator()->Compare(
|
||||
if (iter.key().first == tk.first &&
|
||||
comparator_.comparator.user_comparator()->Compare(
|
||||
Slice(key_ptr, key_length - 8),
|
||||
key.user_key()) == 0) {
|
||||
// Correct user key
|
||||
|
||||
@@ -67,16 +67,17 @@ class MemTable {
|
||||
|
||||
private:
|
||||
~MemTable(); // Private since only Unref() should be used to delete it
|
||||
typedef std::pair<uint64_t, const char*> TableKey;
|
||||
|
||||
struct KeyComparator {
|
||||
const InternalKeyComparator comparator;
|
||||
explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { }
|
||||
int operator()(const char* a, const char* b) const;
|
||||
int operator()(TableKey a, TableKey b) const;
|
||||
};
|
||||
friend class MemTableIterator;
|
||||
friend class MemTableBackwardIterator;
|
||||
|
||||
typedef SkipList<const char*, KeyComparator> Table;
|
||||
typedef SkipList<TableKey, KeyComparator> Table;
|
||||
|
||||
KeyComparator comparator_;
|
||||
int refs_;
|
||||
|
||||
204
db/replay_iterator.cc
Normal file
204
db/replay_iterator.cc
Normal file
@@ -0,0 +1,204 @@
|
||||
// Copyright (c) 2013 The HyperLevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "replay_iterator.h"
|
||||
|
||||
#include "filename.h"
|
||||
#include "db_impl.h"
|
||||
#include "dbformat.h"
|
||||
#include "../hyperleveldb/env.h"
|
||||
#include "../hyperleveldb/iterator.h"
|
||||
#include "../port/port.h"
|
||||
#include "../util/logging.h"
|
||||
#include "../util/mutexlock.h"
|
||||
#include "../util/random.h"
|
||||
|
||||
namespace hyperleveldb {
|
||||
|
||||
ReplayIterator::ReplayIterator() {
|
||||
}
|
||||
|
||||
ReplayIterator::~ReplayIterator() {
|
||||
}
|
||||
|
||||
ReplayState::ReplayState(Iterator* i, SequenceNumber s, SequenceNumber l)
|
||||
: mem_(NULL),
|
||||
iter_(i),
|
||||
seq_start_(s),
|
||||
seq_limit_(l) {
|
||||
}
|
||||
|
||||
ReplayState::ReplayState(MemTable* m, SequenceNumber s)
|
||||
: mem_(m),
|
||||
iter_(NULL),
|
||||
seq_start_(s),
|
||||
seq_limit_(0) {
|
||||
}
|
||||
|
||||
ReplayIteratorImpl::ReplayIteratorImpl(DBImpl* db, port::Mutex* mutex, const Comparator* cmp,
|
||||
Iterator* iter, MemTable* m, SequenceNumber s)
|
||||
: ReplayIterator(),
|
||||
db_(db),
|
||||
mutex_(mutex),
|
||||
user_comparator_(cmp),
|
||||
start_at_(s),
|
||||
valid_(),
|
||||
status_(),
|
||||
has_current_user_key_(false),
|
||||
current_user_key_(),
|
||||
current_user_sequence_(),
|
||||
rs_(iter, s, kMaxSequenceNumber),
|
||||
mems_() {
|
||||
m->Ref();
|
||||
mems_.push_back(ReplayState(m, s));
|
||||
}
|
||||
|
||||
ReplayIteratorImpl::~ReplayIteratorImpl() {
|
||||
}
|
||||
|
||||
bool ReplayIteratorImpl::Valid() {
|
||||
Prime();
|
||||
return valid_;
|
||||
}
|
||||
|
||||
void ReplayIteratorImpl::Next() {
|
||||
rs_.iter_->Next();
|
||||
}
|
||||
|
||||
bool ReplayIteratorImpl::HasValue() {
|
||||
ParsedInternalKey ikey;
|
||||
return ParseKey(&ikey) && ikey.type == kTypeValue;
|
||||
}
|
||||
|
||||
Slice ReplayIteratorImpl::key() const {
|
||||
assert(valid_);
|
||||
return ExtractUserKey(rs_.iter_->key());
|
||||
}
|
||||
|
||||
Slice ReplayIteratorImpl::value() const {
|
||||
assert(valid_);
|
||||
return rs_.iter_->value();
|
||||
}
|
||||
|
||||
Status ReplayIteratorImpl::status() const {
|
||||
if (!status_.ok()) {
|
||||
return status_;
|
||||
} else {
|
||||
return rs_.iter_->status();
|
||||
}
|
||||
}
|
||||
|
||||
void ReplayIteratorImpl::enqueue(MemTable* m, SequenceNumber s) {
|
||||
m->Ref();
|
||||
mems_.push_back(ReplayState(m, s));
|
||||
}
|
||||
|
||||
void ReplayIteratorImpl::cleanup() {
|
||||
mutex_->Unlock();
|
||||
if (rs_.iter_) {
|
||||
delete rs_.iter_;
|
||||
}
|
||||
if (rs_.mem_) {
|
||||
rs_.mem_->Unref();
|
||||
}
|
||||
mutex_->Lock();
|
||||
rs_.iter_ = NULL;
|
||||
rs_.mem_ = NULL;
|
||||
|
||||
while (!mems_.empty()) {
|
||||
MemTable* mem = mems_.front().mem_;
|
||||
Iterator* iter = mems_.front().iter_;
|
||||
mutex_->Unlock();
|
||||
if (iter) {
|
||||
delete iter;
|
||||
}
|
||||
if (mem) {
|
||||
mem->Unref();
|
||||
}
|
||||
mutex_->Lock();
|
||||
mems_.pop_front();
|
||||
}
|
||||
|
||||
delete this;
|
||||
}
|
||||
|
||||
bool ReplayIteratorImpl::ParseKey(ParsedInternalKey* ikey) {
|
||||
return ParseKey(rs_.iter_->key(), ikey);
|
||||
}
|
||||
|
||||
bool ReplayIteratorImpl::ParseKey(const Slice& k, ParsedInternalKey* ikey) {
|
||||
if (!ParseInternalKey(k, ikey)) {
|
||||
status_ = Status::Corruption("corrupted internal key in ReplayIteratorImpl");
|
||||
return false;
|
||||
} else {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
void ReplayIteratorImpl::Prime() {
|
||||
valid_ = false;
|
||||
if (!status_.ok()) {
|
||||
return;
|
||||
}
|
||||
while (true) {
|
||||
assert(rs_.iter_);
|
||||
while (rs_.iter_->Valid()) {
|
||||
ParsedInternalKey ikey;
|
||||
if (!ParseKey(rs_.iter_->key(), &ikey)) {
|
||||
return;
|
||||
}
|
||||
// if we can consider this key, and it's recent enough and of the right
|
||||
// type
|
||||
if ((!has_current_user_key_ ||
|
||||
user_comparator_->Compare(ikey.user_key,
|
||||
Slice(current_user_key_)) != 0 ||
|
||||
ikey.sequence >= current_user_sequence_) &&
|
||||
(ikey.sequence >= rs_.seq_start_ &&
|
||||
(ikey.type == kTypeDeletion || ikey.type == kTypeValue))) {
|
||||
has_current_user_key_ = true;
|
||||
current_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
|
||||
current_user_sequence_ = ikey.sequence;
|
||||
valid_ = true;
|
||||
return;
|
||||
}
|
||||
rs_.iter_->Next();
|
||||
}
|
||||
if (!rs_.iter_->status().ok()) {
|
||||
status_ = rs_.iter_->status();
|
||||
valid_ = false;
|
||||
return;
|
||||
}
|
||||
// we're done with rs_.iter_
|
||||
has_current_user_key_ = false;
|
||||
current_user_key_.assign("", 0);
|
||||
current_user_sequence_ = kMaxSequenceNumber;
|
||||
delete rs_.iter_;
|
||||
rs_.iter_ = NULL;
|
||||
{
|
||||
MutexLock l(mutex_);
|
||||
if (mems_.empty() ||
|
||||
rs_.seq_limit_ < mems_.front().seq_start_) {
|
||||
rs_.seq_start_ = rs_.seq_limit_;
|
||||
} else {
|
||||
if (rs_.mem_) {
|
||||
rs_.mem_->Unref();
|
||||
rs_.mem_ = NULL;
|
||||
}
|
||||
rs_.mem_ = mems_.front().mem_;
|
||||
rs_.seq_start_ = mems_.front().seq_start_;
|
||||
mems_.pop_front();
|
||||
}
|
||||
}
|
||||
rs_.seq_limit_ = db_->LastSequence();
|
||||
rs_.iter_ = rs_.mem_->NewIterator();
|
||||
rs_.iter_->SeekToFirst();
|
||||
assert(rs_.seq_start_ <= rs_.seq_limit_);
|
||||
if (rs_.seq_start_ == rs_.seq_limit_) {
|
||||
valid_ = false;
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace leveldb
|
||||
71
db/replay_iterator.h
Normal file
71
db/replay_iterator.h
Normal file
@@ -0,0 +1,71 @@
|
||||
// Copyright (c) 2013 The HyperLevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifndef STORAGE_HYPERLEVELDB_DB_ROLLING_ITERATOR_H_
|
||||
#define STORAGE_HYPERLEVELDB_DB_ROLLING_ITERATOR_H_
|
||||
|
||||
#include <stdint.h>
|
||||
#include <list>
|
||||
#include "../hyperleveldb/db.h"
|
||||
#include "../hyperleveldb/replay_iterator.h"
|
||||
#include "dbformat.h"
|
||||
#include "memtable.h"
|
||||
|
||||
namespace hyperleveldb {
|
||||
|
||||
class DBImpl;
|
||||
struct ReplayState {
|
||||
ReplayState(Iterator* i, SequenceNumber s, SequenceNumber l);
|
||||
ReplayState(MemTable* m, SequenceNumber s);
|
||||
MemTable* mem_;
|
||||
Iterator* iter_;
|
||||
SequenceNumber seq_start_;
|
||||
SequenceNumber seq_limit_;
|
||||
};
|
||||
|
||||
class ReplayIteratorImpl : public ReplayIterator {
|
||||
public:
|
||||
// Refs the memtable on its own; caller must hold mutex while creating this
|
||||
ReplayIteratorImpl(DBImpl* db, port::Mutex* mutex, const Comparator* cmp,
|
||||
Iterator* iter, MemTable* m, SequenceNumber s);
|
||||
virtual bool Valid();
|
||||
virtual void Next();
|
||||
virtual bool HasValue();
|
||||
virtual Slice key() const;
|
||||
virtual Slice value() const;
|
||||
virtual Status status() const;
|
||||
|
||||
// extra interface
|
||||
|
||||
// we ref the memtable; caller holds mutex passed into ctor
|
||||
// REQUIRES: caller must hold mutex passed into ctor
|
||||
void enqueue(MemTable* m, SequenceNumber s);
|
||||
|
||||
// REQUIRES: caller must hold mutex passed into ctor
|
||||
void cleanup(); // calls delete this;
|
||||
|
||||
private:
|
||||
virtual ~ReplayIteratorImpl();
|
||||
bool ParseKey(ParsedInternalKey* ikey);
|
||||
bool ParseKey(const Slice& k, ParsedInternalKey* ikey);
|
||||
void Prime();
|
||||
|
||||
DBImpl* const db_;
|
||||
port::Mutex* mutex_;
|
||||
const Comparator* const user_comparator_;
|
||||
SequenceNumber const start_at_;
|
||||
bool valid_;
|
||||
Status status_;
|
||||
|
||||
bool has_current_user_key_;
|
||||
std::string current_user_key_;
|
||||
SequenceNumber current_user_sequence_;
|
||||
|
||||
ReplayState rs_;
|
||||
std::list<ReplayState> mems_;
|
||||
};
|
||||
|
||||
} // namespace leveldb
|
||||
|
||||
#endif // STORAGE_LEVELDB_DB_ROLLING_ITERATOR_H_
|
||||
@@ -351,7 +351,7 @@ template<typename Key, class Comparator>
|
||||
SkipList<Key,Comparator>::SkipList(Comparator cmp, Arena* arena)
|
||||
: compare_(cmp),
|
||||
arena_(arena),
|
||||
head_(NewNode(0 /* any key will do */, kMaxHeight)),
|
||||
head_(NewNode(Key(), kMaxHeight)),
|
||||
max_height_(reinterpret_cast<void*>(1)),
|
||||
rnd_(0xdeadbeef) {
|
||||
for (int i = 0; i < kMaxHeight; i++) {
|
||||
|
||||
@@ -176,30 +176,41 @@ bool SomeFileOverlapsRange(
|
||||
// is the largest key that occurs in the file, and value() is an
|
||||
// 16-byte value containing the file number and file size, both
|
||||
// encoded using EncodeFixed64.
|
||||
//
|
||||
// If num != 0, then do not call SeekToLast, Prev
|
||||
class Version::LevelFileNumIterator : public Iterator {
|
||||
public:
|
||||
LevelFileNumIterator(const InternalKeyComparator& icmp,
|
||||
const std::vector<FileMetaData*>* flist)
|
||||
const std::vector<FileMetaData*>* flist,
|
||||
uint64_t num)
|
||||
: icmp_(icmp),
|
||||
flist_(flist),
|
||||
index_(flist->size()) { // Marks as invalid
|
||||
index_(flist->size()), // Marks as invalid
|
||||
number_(num) {
|
||||
}
|
||||
virtual bool Valid() const {
|
||||
return index_ < flist_->size();
|
||||
}
|
||||
virtual void Seek(const Slice& target) {
|
||||
index_ = FindFile(icmp_, *flist_, target);
|
||||
Bump();
|
||||
}
|
||||
virtual void SeekToFirst() {
|
||||
index_ = 0;
|
||||
Bump();
|
||||
}
|
||||
virtual void SeekToFirst() { index_ = 0; }
|
||||
virtual void SeekToLast() {
|
||||
assert(number_ == 0);
|
||||
index_ = flist_->empty() ? 0 : flist_->size() - 1;
|
||||
}
|
||||
virtual void Next() {
|
||||
assert(Valid());
|
||||
index_++;
|
||||
Bump();
|
||||
}
|
||||
virtual void Prev() {
|
||||
assert(Valid());
|
||||
assert(number_ == 0);
|
||||
if (index_ == 0) {
|
||||
index_ = flist_->size(); // Marks as invalid
|
||||
} else {
|
||||
@@ -218,9 +229,16 @@ class Version::LevelFileNumIterator : public Iterator {
|
||||
}
|
||||
virtual Status status() const { return Status::OK(); }
|
||||
private:
|
||||
void Bump() {
|
||||
while (index_ < flist_->size() &&
|
||||
(*flist_)[index_]->number < number_) {
|
||||
++index_;
|
||||
}
|
||||
}
|
||||
const InternalKeyComparator icmp_;
|
||||
const std::vector<FileMetaData*>* const flist_;
|
||||
uint32_t index_;
|
||||
uint64_t number_;
|
||||
|
||||
// Backing store for value(). Holds the file number and size.
|
||||
mutable char value_buf_[16];
|
||||
@@ -241,14 +259,19 @@ static Iterator* GetFileIterator(void* arg,
|
||||
}
|
||||
|
||||
Iterator* Version::NewConcatenatingIterator(const ReadOptions& options,
|
||||
int level) const {
|
||||
int level, uint64_t num) const {
|
||||
return NewTwoLevelIterator(
|
||||
new LevelFileNumIterator(vset_->icmp_, &files_[level]),
|
||||
new LevelFileNumIterator(vset_->icmp_, &files_[level], num),
|
||||
&GetFileIterator, vset_->table_cache_, options);
|
||||
}
|
||||
|
||||
void Version::AddIterators(const ReadOptions& options,
|
||||
std::vector<Iterator*>* iters) {
|
||||
return AddSomeIterators(options, 0, iters);
|
||||
}
|
||||
|
||||
void Version::AddSomeIterators(const ReadOptions& options, uint64_t num,
|
||||
std::vector<Iterator*>* iters) {
|
||||
// Merge all level zero files together since they may overlap
|
||||
for (size_t i = 0; i < files_[0].size(); i++) {
|
||||
iters->push_back(
|
||||
@@ -261,7 +284,7 @@ void Version::AddIterators(const ReadOptions& options,
|
||||
// lazily.
|
||||
for (int level = 1; level < config::kNumLevels; level++) {
|
||||
if (!files_[level].empty()) {
|
||||
iters->push_back(NewConcatenatingIterator(options, level));
|
||||
iters->push_back(NewConcatenatingIterator(options, level, num));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -300,6 +323,51 @@ static bool NewestFirst(FileMetaData* a, FileMetaData* b) {
|
||||
return a->number > b->number;
|
||||
}
|
||||
|
||||
void Version::ForEachOverlapping(Slice user_key, Slice internal_key,
|
||||
void* arg,
|
||||
bool (*func)(void*, int, FileMetaData*)) {
|
||||
// TODO(sanjay): Change Version::Get() to use this function.
|
||||
const Comparator* ucmp = vset_->icmp_.user_comparator();
|
||||
|
||||
// Search level-0 in order from newest to oldest.
|
||||
std::vector<FileMetaData*> tmp;
|
||||
tmp.reserve(files_[0].size());
|
||||
for (uint32_t i = 0; i < files_[0].size(); i++) {
|
||||
FileMetaData* f = files_[0][i];
|
||||
if (ucmp->Compare(user_key, f->smallest.user_key()) >= 0 &&
|
||||
ucmp->Compare(user_key, f->largest.user_key()) <= 0) {
|
||||
tmp.push_back(f);
|
||||
}
|
||||
}
|
||||
if (!tmp.empty()) {
|
||||
std::sort(tmp.begin(), tmp.end(), NewestFirst);
|
||||
for (uint32_t i = 0; i < tmp.size(); i++) {
|
||||
if (!(*func)(arg, 0, tmp[i])) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Search other levels.
|
||||
for (int level = 1; level < config::kNumLevels; level++) {
|
||||
size_t num_files = files_[level].size();
|
||||
if (num_files == 0) continue;
|
||||
|
||||
// Binary search to find earliest index whose largest key >= internal_key.
|
||||
uint32_t index = FindFile(vset_->icmp_, files_[level], internal_key);
|
||||
if (index < num_files) {
|
||||
FileMetaData* f = files_[level][index];
|
||||
if (ucmp->Compare(user_key, f->smallest.user_key()) < 0) {
|
||||
// All of "f" is past any data for user_key
|
||||
} else {
|
||||
if (!(*func)(arg, level, f)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Status Version::Get(const ReadOptions& options,
|
||||
const LookupKey& k,
|
||||
std::string* value,
|
||||
@@ -399,6 +467,57 @@ Status Version::Get(const ReadOptions& options,
|
||||
return Status::NotFound(Slice()); // Use an empty error message for speed
|
||||
}
|
||||
|
||||
bool Version::UpdateStats(const GetStats& stats) {
|
||||
FileMetaData* f = stats.seek_file;
|
||||
if (f != NULL) {
|
||||
f->allowed_seeks--;
|
||||
if (f->allowed_seeks <= 0 && file_to_compact_ == NULL) {
|
||||
file_to_compact_ = f;
|
||||
file_to_compact_level_ = stats.seek_file_level;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool Version::RecordReadSample(Slice internal_key) {
|
||||
ParsedInternalKey ikey;
|
||||
if (!ParseInternalKey(internal_key, &ikey)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
struct State {
|
||||
GetStats stats; // Holds first matching file
|
||||
int matches;
|
||||
|
||||
static bool Match(void* arg, int level, FileMetaData* f) {
|
||||
State* state = reinterpret_cast<State*>(arg);
|
||||
state->matches++;
|
||||
if (state->matches == 1) {
|
||||
// Remember first match.
|
||||
state->stats.seek_file = f;
|
||||
state->stats.seek_file_level = level;
|
||||
}
|
||||
// We can stop iterating once we have a second match.
|
||||
return state->matches < 2;
|
||||
}
|
||||
};
|
||||
|
||||
State state;
|
||||
state.matches = 0;
|
||||
ForEachOverlapping(ikey.user_key, internal_key, &state, &State::Match);
|
||||
|
||||
// Must have at least two matches since we want to merge across
|
||||
// files. But what if we have a single file that contains many
|
||||
// overwrites and deletions? Should we have another mechanism for
|
||||
// finding such files?
|
||||
if (state.matches >= 2) {
|
||||
// 1MB cost is about 1 seek (see comment in Builder::Apply).
|
||||
return UpdateStats(state.stats);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void Version::Ref() {
|
||||
++refs_;
|
||||
}
|
||||
@@ -447,6 +566,8 @@ void Version::GetOverlappingInputs(
|
||||
const InternalKey* begin,
|
||||
const InternalKey* end,
|
||||
std::vector<FileMetaData*>* inputs) {
|
||||
assert(level >= 0);
|
||||
assert(level < config::kNumLevels);
|
||||
inputs->clear();
|
||||
Slice user_begin, user_end;
|
||||
if (begin != NULL) {
|
||||
@@ -1180,7 +1301,7 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) {
|
||||
} else {
|
||||
// Create concatenating iterator for the files from this level
|
||||
list[num++] = NewTwoLevelIterator(
|
||||
new Version::LevelFileNumIterator(icmp_, &c->inputs_[which]),
|
||||
new Version::LevelFileNumIterator(icmp_, &c->inputs_[which], 0),
|
||||
&GetFileIterator, table_cache_, options);
|
||||
}
|
||||
}
|
||||
@@ -1199,20 +1320,21 @@ struct CompactionBoundary {
|
||||
};
|
||||
|
||||
struct CmpByRange {
|
||||
CmpByRange(const Comparator* cmp) : cmp_(cmp) {}
|
||||
CmpByRange(const InternalKeyComparator* cmp) : cmp_(cmp) {}
|
||||
bool operator () (const FileMetaData* lhs, const FileMetaData* rhs) {
|
||||
int smallest = cmp_->Compare(lhs->smallest.user_key(), rhs->smallest.user_key());
|
||||
int smallest = cmp_->Compare(lhs->smallest, rhs->smallest);
|
||||
if (smallest == 0) {
|
||||
return cmp_->Compare(lhs->largest.user_key(), rhs->largest.user_key()) < 0;
|
||||
return cmp_->Compare(lhs->largest, rhs->largest) < 0;
|
||||
}
|
||||
return smallest < 0;
|
||||
}
|
||||
private:
|
||||
const Comparator* cmp_;
|
||||
const InternalKeyComparator* cmp_;
|
||||
};
|
||||
|
||||
// Stores the compaction boundaries between level and level + 1
|
||||
void VersionSet::GetCompactionBoundaries(int level,
|
||||
void VersionSet::GetCompactionBoundaries(Version* v,
|
||||
int level,
|
||||
std::vector<FileMetaData*>* LA,
|
||||
std::vector<FileMetaData*>* LB,
|
||||
std::vector<uint64_t>* LA_sizes,
|
||||
@@ -1220,12 +1342,12 @@ void VersionSet::GetCompactionBoundaries(int level,
|
||||
std::vector<CompactionBoundary>* boundaries)
|
||||
{
|
||||
const Comparator* user_cmp = icmp_.user_comparator();
|
||||
*LA = current_->files_[level + 0];
|
||||
*LB = current_->files_[level + 1];
|
||||
*LA = v->files_[level + 0];
|
||||
*LB = v->files_[level + 1];
|
||||
*LA_sizes = std::vector<uint64_t>(LA->size() + 1, 0);
|
||||
*LB_sizes = std::vector<uint64_t>(LB->size() + 1, 0);
|
||||
std::sort(LA->begin(), LA->end(), CmpByRange(user_cmp));
|
||||
std::sort(LB->begin(), LB->end(), CmpByRange(user_cmp));
|
||||
std::sort(LA->begin(), LA->end(), CmpByRange(&icmp_));
|
||||
std::sort(LB->begin(), LB->end(), CmpByRange(&icmp_));
|
||||
boundaries->resize(LA->size());
|
||||
|
||||
// compute sizes
|
||||
@@ -1259,7 +1381,7 @@ void VersionSet::GetCompactionBoundaries(int level,
|
||||
}
|
||||
}
|
||||
|
||||
int VersionSet::PickCompactionLevel(bool* locked) {
|
||||
int VersionSet::PickCompactionLevel(bool* locked, bool seek_driven) const {
|
||||
// Find an unlocked level has score >= 1 where level + 1 has score < 1.
|
||||
int level = config::kNumLevels;
|
||||
for (int i = 0; i + 1 < config::kNumLevels; ++i) {
|
||||
@@ -1267,11 +1389,19 @@ int VersionSet::PickCompactionLevel(bool* locked) {
|
||||
continue;
|
||||
}
|
||||
if (current_->compaction_scores_[i + 0] >= 1.0 &&
|
||||
current_->compaction_scores_[i + 1] < 1.0) {
|
||||
(i + 2 >= config::kNumLevels ||
|
||||
current_->compaction_scores_[i + 1] < 1.0)) {
|
||||
level = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (seek_driven &&
|
||||
level == config::kNumLevels &&
|
||||
current_->file_to_compact_ != NULL &&
|
||||
!locked[current_->file_to_compact_level_ + 0] &&
|
||||
!locked[current_->file_to_compact_level_ + 1]) {
|
||||
level = current_->file_to_compact_level_;
|
||||
}
|
||||
return level;
|
||||
}
|
||||
|
||||
@@ -1279,16 +1409,16 @@ static bool OldestFirst(FileMetaData* a, FileMetaData* b) {
|
||||
return a->number < b->number;
|
||||
}
|
||||
|
||||
Compaction* VersionSet::PickCompaction(int level) {
|
||||
Compaction* VersionSet::PickCompaction(Version* v, int level) {
|
||||
assert(0 <= level && level < config::kNumLevels);
|
||||
bool trivial = false;
|
||||
|
||||
if (current_->files_[level].empty()) {
|
||||
if (v->files_[level].empty()) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
Compaction* c = new Compaction(level);
|
||||
c->input_version_ = current_;
|
||||
c->input_version_ = v;
|
||||
c->input_version_->Ref();
|
||||
|
||||
if (level > 0) {
|
||||
@@ -1297,7 +1427,7 @@ Compaction* VersionSet::PickCompaction(int level) {
|
||||
std::vector<uint64_t> LA_sizes;
|
||||
std::vector<uint64_t> LB_sizes;
|
||||
std::vector<CompactionBoundary> boundaries;
|
||||
GetCompactionBoundaries(level, &LA, &LB, &LA_sizes, &LB_sizes, &boundaries);
|
||||
GetCompactionBoundaries(v, level, &LA, &LB, &LA_sizes, &LB_sizes, &boundaries);
|
||||
|
||||
// find the best set of files: maximize the ratio of sizeof(LA)/sizeof(LB)
|
||||
// while keeping sizeof(LA)+sizeof(LB) < some threshold. If there's a tie
|
||||
@@ -1358,6 +1488,9 @@ Compaction* VersionSet::PickCompaction(int level) {
|
||||
c->inputs_[1].push_back(LB[i]);
|
||||
}
|
||||
c->SetRatio(best_ratio);
|
||||
// pick the file to compact in this level
|
||||
} else if (v->file_to_compact_ != NULL) {
|
||||
c->inputs_[0].push_back(v->file_to_compact_);
|
||||
// otherwise just pick the file with least overlap
|
||||
} else {
|
||||
assert(level >= 0);
|
||||
@@ -1379,15 +1512,13 @@ Compaction* VersionSet::PickCompaction(int level) {
|
||||
}
|
||||
}
|
||||
} else {
|
||||
std::vector<FileMetaData*> tmp(current_->files_[0]);
|
||||
std::vector<FileMetaData*> tmp(v->files_[0]);
|
||||
std::sort(tmp.begin(), tmp.end(), OldestFirst);
|
||||
for (size_t i = 0; i < tmp.size() && c->inputs_[0].size() < 32; ++i) {
|
||||
c->inputs_[0].push_back(tmp[i]);
|
||||
}
|
||||
}
|
||||
|
||||
assert(!c->inputs_[0].empty());
|
||||
|
||||
if (!trivial) {
|
||||
SetupOtherInputs(c);
|
||||
}
|
||||
@@ -1398,13 +1529,13 @@ void VersionSet::SetupOtherInputs(Compaction* c) {
|
||||
const int level = c->level();
|
||||
InternalKey smallest, largest;
|
||||
GetRange(c->inputs_[0], &smallest, &largest);
|
||||
current_->GetOverlappingInputs(level+1, &smallest, &largest, &c->inputs_[1]);
|
||||
c->input_version_->GetOverlappingInputs(level+1, &smallest, &largest, &c->inputs_[1]);
|
||||
|
||||
// Update the place where we will do the next compaction for this level.
|
||||
// We update this immediately instead of waiting for the VersionEdit
|
||||
// to be applied so that if the compaction fails, we will try a different
|
||||
// key range next time.
|
||||
compact_pointer_[level] = largest.Encode().ToString();
|
||||
//compact_pointer_[level] = largest.Encode().ToString();
|
||||
c->edit_.SetCompactPointer(level, largest);
|
||||
}
|
||||
|
||||
|
||||
@@ -64,6 +64,12 @@ class Version {
|
||||
// REQUIRES: This version has been saved (see VersionSet::SaveTo)
|
||||
void AddIterators(const ReadOptions&, std::vector<Iterator*>* iters);
|
||||
|
||||
// Append to *iters a sequence of iterators that will
|
||||
// yield a subset of the contents of this Version when merged together.
|
||||
// Yields only files with number greater or equal to num
|
||||
// REQUIRES: This version has been saved (see VersionSet::SaveTo)
|
||||
void AddSomeIterators(const ReadOptions&, uint64_t num, std::vector<Iterator*>* iters);
|
||||
|
||||
// Lookup the value for key. If found, store it in *val and
|
||||
// return OK. Else return a non-OK status. Fills *stats.
|
||||
// REQUIRES: lock is not held
|
||||
@@ -74,6 +80,17 @@ class Version {
|
||||
Status Get(const ReadOptions&, const LookupKey& key, std::string* val,
|
||||
GetStats* stats);
|
||||
|
||||
// Adds "stats" into the current state. Returns true if a new
|
||||
// compaction may need to be triggered, false otherwise.
|
||||
// REQUIRES: lock is held
|
||||
bool UpdateStats(const GetStats& stats);
|
||||
|
||||
// Record a sample of bytes read at the specified internal key.
|
||||
// Samples are taken approximately once every config::kReadBytesPeriod
|
||||
// bytes. Returns true if a new compaction may need to be triggered.
|
||||
// REQUIRES: lock is held
|
||||
bool RecordReadSample(Slice key);
|
||||
|
||||
// Reference count management (so Versions do not disappear out from
|
||||
// under live iterators)
|
||||
void Ref();
|
||||
@@ -108,7 +125,16 @@ class Version {
|
||||
friend class VersionSet;
|
||||
|
||||
class LevelFileNumIterator;
|
||||
Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const;
|
||||
Iterator* NewConcatenatingIterator(const ReadOptions&, int level, uint64_t num) const;
|
||||
|
||||
// Call func(arg, level, f) for every file that overlaps user_key in
|
||||
// order from newest to oldest. If an invocation of func returns
|
||||
// false, makes no more calls.
|
||||
//
|
||||
// REQUIRES: user portion of internal_key == user_key.
|
||||
void ForEachOverlapping(Slice user_key, Slice internal_key,
|
||||
void* arg,
|
||||
bool (*func)(void*, int, FileMetaData*));
|
||||
|
||||
VersionSet* vset_; // VersionSet to which this Version belongs
|
||||
Version* next_; // Next version in linked list
|
||||
@@ -118,13 +144,19 @@ class Version {
|
||||
// List of files per level
|
||||
std::vector<FileMetaData*> files_[config::kNumLevels];
|
||||
|
||||
// Next file to compact based on seek stats.
|
||||
FileMetaData* file_to_compact_;
|
||||
int file_to_compact_level_;
|
||||
|
||||
// Level that should be compacted next and its compaction score.
|
||||
// Score < 1 means compaction is not strictly needed. These fields
|
||||
// are initialized by Finalize().
|
||||
double compaction_scores_[config::kNumLevels];
|
||||
|
||||
explicit Version(VersionSet* vset)
|
||||
: vset_(vset), next_(this), prev_(this), refs_(0) {
|
||||
: vset_(vset), next_(this), prev_(this), refs_(0),
|
||||
file_to_compact_(NULL),
|
||||
file_to_compact_level_(-1) {
|
||||
for (int i = 0; i < config::kNumLevels; ++i) {
|
||||
compaction_scores_[i] = -1;
|
||||
}
|
||||
@@ -202,13 +234,13 @@ class VersionSet {
|
||||
// Pick level for a new compaction.
|
||||
// Returns kNumLevels if there is no compaction to be done.
|
||||
// Otherwise returns the lowest unlocked level that may compact upwards.
|
||||
int PickCompactionLevel(bool* locked);
|
||||
int PickCompactionLevel(bool* locked, bool seek_driven) const;
|
||||
|
||||
// Pick inputs for a new compaction at the specified level.
|
||||
// Returns NULL if there is no compaction to be done.
|
||||
// Otherwise returns a pointer to a heap-allocated object that
|
||||
// describes the compaction. Caller should delete the result.
|
||||
Compaction* PickCompaction(int level);
|
||||
Compaction* PickCompaction(Version* v, int level);
|
||||
|
||||
// Return a compaction object for compacting the range [begin,end] in
|
||||
// the specified level. Returns NULL if there is nothing in that
|
||||
@@ -228,17 +260,8 @@ class VersionSet {
|
||||
Iterator* MakeInputIterator(Compaction* c);
|
||||
|
||||
// Returns true iff some level needs a compaction.
|
||||
bool NeedsCompaction(bool* levels) const {
|
||||
Version* v = current_;
|
||||
for (int i = 0; i + 1 < config::kNumLevels; ++i) {
|
||||
if (!levels[i] && !levels[i + 1] &&
|
||||
v->compaction_scores_[i] >= 1.0 &&
|
||||
(i + 2 == config::kNumLevels ||
|
||||
v->compaction_scores_[i + 1] < 1.0)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
bool NeedsCompaction(bool* levels, bool seek_driven) const {
|
||||
return PickCompactionLevel(levels, seek_driven) != config::kNumLevels;
|
||||
}
|
||||
|
||||
// Add all files listed in any live version to *live.
|
||||
@@ -273,7 +296,8 @@ class VersionSet {
|
||||
InternalKey* smallest,
|
||||
InternalKey* largest);
|
||||
|
||||
void GetCompactionBoundaries(int level,
|
||||
void GetCompactionBoundaries(Version* version,
|
||||
int level,
|
||||
std::vector<FileMetaData*>* LA,
|
||||
std::vector<FileMetaData*>* LB,
|
||||
std::vector<uint64_t>* LA_sizes,
|
||||
|
||||
41
hyperleveldb.upack.in
Normal file
41
hyperleveldb.upack.in
Normal file
@@ -0,0 +1,41 @@
|
||||
package hyperleveldb
|
||||
| source="hyperleveldb"
|
||||
| debian name="libhyperleveldb0"
|
||||
| version="@VERSION@"
|
||||
| release="1"
|
||||
| license="BSD"
|
||||
| copyright="2011-2013 The LevelDB Authors"
|
||||
| homepage="http://hyperdex.org"
|
||||
| tarball="http://hyperdex.org/src/hyperleveldb-{version}.tar.gz"
|
||||
| debian section="libs"
|
||||
| configure="--disable-static"
|
||||
| summary="A fast key-value storage library"
|
||||
+ {libdir}/libhyperleveldb.so.0
|
||||
+ {libdir}/libhyperleveldb.so.0.0.0
|
||||
'''LevelDB is a fast key-value storage library written at Google that provides
|
||||
an ordered mapping from string keys to string values.'''
|
||||
|
||||
subpackage hyperleveldb-devel
|
||||
| debian name="libhyperleveldb-dev"
|
||||
| debian section="libdevel"
|
||||
| debian requires="libhyperleveldb0{self}"
|
||||
| fedora requires="hyperleveldb{self}"
|
||||
| summary="A fast key-value storage library (development files)"
|
||||
+ {includedir}/hyperleveldb/cache.h
|
||||
+ {includedir}/hyperleveldb/c.h
|
||||
+ {includedir}/hyperleveldb/comparator.h
|
||||
+ {includedir}/hyperleveldb/db.h
|
||||
+ {includedir}/hyperleveldb/env.h
|
||||
+ {includedir}/hyperleveldb/filter_policy.h
|
||||
+ {includedir}/hyperleveldb/iterator.h
|
||||
+ {includedir}/hyperleveldb/options.h
|
||||
+ {includedir}/hyperleveldb/replay_iterator.h
|
||||
+ {includedir}/hyperleveldb/slice.h
|
||||
+ {includedir}/hyperleveldb/status.h
|
||||
+ {includedir}/hyperleveldb/table_builder.h
|
||||
+ {includedir}/hyperleveldb/table.h
|
||||
+ {includedir}/hyperleveldb/write_batch.h
|
||||
+ {libdir}/libhyperleveldb.so
|
||||
+ {libdir}/pkgconfig/libhyperleveldb.pc
|
||||
|
||||
exclude {libdir}/libhyperleveldb.la
|
||||
@@ -5,6 +5,7 @@
|
||||
#ifndef STORAGE_HYPERLEVELDB_INCLUDE_COMPARATOR_H_
|
||||
#define STORAGE_HYPERLEVELDB_INCLUDE_COMPARATOR_H_
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string>
|
||||
|
||||
namespace hyperleveldb {
|
||||
@@ -51,6 +52,9 @@ class Comparator {
|
||||
// Simple comparator implementations may return with *key unchanged,
|
||||
// i.e., an implementation of this method that does nothing is correct.
|
||||
virtual void FindShortSuccessor(std::string* key) const = 0;
|
||||
|
||||
// If unsure, return 0;
|
||||
virtual uint64_t KeyNum(const Slice& key) const;
|
||||
};
|
||||
|
||||
// Return a builtin comparator that uses lexicographic byte-wise
|
||||
|
||||
@@ -7,14 +7,16 @@
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "iterator.h"
|
||||
#include "options.h"
|
||||
#include "replay_iterator.h"
|
||||
|
||||
namespace hyperleveldb {
|
||||
|
||||
// Update Makefile if you change these
|
||||
static const int kMajorVersion = 1;
|
||||
static const int kMinorVersion = 11;
|
||||
static const int kMinorVersion = 13;
|
||||
|
||||
struct Options;
|
||||
struct ReadOptions;
|
||||
@@ -140,6 +142,36 @@ class DB {
|
||||
// db->CompactRange(NULL, NULL);
|
||||
virtual void CompactRange(const Slice* begin, const Slice* end) = 0;
|
||||
|
||||
// Create a live backup of a live LevelDB instance.
|
||||
// The backup is stored in a directory named "backup-<name>" under the top
|
||||
// level of the open LevelDB database. The implementation is permitted, and
|
||||
// even encouraged, to improve the performance of this call through
|
||||
// hard-links.
|
||||
virtual Status LiveBackup(const Slice& name) = 0;
|
||||
|
||||
// Return an opaque timestamp that identifies the current point in time of the
|
||||
// database. This timestamp may be subsequently presented to the
|
||||
// NewReplayIterator method to create a ReplayIterator.
|
||||
virtual void GetReplayTimestamp(std::string* timestamp) = 0;
|
||||
|
||||
// Set the lower bound for manual garbage collection. This method only takes
|
||||
// effect when Options.manual_garbage_collection is true.
|
||||
virtual void AllowGarbageCollectBeforeTimestamp(const std::string& timestamp) = 0;
|
||||
|
||||
// Validate the timestamp
|
||||
virtual bool ValidateTimestamp(const std::string& timestamp) = 0;
|
||||
|
||||
// Compare two timestamps and return -1, 0, 1 for lt, eq, gt
|
||||
virtual int CompareTimestamps(const std::string& lhs, const std::string& rhs) = 0;
|
||||
|
||||
// Return a ReplayIterator that returns every write operation performed after
|
||||
// the timestamp.
|
||||
virtual Status GetReplayIterator(const std::string& timestamp,
|
||||
ReplayIterator** iter) = 0;
|
||||
|
||||
// Release a previously allocated replay iterator.
|
||||
virtual void ReleaseReplayIterator(ReplayIterator* iter) = 0;
|
||||
|
||||
private:
|
||||
// No copying allowed
|
||||
DB(const DB&);
|
||||
|
||||
@@ -94,6 +94,15 @@ class Env {
|
||||
virtual Status RenameFile(const std::string& src,
|
||||
const std::string& target) = 0;
|
||||
|
||||
// Copy file src to target.
|
||||
virtual Status CopyFile(const std::string& src,
|
||||
const std::string& target) = 0;
|
||||
|
||||
// Link file src to target.
|
||||
virtual Status LinkFile(const std::string& src,
|
||||
const std::string& target) = 0;
|
||||
|
||||
|
||||
// Lock the specified file. Used to prevent concurrent access to
|
||||
// the same db by multiple processes. On failure, stores NULL in
|
||||
// *lock and returns non-OK.
|
||||
@@ -306,6 +315,12 @@ class EnvWrapper : public Env {
|
||||
Status RenameFile(const std::string& s, const std::string& t) {
|
||||
return target_->RenameFile(s, t);
|
||||
}
|
||||
Status CopyFile(const std::string& s, const std::string& t) {
|
||||
return target_->CopyFile(s, t);
|
||||
}
|
||||
Status LinkFile(const std::string& s, const std::string& t) {
|
||||
return target_->LinkFile(s, t);
|
||||
}
|
||||
Status LockFile(const std::string& f, FileLock** l) {
|
||||
return target_->LockFile(f, l);
|
||||
}
|
||||
|
||||
@@ -135,6 +135,17 @@ struct Options {
|
||||
// Default: NULL
|
||||
const FilterPolicy* filter_policy;
|
||||
|
||||
// Is the database used with the Replay mechanism? If yes, the lower bound on
|
||||
// values to compact is (somewhat) left up to the application; if no, then
|
||||
// LevelDB functions as usual, and uses snapshots to determine the lower
|
||||
// bound. HyperLevelDB will always maintain the integrity of snapshots, so
|
||||
// the application merely has the option to hold data as if it's holding a
|
||||
// snapshot. This just prevents compaction from grabbing data before the app
|
||||
// can get a snapshot.
|
||||
//
|
||||
// Default: false/no.
|
||||
bool manual_garbage_collection;
|
||||
|
||||
// Create an Options object with default values for all fields.
|
||||
Options();
|
||||
};
|
||||
|
||||
57
hyperleveldb/replay_iterator.h
Normal file
57
hyperleveldb/replay_iterator.h
Normal file
@@ -0,0 +1,57 @@
|
||||
// Copyright (c) 2013 The HyperLevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifndef STORAGE_HYPERLEVELDB_INCLUDE_REPLAY_ITERATOR_H_
|
||||
#define STORAGE_HYPERLEVELDB_INCLUDE_REPLAY_ITERATOR_H_
|
||||
|
||||
#include "slice.h"
|
||||
#include "status.h"
|
||||
|
||||
namespace hyperleveldb {
|
||||
|
||||
class ReplayIterator {
|
||||
public:
|
||||
ReplayIterator();
|
||||
|
||||
// An iterator is either positioned at a deleted key, present key/value pair,
|
||||
// or not valid. This method returns true iff the iterator is valid.
|
||||
virtual bool Valid() = 0;
|
||||
|
||||
// Moves to the next entry in the source. After this call, Valid() is
|
||||
// true iff the iterator was not positioned at the last entry in the source.
|
||||
// REQUIRES: Valid()
|
||||
virtual void Next() = 0;
|
||||
|
||||
// Return true if the current entry points to a key-value pair. If this
|
||||
// returns false, it means the current entry is a deleted entry.
|
||||
virtual bool HasValue() = 0;
|
||||
|
||||
// Return the key for the current entry. The underlying storage for
|
||||
// the returned slice is valid only until the next modification of
|
||||
// the iterator.
|
||||
// REQUIRES: Valid()
|
||||
virtual Slice key() const = 0;
|
||||
|
||||
// Return the value for the current entry. The underlying storage for
|
||||
// the returned slice is valid only until the next modification of
|
||||
// the iterator.
|
||||
// REQUIRES: !AtEnd() && !AtStart()
|
||||
virtual Slice value() const = 0;
|
||||
|
||||
// If an error has occurred, return it. Else return an ok status.
|
||||
virtual Status status() const = 0;
|
||||
|
||||
protected:
|
||||
// must be released by giving it back to the DB
|
||||
virtual ~ReplayIterator();
|
||||
|
||||
private:
|
||||
// No copying allowed
|
||||
ReplayIterator(const ReplayIterator&);
|
||||
void operator=(const ReplayIterator&);
|
||||
};
|
||||
|
||||
} // namespace leveldb
|
||||
|
||||
#endif // STORAGE_LEVELDB_INCLUDE_REPLAY_ITERATOR_H_
|
||||
@@ -1,3 +1,7 @@
|
||||
// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
// Test for issue 178: a manual compaction causes deleted data to reappear.
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
|
||||
70
issues/issue200_test.cc
Normal file
70
issues/issue200_test.cc
Normal file
@@ -0,0 +1,70 @@
|
||||
// Test for issue 200: iterator reversal brings in keys newer than snapshots
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "hyperleveldb/db.h"
|
||||
#include "hyperleveldb/write_batch.h"
|
||||
#include "util/testharness.h"
|
||||
|
||||
namespace {
|
||||
|
||||
class Issue200 { };
|
||||
|
||||
TEST(Issue200, Test) {
|
||||
// Get rid of any state from an old run.
|
||||
std::string dbpath = leveldb::test::TmpDir() + "/leveldb_200_iterator_test";
|
||||
DestroyDB(dbpath, leveldb::Options());
|
||||
|
||||
// Open database.
|
||||
leveldb::DB* db;
|
||||
leveldb::Options db_options;
|
||||
db_options.create_if_missing = true;
|
||||
db_options.compression = leveldb::kNoCompression;
|
||||
ASSERT_OK(leveldb::DB::Open(db_options, dbpath, &db));
|
||||
|
||||
leveldb::WriteOptions write_options;
|
||||
db->Put(write_options, leveldb::Slice("1", 1), leveldb::Slice("b", 1));
|
||||
db->Put(write_options, leveldb::Slice("2", 1), leveldb::Slice("c", 1));
|
||||
db->Put(write_options, leveldb::Slice("3", 1), leveldb::Slice("d", 1));
|
||||
db->Put(write_options, leveldb::Slice("4", 1), leveldb::Slice("e", 1));
|
||||
db->Put(write_options, leveldb::Slice("5", 1), leveldb::Slice("f", 1));
|
||||
|
||||
const leveldb::Snapshot *snapshot = db->GetSnapshot();
|
||||
|
||||
leveldb::ReadOptions read_options;
|
||||
read_options.snapshot = snapshot;
|
||||
leveldb::Iterator *iter = db->NewIterator(read_options);
|
||||
|
||||
// Commenting out this Put changes the iterator behavior,
|
||||
// gives the expected behavior. This is unexpected because
|
||||
// the iterator is taken on a snapshot that was taken
|
||||
// before the Put
|
||||
db->Put(write_options, leveldb::Slice("25", 2), leveldb::Slice("cd", 2));
|
||||
|
||||
iter->Seek(leveldb::Slice("5", 1));
|
||||
ASSERT_EQ("5", iter->key().ToString());
|
||||
iter->Prev();
|
||||
ASSERT_EQ("4", iter->key().ToString());
|
||||
iter->Prev();
|
||||
ASSERT_EQ("3", iter->key().ToString());
|
||||
|
||||
// At this point the iterator is at "3", I expect the Next() call will
|
||||
// move it to "4". But it stays on "3"
|
||||
iter->Next();
|
||||
ASSERT_EQ("4", iter->key().ToString());
|
||||
iter->Next();
|
||||
ASSERT_EQ("5", iter->key().ToString());
|
||||
|
||||
// close database
|
||||
delete iter;
|
||||
db->ReleaseSnapshot(snapshot);
|
||||
delete db;
|
||||
DestroyDB(dbpath, leveldb::Options());
|
||||
}
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
return leveldb::test::RunAllTests();
|
||||
}
|
||||
@@ -4,15 +4,21 @@
|
||||
|
||||
#include <algorithm>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "../hyperleveldb/comparator.h"
|
||||
#include "../hyperleveldb/slice.h"
|
||||
#include "../port/port.h"
|
||||
#include "coding.h"
|
||||
#include "logging.h"
|
||||
|
||||
namespace hyperleveldb {
|
||||
|
||||
Comparator::~Comparator() { }
|
||||
|
||||
uint64_t Comparator::KeyNum(const Slice& key) const {
|
||||
return 0;
|
||||
}
|
||||
|
||||
namespace {
|
||||
class BytewiseComparatorImpl : public Comparator {
|
||||
public:
|
||||
@@ -63,6 +69,22 @@ class BytewiseComparatorImpl : public Comparator {
|
||||
}
|
||||
// *key is a run of 0xffs. Leave it alone.
|
||||
}
|
||||
|
||||
virtual uint64_t KeyNum(const Slice& key) const {
|
||||
unsigned char buf[sizeof(uint64_t)];
|
||||
memset(buf, 0, sizeof(buf));
|
||||
memmove(buf, key.data(), std::min(key.size(), sizeof(uint64_t)));
|
||||
uint64_t number;
|
||||
number = static_cast<uint64_t>(buf[0]) << 56
|
||||
| static_cast<uint64_t>(buf[1]) << 48
|
||||
| static_cast<uint64_t>(buf[2]) << 40
|
||||
| static_cast<uint64_t>(buf[3]) << 32
|
||||
| static_cast<uint64_t>(buf[4]) << 24
|
||||
| static_cast<uint64_t>(buf[5]) << 16
|
||||
| static_cast<uint64_t>(buf[6]) << 8
|
||||
| static_cast<uint64_t>(buf[7]);
|
||||
return number;
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
|
||||
|
||||
@@ -324,10 +324,42 @@ class PosixMmapFile : public WritableFile {
|
||||
return s;
|
||||
}
|
||||
|
||||
virtual Status Sync() {
|
||||
Status SyncDirIfManifest() {
|
||||
const char* f = filename_.c_str();
|
||||
const char* sep = strrchr(f, '/');
|
||||
Slice basename;
|
||||
std::string dir;
|
||||
if (sep == NULL) {
|
||||
dir = ".";
|
||||
basename = f;
|
||||
} else {
|
||||
dir = std::string(f, sep - f);
|
||||
basename = sep + 1;
|
||||
}
|
||||
Status s;
|
||||
if (basename.starts_with("MANIFEST")) {
|
||||
int fd = open(dir.c_str(), O_RDONLY);
|
||||
if (fd < 0) {
|
||||
s = IOError(dir, errno);
|
||||
} else {
|
||||
if (fsync(fd) < 0) {
|
||||
s = IOError(dir, errno);
|
||||
}
|
||||
close(fd);
|
||||
}
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
virtual Status Sync() {
|
||||
// Ensure new files referred to by the manifest are in the filesystem.
|
||||
Status s = SyncDirIfManifest();
|
||||
bool need_sync = false;
|
||||
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
|
||||
{
|
||||
MutexLock l(&mtx_);
|
||||
need_sync = sync_offset_ != end_offset_;
|
||||
@@ -504,6 +536,54 @@ class PosixEnv : public Env {
|
||||
return result;
|
||||
}
|
||||
|
||||
virtual Status CopyFile(const std::string& src, const std::string& target) {
|
||||
Status result;
|
||||
int fd1;
|
||||
int fd2;
|
||||
|
||||
if (result.ok() && (fd1 = open(src.c_str(), O_RDONLY)) < 0) {
|
||||
result = IOError(src, errno);
|
||||
}
|
||||
if (result.ok() && (fd2 = open(target.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0644)) < 0) {
|
||||
result = IOError(target, errno);
|
||||
}
|
||||
|
||||
ssize_t amt = 0;
|
||||
char buf[512];
|
||||
|
||||
while (result.ok() && (amt = read(fd1, buf, 512)) > 0) {
|
||||
if (write(fd2, buf, amt) != amt) {
|
||||
result = IOError(src, errno);
|
||||
}
|
||||
}
|
||||
|
||||
if (result.ok() && amt < 0) {
|
||||
result = IOError(src, errno);
|
||||
}
|
||||
|
||||
if (fd1 >= 0 && close(fd1) < 0) {
|
||||
if (result.ok()) {
|
||||
result = IOError(src, errno);
|
||||
}
|
||||
}
|
||||
|
||||
if (fd2 >= 0 && close(fd2) < 0) {
|
||||
if (result.ok()) {
|
||||
result = IOError(target, errno);
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
virtual Status LinkFile(const std::string& src, const std::string& target) {
|
||||
Status result;
|
||||
if (link(src.c_str(), target.c_str()) != 0) {
|
||||
result = IOError(src, errno);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
virtual Status LockFile(const std::string& fname, FileLock** lock) {
|
||||
*lock = NULL;
|
||||
Status result;
|
||||
|
||||
@@ -22,7 +22,8 @@ Options::Options()
|
||||
block_size(4096),
|
||||
block_restart_interval(16),
|
||||
compression(kSnappyCompression),
|
||||
filter_policy(NULL) {
|
||||
filter_policy(NULL),
|
||||
manual_garbage_collection(false) {
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -16,7 +16,12 @@ class Random {
|
||||
private:
|
||||
uint32_t seed_;
|
||||
public:
|
||||
explicit Random(uint32_t s) : seed_(s & 0x7fffffffu) { }
|
||||
explicit Random(uint32_t s) : seed_(s & 0x7fffffffu) {
|
||||
// Avoid bad seeds.
|
||||
if (seed_ == 0 || seed_ == 2147483647L) {
|
||||
seed_ = 1;
|
||||
}
|
||||
}
|
||||
uint32_t Next() {
|
||||
static const uint32_t M = 2147483647L; // 2^31-1
|
||||
static const uint64_t A = 16807; // bits 14, 8, 7, 5, 2, 1, 0
|
||||
|
||||
Reference in New Issue
Block a user