Upgrade LevelDB from 1.12 to 1.14:

* Fix issues 77, 87, 182, 190, 200, and 201

* Fix bug described in https://groups.google.com/d/msg/leveldb/yL6h1mAOc20/vLU64RylIdMJ

* Fix a bug where options.max_open_files was not getting clamped properly.

* Fix link to bigtable paper in docs.

* New sstables will have the file extension .ldb. .sst files will continue to be recognized.
This commit is contained in:
David Grogan
2013-08-21 11:12:47 -07:00
committed by JoelKatz
parent 474b824902
commit 97346c6618
23 changed files with 498 additions and 78 deletions

118
db/autocompact_test.cc Normal file
View File

@@ -0,0 +1,118 @@
// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "leveldb/db.h"
#include "db/db_impl.h"
#include "leveldb/cache.h"
#include "util/testharness.h"
#include "util/testutil.h"
namespace leveldb {
class AutoCompactTest {
public:
std::string dbname_;
Cache* tiny_cache_;
Options options_;
DB* db_;
AutoCompactTest() {
dbname_ = test::TmpDir() + "/autocompact_test";
tiny_cache_ = NewLRUCache(100);
options_.block_cache = tiny_cache_;
DestroyDB(dbname_, options_);
options_.create_if_missing = true;
options_.compression = kNoCompression;
ASSERT_OK(DB::Open(options_, dbname_, &db_));
}
~AutoCompactTest() {
delete db_;
DestroyDB(dbname_, Options());
delete tiny_cache_;
}
std::string Key(int i) {
char buf[100];
snprintf(buf, sizeof(buf), "key%06d", i);
return std::string(buf);
}
uint64_t Size(const Slice& start, const Slice& limit) {
Range r(start, limit);
uint64_t size;
db_->GetApproximateSizes(&r, 1, &size);
return size;
}
void DoReads(int n);
};
static const int kValueSize = 200 * 1024;
static const int kTotalSize = 100 * 1024 * 1024;
static const int kCount = kTotalSize / kValueSize;
// Read through the first n keys repeatedly and check that they get
// compacted (verified by checking the size of the key space).
void AutoCompactTest::DoReads(int n) {
std::string value(kValueSize, 'x');
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
// Fill database
for (int i = 0; i < kCount; i++) {
ASSERT_OK(db_->Put(WriteOptions(), Key(i), value));
}
ASSERT_OK(dbi->TEST_CompactMemTable());
// Delete everything
for (int i = 0; i < kCount; i++) {
ASSERT_OK(db_->Delete(WriteOptions(), Key(i)));
}
ASSERT_OK(dbi->TEST_CompactMemTable());
// Get initial measurement of the space we will be reading.
const int64_t initial_size = Size(Key(0), Key(n));
const int64_t initial_other_size = Size(Key(n), Key(kCount));
// Read until size drops significantly.
std::string limit_key = Key(n);
for (int read = 0; true; read++) {
ASSERT_LT(read, 100) << "Taking too long to compact";
Iterator* iter = db_->NewIterator(ReadOptions());
for (iter->SeekToFirst();
iter->Valid() && iter->key().ToString() < limit_key;
iter->Next()) {
// Drop data
}
delete iter;
// Wait a little bit to allow any triggered compactions to complete.
Env::Default()->SleepForMicroseconds(1000000);
uint64_t size = Size(Key(0), Key(n));
fprintf(stderr, "iter %3d => %7.3f MB [other %7.3f MB]\n",
read+1, size/1048576.0, Size(Key(n), Key(kCount))/1048576.0);
if (size <= initial_size/10) {
break;
}
}
// Verify that the size of the key space not touched by the reads
// is pretty much unchanged.
const int64_t final_other_size = Size(Key(n), Key(kCount));
ASSERT_LE(final_other_size, initial_other_size + 1048576);
ASSERT_GE(final_other_size, initial_other_size/5 - 1048576);
}
TEST(AutoCompactTest, ReadAll) {
DoReads(kCount);
}
TEST(AutoCompactTest, ReadHalf) {
DoReads(kCount/2);
}
} // namespace leveldb
int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}

59
issues/issue200_test.cc Normal file
View File

@@ -0,0 +1,59 @@
// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
// Test for issue 200: when iterator switches direction from backward
// to forward, the current key can be yielded unexpectedly if a new
// mutation has been added just before the current key.
#include "leveldb/db.h"
#include "util/testharness.h"
namespace leveldb {
class Issue200 { };
TEST(Issue200, Test) {
// Get rid of any state from an old run.
std::string dbpath = test::TmpDir() + "/leveldb_issue200_test";
DestroyDB(dbpath, Options());
DB *db;
Options options;
options.create_if_missing = true;
ASSERT_OK(DB::Open(options, dbpath, &db));
WriteOptions write_options;
ASSERT_OK(db->Put(write_options, "1", "b"));
ASSERT_OK(db->Put(write_options, "2", "c"));
ASSERT_OK(db->Put(write_options, "3", "d"));
ASSERT_OK(db->Put(write_options, "4", "e"));
ASSERT_OK(db->Put(write_options, "5", "f"));
ReadOptions read_options;
Iterator *iter = db->NewIterator(read_options);
// Add an element that should not be reflected in the iterator.
ASSERT_OK(db->Put(write_options, "25", "cd"));
iter->Seek("5");
ASSERT_EQ(iter->key().ToString(), "5");
iter->Prev();
ASSERT_EQ(iter->key().ToString(), "4");
iter->Prev();
ASSERT_EQ(iter->key().ToString(), "3");
iter->Next();
ASSERT_EQ(iter->key().ToString(), "4");
iter->Next();
ASSERT_EQ(iter->key().ToString(), "5");
delete iter;
delete db;
DestroyDB(dbpath, options);
}
} // namespace leveldb
int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}

View File

@@ -11,7 +11,7 @@
The implementation of leveldb is similar in spirit to the
representation of a single
<a href="http://labs.google.com/papers/bigtable.html">
<a href="http://research.google.com/archive/bigtable.html">
Bigtable tablet (section 5.3)</a>.
However the organization of the files that make up the representation
is somewhat different and is explained below.

View File

@@ -9,3 +9,4 @@ Sanjay Ghemawat <sanjay@google.com>
# Partial list of contributors:
Kevin Regan <kevin.d.regan@gmail.com>
Johan Bilien <jobi@litl.com>

View File

@@ -31,6 +31,7 @@ TESTHARNESS = ./util/testharness.o $(TESTUTIL)
TESTS = \
arena_test \
autocompact_test \
bloom_test \
c_test \
cache_test \
@@ -43,6 +44,7 @@ TESTS = \
filename_test \
filter_block_test \
issue178_test \
issue200_test \
log_test \
memenv_test \
skiplist_test \
@@ -70,7 +72,7 @@ SHARED = $(SHARED1)
else
# Update db.h if you change these.
SHARED_MAJOR = 1
SHARED_MINOR = 12
SHARED_MINOR = 14
SHARED1 = libleveldb.$(PLATFORM_SHARED_EXT)
SHARED2 = $(SHARED1).$(SHARED_MAJOR)
SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR)
@@ -114,6 +116,9 @@ leveldbutil: db/leveldb_main.o $(LIBOBJECTS)
arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
autocompact_test: db/autocompact_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) db/autocompact_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
bloom_test: util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
@@ -150,6 +155,9 @@ filter_block_test: table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS)
issue178_test: issues/issue178_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) issues/issue178_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
issue200_test: issues/issue200_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) issues/issue200_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) $(LDFLAGS) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
@@ -187,14 +195,14 @@ IOSVERSION=$(shell defaults read $(PLATFORMSROOT)/iPhoneOS.platform/version CFBu
mkdir -p ios-x86/$(dir $@)
$(CXX) $(CXXFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@
mkdir -p ios-arm/$(dir $@)
$(DEVICEROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@
xcrun -sdk iphoneos $(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@
lipo ios-x86/$@ ios-arm/$@ -create -output $@
.c.o:
mkdir -p ios-x86/$(dir $@)
$(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@
mkdir -p ios-arm/$(dir $@)
$(DEVICEROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@
xcrun -sdk iphoneos $(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@
lipo ios-x86/$@ ios-arm/$@ -create -output $@
else

View File

@@ -35,6 +35,7 @@ class CorruptionTest {
CorruptionTest() {
tiny_cache_ = NewLRUCache(100);
options_.env = &env_;
options_.block_cache = tiny_cache_;
dbname_ = test::TmpDir() + "/db_test";
DestroyDB(dbname_, options_);
@@ -50,17 +51,14 @@ class CorruptionTest {
delete tiny_cache_;
}
Status TryReopen(Options* options = NULL) {
Status TryReopen() {
delete db_;
db_ = NULL;
Options opt = (options ? *options : options_);
opt.env = &env_;
opt.block_cache = tiny_cache_;
return DB::Open(opt, dbname_, &db_);
return DB::Open(options_, dbname_, &db_);
}
void Reopen(Options* options = NULL) {
ASSERT_OK(TryReopen(options));
void Reopen() {
ASSERT_OK(TryReopen());
}
void RepairDB() {
@@ -92,6 +90,10 @@ class CorruptionTest {
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
uint64_t key;
Slice in(iter->key());
if (in == "" || in == "~") {
// Ignore boundary keys.
continue;
}
if (!ConsumeDecimalNumber(&in, &key) ||
!in.empty() ||
key < next_expected) {
@@ -233,7 +235,7 @@ TEST(CorruptionTest, TableFile) {
dbi->TEST_CompactRange(1, NULL, NULL);
Corrupt(kTableFile, 100, 1);
Check(99, 99);
Check(90, 99);
}
TEST(CorruptionTest, TableFileIndexData) {
@@ -299,7 +301,7 @@ TEST(CorruptionTest, CompactionInputError) {
ASSERT_EQ(1, Property("leveldb.num-files-at-level" + NumberToString(last)));
Corrupt(kTableFile, 100, 1);
Check(9, 9);
Check(5, 9);
// Force compactions by writing lots of values
Build(10000);
@@ -307,32 +309,23 @@ TEST(CorruptionTest, CompactionInputError) {
}
TEST(CorruptionTest, CompactionInputErrorParanoid) {
Options options;
options.paranoid_checks = true;
options.write_buffer_size = 1048576;
Reopen(&options);
options_.paranoid_checks = true;
options_.write_buffer_size = 512 << 10;
Reopen();
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
// Fill levels >= 1 so memtable compaction outputs to level 1
for (int level = 1; level < config::kNumLevels; level++) {
dbi->Put(WriteOptions(), "", "begin");
dbi->Put(WriteOptions(), "~", "end");
// Make multiple inputs so we need to compact.
for (int i = 0; i < 2; i++) {
Build(10);
dbi->TEST_CompactMemTable();
Corrupt(kTableFile, 100, 1);
env_.SleepForMicroseconds(100000);
}
dbi->CompactRange(NULL, NULL);
Build(10);
dbi->TEST_CompactMemTable();
ASSERT_EQ(1, Property("leveldb.num-files-at-level0"));
Corrupt(kTableFile, 100, 1);
Check(9, 9);
// Write must eventually fail because of corrupted table
Status s;
// Write must fail because of corrupted table
std::string tmp1, tmp2;
for (int i = 0; i < 10000 && s.ok(); i++) {
s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2));
}
Status s = db_->Put(WriteOptions(), Key(5, &tmp1), Value(5, &tmp2));
ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db";
}

View File

@@ -113,14 +113,14 @@ Options SanitizeOptions(const std::string& dbname,
return result;
}
DBImpl::DBImpl(const Options& options, const std::string& dbname)
: env_(options.env),
internal_comparator_(options.comparator),
internal_filter_policy_(options.filter_policy),
options_(SanitizeOptions(
dbname, &internal_comparator_, &internal_filter_policy_, options)),
owns_info_log_(options_.info_log != options.info_log),
owns_cache_(options_.block_cache != options.block_cache),
DBImpl::DBImpl(const Options& raw_options, const std::string& dbname)
: env_(raw_options.env),
internal_comparator_(raw_options.comparator),
internal_filter_policy_(raw_options.filter_policy),
options_(SanitizeOptions(dbname, &internal_comparator_,
&internal_filter_policy_, raw_options)),
owns_info_log_(options_.info_log != raw_options.info_log),
owns_cache_(options_.block_cache != raw_options.block_cache),
dbname_(dbname),
db_lock_(NULL),
shutting_down_(NULL),
@@ -130,6 +130,7 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname)
logfile_(NULL),
logfile_number_(0),
log_(NULL),
seed_(0),
tmp_batch_(new WriteBatch),
bg_compaction_scheduled_(false),
manual_compaction_(NULL),
@@ -138,7 +139,7 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname)
has_imm_.Release_Store(NULL);
// Reserve ten files or so for other uses and give the rest to TableCache.
const int table_cache_size = options.max_open_files - kNumNonTableCacheFiles;
const int table_cache_size = options_.max_open_files - kNumNonTableCacheFiles;
table_cache_ = new TableCache(dbname_, &options_, table_cache_size);
versions_ = new VersionSet(dbname_, &options_, table_cache_,
@@ -1027,7 +1028,8 @@ static void CleanupIteratorState(void* arg1, void* arg2) {
} // namespace
Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
SequenceNumber* latest_snapshot) {
SequenceNumber* latest_snapshot,
uint32_t* seed) {
IterState* cleanup = new IterState;
mutex_.Lock();
*latest_snapshot = versions_->LastSequence();
@@ -1051,13 +1053,15 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
cleanup->version = versions_->current();
internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, NULL);
*seed = ++seed_;
mutex_.Unlock();
return internal_iter;
}
Iterator* DBImpl::TEST_NewInternalIterator() {
SequenceNumber ignored;
return NewInternalIterator(ReadOptions(), &ignored);
uint32_t ignored_seed;
return NewInternalIterator(ReadOptions(), &ignored, &ignored_seed);
}
int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() {
@@ -1114,12 +1118,21 @@ Status DBImpl::Get(const ReadOptions& options,
Iterator* DBImpl::NewIterator(const ReadOptions& options) {
SequenceNumber latest_snapshot;
Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot);
uint32_t seed;
Iterator* iter = NewInternalIterator(options, &latest_snapshot, &seed);
return NewDBIterator(
&dbname_, env_, user_comparator(), internal_iter,
this, user_comparator(), iter,
(options.snapshot != NULL
? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
: latest_snapshot));
: latest_snapshot),
seed);
}
void DBImpl::RecordReadSample(Slice key) {
MutexLock l(&mutex_);
if (versions_->current()->RecordReadSample(key)) {
MaybeScheduleCompaction();
}
}
const Snapshot* DBImpl::GetSnapshot() {

View File

@@ -59,13 +59,19 @@ class DBImpl : public DB {
// file at a level >= 1.
int64_t TEST_MaxNextLevelOverlappingBytes();
// Record a sample of bytes read at the specified internal key.
// Samples are taken approximately once every config::kReadBytesPeriod
// bytes.
void RecordReadSample(Slice key);
private:
friend class DB;
struct CompactionState;
struct Writer;
Iterator* NewInternalIterator(const ReadOptions&,
SequenceNumber* latest_snapshot);
SequenceNumber* latest_snapshot,
uint32_t* seed);
Status NewDB();
@@ -135,6 +141,7 @@ class DBImpl : public DB {
WritableFile* logfile_;
uint64_t logfile_number_;
log::Writer* log_;
uint32_t seed_; // For sampling.
// Queue of writers.
std::deque<Writer*> writers_;

View File

@@ -5,12 +5,14 @@
#include "db/db_iter.h"
#include "db/filename.h"
#include "db/db_impl.h"
#include "db/dbformat.h"
#include "leveldb/env.h"
#include "leveldb/iterator.h"
#include "port/port.h"
#include "util/logging.h"
#include "util/mutexlock.h"
#include "util/random.h"
namespace leveldb {
@@ -46,15 +48,16 @@ class DBIter: public Iterator {
kReverse
};
DBIter(const std::string* dbname, Env* env,
const Comparator* cmp, Iterator* iter, SequenceNumber s)
: dbname_(dbname),
env_(env),
DBIter(DBImpl* db, const Comparator* cmp, Iterator* iter, SequenceNumber s,
uint32_t seed)
: db_(db),
user_comparator_(cmp),
iter_(iter),
sequence_(s),
direction_(kForward),
valid_(false) {
valid_(false),
rnd_(seed),
bytes_counter_(RandomPeriod()) {
}
virtual ~DBIter() {
delete iter_;
@@ -100,8 +103,12 @@ class DBIter: public Iterator {
}
}
const std::string* const dbname_;
Env* const env_;
// Pick next gap with average value of config::kReadBytesPeriod.
ssize_t RandomPeriod() {
return rnd_.Uniform(2*config::kReadBytesPeriod);
}
DBImpl* db_;
const Comparator* const user_comparator_;
Iterator* const iter_;
SequenceNumber const sequence_;
@@ -112,13 +119,23 @@ class DBIter: public Iterator {
Direction direction_;
bool valid_;
Random rnd_;
ssize_t bytes_counter_;
// No copying allowed
DBIter(const DBIter&);
void operator=(const DBIter&);
};
inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
if (!ParseInternalKey(iter_->key(), ikey)) {
Slice k = iter_->key();
ssize_t n = k.size() + iter_->value().size();
bytes_counter_ -= n;
while (bytes_counter_ < 0) {
bytes_counter_ += RandomPeriod();
db_->RecordReadSample(k);
}
if (!ParseInternalKey(k, ikey)) {
status_ = Status::Corruption("corrupted internal key in DBIter");
return false;
} else {
@@ -144,12 +161,13 @@ void DBIter::Next() {
saved_key_.clear();
return;
}
// saved_key_ already contains the key to skip past.
} else {
// Store in saved_key_ the current key so we skip it below.
SaveKey(ExtractUserKey(iter_->key()), &saved_key_);
}
// Temporarily use saved_key_ as storage for key to skip.
std::string* skip = &saved_key_;
SaveKey(ExtractUserKey(iter_->key()), skip);
FindNextUserEntry(true, skip);
FindNextUserEntry(true, &saved_key_);
}
void DBIter::FindNextUserEntry(bool skipping, std::string* skip) {
@@ -288,12 +306,12 @@ void DBIter::SeekToLast() {
} // anonymous namespace
Iterator* NewDBIterator(
const std::string* dbname,
Env* env,
DBImpl* db,
const Comparator* user_key_comparator,
Iterator* internal_iter,
const SequenceNumber& sequence) {
return new DBIter(dbname, env, user_key_comparator, internal_iter, sequence);
SequenceNumber sequence,
uint32_t seed) {
return new DBIter(db, user_key_comparator, internal_iter, sequence, seed);
}
} // namespace leveldb

View File

@@ -11,15 +11,17 @@
namespace leveldb {
class DBImpl;
// Return a new iterator that converts internal keys (yielded by
// "*internal_iter") that were live at the specified "sequence" number
// into appropriate user keys.
extern Iterator* NewDBIterator(
const std::string* dbname,
Env* env,
DBImpl* db,
const Comparator* user_key_comparator,
Iterator* internal_iter,
const SequenceNumber& sequence);
SequenceNumber sequence,
uint32_t seed);
} // namespace leveldb

View File

@@ -147,7 +147,7 @@ class SpecialEnv : public EnvWrapper {
Status s = target()->NewWritableFile(f, r);
if (s.ok()) {
if (strstr(f.c_str(), ".sst") != NULL) {
if (strstr(f.c_str(), ".ldb") != NULL) {
*r = new SSTableFile(this, *r);
} else if (strstr(f.c_str(), "MANIFEST") != NULL) {
*r = new ManifestFile(this, *r);
@@ -484,6 +484,24 @@ class DBTest {
}
return false;
}
// Returns number of files renamed.
int RenameLDBToSST() {
std::vector<std::string> filenames;
ASSERT_OK(env_->GetChildren(dbname_, &filenames));
uint64_t number;
FileType type;
int files_renamed = 0;
for (size_t i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &type) && type == kTableFile) {
const std::string from = TableFileName(dbname_, number);
const std::string to = SSTTableFileName(dbname_, number);
ASSERT_OK(env_->RenameFile(from, to));
files_renamed++;
}
}
return files_renamed;
}
};
TEST(DBTest, Empty) {
@@ -1632,6 +1650,22 @@ TEST(DBTest, MissingSSTFile) {
<< s.ToString();
}
TEST(DBTest, StillReadSST) {
ASSERT_OK(Put("foo", "bar"));
ASSERT_EQ("bar", Get("foo"));
// Dump the memtable to disk.
dbfull()->TEST_CompactMemTable();
ASSERT_EQ("bar", Get("foo"));
Close();
ASSERT_GT(RenameLDBToSST(), 0);
Options options = CurrentOptions();
options.paranoid_checks = true;
Status s = TryReopen(&options);
ASSERT_TRUE(s.ok());
ASSERT_EQ("bar", Get("foo"));
}
TEST(DBTest, FilesDeletedAfterCompaction) {
ASSERT_OK(Put("foo", "v2"));
Compact("a", "z");

View File

@@ -38,6 +38,9 @@ static const int kL0_StopWritesTrigger = 12;
// space if the same key space is being repeatedly overwritten.
static const int kMaxMemCompactLevel = 2;
// Approximate gap in bytes between samples of data read during iteration.
static const int kReadBytesPeriod = 1048576;
} // namespace config
class InternalKey;

View File

@@ -30,6 +30,11 @@ std::string LogFileName(const std::string& name, uint64_t number) {
}
std::string TableFileName(const std::string& name, uint64_t number) {
assert(number > 0);
return MakeFileName(name, number, "ldb");
}
std::string SSTTableFileName(const std::string& name, uint64_t number) {
assert(number > 0);
return MakeFileName(name, number, "sst");
}
@@ -71,7 +76,7 @@ std::string OldInfoLogFileName(const std::string& dbname) {
// dbname/LOG
// dbname/LOG.old
// dbname/MANIFEST-[0-9]+
// dbname/[0-9]+.(log|sst)
// dbname/[0-9]+.(log|sst|ldb)
bool ParseFileName(const std::string& fname,
uint64_t* number,
FileType* type) {
@@ -106,7 +111,7 @@ bool ParseFileName(const std::string& fname,
Slice suffix = rest;
if (suffix == Slice(".log")) {
*type = kLogFile;
} else if (suffix == Slice(".sst")) {
} else if (suffix == Slice(".sst") || suffix == Slice(".ldb")) {
*type = kTableFile;
} else if (suffix == Slice(".dbtmp")) {
*type = kTempFile;

View File

@@ -37,6 +37,11 @@ extern std::string LogFileName(const std::string& dbname, uint64_t number);
// "dbname".
extern std::string TableFileName(const std::string& dbname, uint64_t number);
// Return the legacy file name for an sstable with the specified number
// in the db named by "dbname". The result will be prefixed with
// "dbname".
extern std::string SSTTableFileName(const std::string& dbname, uint64_t number);
// Return the name of the descriptor file for the db named by
// "dbname" and the specified incarnation number. The result will be
// prefixed with "dbname".

View File

@@ -27,6 +27,7 @@ TEST(FileNameTest, Parse) {
{ "100.log", 100, kLogFile },
{ "0.log", 0, kLogFile },
{ "0.sst", 0, kTableFile },
{ "0.ldb", 0, kTableFile },
{ "CURRENT", 0, kCurrentFile },
{ "LOCK", 0, kDBLockFile },
{ "MANIFEST-2", 2, kDescriptorFile },

View File

@@ -263,6 +263,12 @@ class Repairer {
std::string fname = TableFileName(dbname_, t->meta.number);
int counter = 0;
Status status = env_->GetFileSize(fname, &t->meta.file_size);
if (!status.ok()) {
fname = SSTTableFileName(dbname_, t->meta.number);
Status s2 = env_->GetFileSize(fname, &t->meta.file_size);
if (s2.ok())
status = Status::OK();
}
if (status.ok()) {
Iterator* iter = table_cache_->NewIterator(
ReadOptions(), t->meta.number, t->meta.file_size);
@@ -293,6 +299,8 @@ class Repairer {
}
delete iter;
}
// If there was trouble opening an .sst file this will report that the .ldb
// file was not found, which is kind of lame but shouldn't happen often.
Log(options_.info_log, "Table #%llu: %d entries %s",
(unsigned long long) t->meta.number,
counter,

View File

@@ -54,6 +54,12 @@ Status TableCache::FindTable(uint64_t file_number, uint64_t file_size,
RandomAccessFile* file = NULL;
Table* table = NULL;
s = env_->NewRandomAccessFile(fname, &file);
if (!s.ok()) {
std::string old_fname = SSTTableFileName(dbname_, file_number);
if (env_->NewRandomAccessFile(old_fname, &file).ok()) {
s = Status::OK();
}
}
if (s.ok()) {
s = Table::Open(*options_, file, file_size, &table);
}

View File

@@ -289,6 +289,51 @@ static bool NewestFirst(FileMetaData* a, FileMetaData* b) {
return a->number > b->number;
}
void Version::ForEachOverlapping(Slice user_key, Slice internal_key,
void* arg,
bool (*func)(void*, int, FileMetaData*)) {
// TODO(sanjay): Change Version::Get() to use this function.
const Comparator* ucmp = vset_->icmp_.user_comparator();
// Search level-0 in order from newest to oldest.
std::vector<FileMetaData*> tmp;
tmp.reserve(files_[0].size());
for (uint32_t i = 0; i < files_[0].size(); i++) {
FileMetaData* f = files_[0][i];
if (ucmp->Compare(user_key, f->smallest.user_key()) >= 0 &&
ucmp->Compare(user_key, f->largest.user_key()) <= 0) {
tmp.push_back(f);
}
}
if (!tmp.empty()) {
std::sort(tmp.begin(), tmp.end(), NewestFirst);
for (uint32_t i = 0; i < tmp.size(); i++) {
if (!(*func)(arg, 0, tmp[i])) {
return;
}
}
}
// Search other levels.
for (int level = 1; level < config::kNumLevels; level++) {
size_t num_files = files_[level].size();
if (num_files == 0) continue;
// Binary search to find earliest index whose largest key >= internal_key.
uint32_t index = FindFile(vset_->icmp_, files_[level], internal_key);
if (index < num_files) {
FileMetaData* f = files_[level][index];
if (ucmp->Compare(user_key, f->smallest.user_key()) < 0) {
// All of "f" is past any data for user_key
} else {
if (!(*func)(arg, level, f)) {
return;
}
}
}
}
}
Status Version::Get(const ReadOptions& options,
const LookupKey& k,
std::string* value,
@@ -401,6 +446,44 @@ bool Version::UpdateStats(const GetStats& stats) {
return false;
}
bool Version::RecordReadSample(Slice internal_key) {
ParsedInternalKey ikey;
if (!ParseInternalKey(internal_key, &ikey)) {
return false;
}
struct State {
GetStats stats; // Holds first matching file
int matches;
static bool Match(void* arg, int level, FileMetaData* f) {
State* state = reinterpret_cast<State*>(arg);
state->matches++;
if (state->matches == 1) {
// Remember first match.
state->stats.seek_file = f;
state->stats.seek_file_level = level;
}
// We can stop iterating once we have a second match.
return state->matches < 2;
}
};
State state;
state.matches = 0;
ForEachOverlapping(ikey.user_key, internal_key, &state, &State::Match);
// Must have at least two matches since we want to merge across
// files. But what if we have a single file that contains many
// overwrites and deletions? Should we have another mechanism for
// finding such files?
if (state.matches >= 2) {
// 1MB cost is about 1 seek (see comment in Builder::Apply).
return UpdateStats(state.stats);
}
return false;
}
void Version::Ref() {
++refs_;
}
@@ -435,10 +518,13 @@ int Version::PickLevelForMemTableOutput(
if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) {
break;
}
GetOverlappingInputs(level + 2, &start, &limit, &overlaps);
const int64_t sum = TotalFileSize(overlaps);
if (sum > kMaxGrandParentOverlapBytes) {
break;
if (level + 2 < config::kNumLevels) {
// Check that file does not overlap too many grandparent bytes.
GetOverlappingInputs(level + 2, &start, &limit, &overlaps);
const int64_t sum = TotalFileSize(overlaps);
if (sum > kMaxGrandParentOverlapBytes) {
break;
}
}
level++;
}
@@ -452,6 +538,8 @@ void Version::GetOverlappingInputs(
const InternalKey* begin,
const InternalKey* end,
std::vector<FileMetaData*>* inputs) {
assert(level >= 0);
assert(level < config::kNumLevels);
inputs->clear();
Slice user_begin, user_end;
if (begin != NULL) {

View File

@@ -78,6 +78,12 @@ class Version {
// REQUIRES: lock is held
bool UpdateStats(const GetStats& stats);
// Record a sample of bytes read at the specified internal key.
// Samples are taken approximately once every config::kReadBytesPeriod
// bytes. Returns true if a new compaction may need to be triggered.
// REQUIRES: lock is held
bool RecordReadSample(Slice key);
// Reference count management (so Versions do not disappear out from
// under live iterators)
void Ref();
@@ -114,6 +120,15 @@ class Version {
class LevelFileNumIterator;
Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const;
// Call func(arg, level, f) for every file that overlaps user_key in
// order from newest to oldest. If an invocation of func returns
// false, makes no more calls.
//
// REQUIRES: user portion of internal_key == user_key.
void ForEachOverlapping(Slice user_key, Slice internal_key,
void* arg,
bool (*func)(void*, int, FileMetaData*));
VersionSet* vset_; // VersionSet to which this Version belongs
Version* next_; // Next version in linked list
Version* prev_; // Previous version in linked list

View File

@@ -14,7 +14,7 @@ namespace leveldb {
// Update Makefile if you change these
static const int kMajorVersion = 1;
static const int kMinorVersion = 12;
static const int kMinorVersion = 14;
struct Options;
struct ReadOptions;

View File

@@ -40,7 +40,7 @@ char* Arena::AllocateFallback(size_t bytes) {
}
char* Arena::AllocateAligned(size_t bytes) {
const int align = sizeof(void*); // We'll align to pointer size
const int align = (sizeof(void*) > 8) ? sizeof(void*) : 8;
assert((align & (align-1)) == 0); // Pointer size should be a power of 2
size_t current_mod = reinterpret_cast<uintptr_t>(alloc_ptr_) & (align-1);
size_t slop = (current_mod == 0 ? 0 : align - current_mod);

View File

@@ -320,8 +320,39 @@ class PosixMmapFile : public WritableFile {
return Status::OK();
}
virtual Status Sync() {
Status SyncDirIfManifest() {
const char* f = filename_.c_str();
const char* sep = strrchr(f, '/');
Slice basename;
std::string dir;
if (sep == NULL) {
dir = ".";
basename = f;
} else {
dir = std::string(f, sep - f);
basename = sep + 1;
}
Status s;
if (basename.starts_with("MANIFEST")) {
int fd = open(dir.c_str(), O_RDONLY);
if (fd < 0) {
s = IOError(dir, errno);
} else {
if (fsync(fd) < 0) {
s = IOError(dir, errno);
}
close(fd);
}
}
return s;
}
virtual Status Sync() {
// Ensure new files referred to by the manifest are in the filesystem.
Status s = SyncDirIfManifest();
if (!s.ok()) {
return s;
}
if (pending_sync_) {
// Some unmapped data was not synced

View File

@@ -16,7 +16,12 @@ class Random {
private:
uint32_t seed_;
public:
explicit Random(uint32_t s) : seed_(s & 0x7fffffffu) { }
explicit Random(uint32_t s) : seed_(s & 0x7fffffffu) {
// Avoid bad seeds.
if (seed_ == 0 || seed_ == 2147483647L) {
seed_ = 1;
}
}
uint32_t Next() {
static const uint32_t M = 2147483647L; // 2^31-1
static const uint64_t A = 16807; // bits 14, 8, 7, 5, 2, 1, 0