Initial checkin.

git-svn-id: https://leveldb.googlecode.com/svn/trunk@2 62dab493-f737-651d-591e-8d6aee1b9529
This commit is contained in:
jorlow@chromium.org
2011-03-18 22:37:00 +00:00
parent 54f1fd7eef
commit f67e15e50f
118 changed files with 19180 additions and 0 deletions

97
db/builder.cc Normal file
View File

@@ -0,0 +1,97 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/builder.h"
#include "db/filename.h"
#include "db/dbformat.h"
#include "db/table_cache.h"
#include "db/version_edit.h"
#include "include/db.h"
#include "include/env.h"
#include "include/iterator.h"
namespace leveldb {
Status BuildTable(const std::string& dbname,
Env* env,
const Options& options,
TableCache* table_cache,
Iterator* iter,
FileMetaData* meta,
VersionEdit* edit) {
Status s;
meta->file_size = 0;
iter->SeekToFirst();
std::string fname = TableFileName(dbname, meta->number);
if (iter->Valid()) {
WritableFile* file;
s = env->NewWritableFile(fname, &file);
if (!s.ok()) {
return s;
}
TableBuilder* builder = new TableBuilder(options, file);
meta->smallest.DecodeFrom(iter->key());
for (; iter->Valid(); iter->Next()) {
Slice key = iter->key();
meta->largest.DecodeFrom(key);
if (ExtractValueType(key) == kTypeLargeValueRef) {
if (iter->value().size() != LargeValueRef::ByteSize()) {
s = Status::Corruption("invalid indirect reference hash value (L0)");
break;
}
edit->AddLargeValueRef(LargeValueRef::FromRef(iter->value()),
meta->number,
iter->key());
}
builder->Add(key, iter->value());
}
// Finish and check for builder errors
if (s.ok()) {
s = builder->Finish();
if (s.ok()) {
meta->file_size = builder->FileSize();
assert(meta->file_size > 0);
}
} else {
builder->Abandon();
}
delete builder;
// Finish and check for file errors
if (s.ok()) {
s = file->Sync();
}
if (s.ok()) {
s = file->Close();
}
delete file;
file = NULL;
if (s.ok()) {
// Verify that the table is usable
Iterator* it = table_cache->NewIterator(ReadOptions(), meta->number);
s = it->status();
delete it;
}
}
// Check for input iterator errors
if (!iter->status().ok()) {
s = iter->status();
}
if (s.ok() && meta->file_size > 0) {
edit->AddFile(0, meta->number, meta->file_size,
meta->smallest, meta->largest);
} else {
env->DeleteFile(fname);
}
return s;
}
}

36
db/builder.h Normal file
View File

@@ -0,0 +1,36 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_DB_BUILDER_H_
#define STORAGE_LEVELDB_DB_BUILDER_H_
#include "include/status.h"
namespace leveldb {
struct Options;
struct FileMetaData;
class Env;
class Iterator;
class TableCache;
class VersionEdit;
// Build a Table file from the contents of *iter. The generated file
// will be named according to meta->number. On success, the rest of
// *meta will be filled with metadata about the generated table, and
// large value refs and the added file information will be added to
// *edit. If no data is present in *iter, meta->file_size will be set
// to zero, and no Table file will be produced.
extern Status BuildTable(const std::string& dbname,
Env* env,
const Options& options,
TableCache* table_cache,
Iterator* iter,
FileMetaData* meta,
VersionEdit* edit);
}
#endif // STORAGE_LEVELDB_DB_BUILDER_H_

366
db/corruption_test.cc Normal file
View File

@@ -0,0 +1,366 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "include/db.h"
#include <errno.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <sys/types.h>
#include "include/env.h"
#include "include/table.h"
#include "include/write_batch.h"
#include "db/db_impl.h"
#include "db/filename.h"
#include "db/version_set.h"
#include "util/logging.h"
#include "util/testharness.h"
#include "util/testutil.h"
namespace leveldb {
static const int kValueSize = 1000;
class CorruptionTest {
public:
test::ErrorEnv env_;
Random rnd_;
std::string dbname_;
Options options_;
DB* db_;
CorruptionTest() : rnd_(test::RandomSeed()) {
options_.env = &env_;
dbname_ = test::TmpDir() + "/db_test";
DestroyDB(dbname_, options_);
db_ = NULL;
options_.create_if_missing = true;
Reopen();
options_.create_if_missing = false;
}
~CorruptionTest() {
delete db_;
DestroyDB(dbname_, Options());
}
Status TryReopen(Options* options = NULL) {
delete db_;
db_ = NULL;
Options opt = (options ? *options : options_);
opt.env = &env_;
return DB::Open(opt, dbname_, &db_);
}
void Reopen(Options* options = NULL) {
ASSERT_OK(TryReopen(options));
}
void RepairDB() {
delete db_;
db_ = NULL;
ASSERT_OK(::leveldb::RepairDB(dbname_, options_));
}
void Build(int n) {
std::string key_space, value_space;
WriteBatch batch;
for (int i = 0; i < n; i++) {
//if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n);
Slice key = Key(i, &key_space);
batch.Clear();
batch.Put(key, Value(i, &value_space));
ASSERT_OK(db_->Write(WriteOptions(), &batch));
}
}
void Check(int min_expected, int max_expected) {
int next_expected = 0;
int missed = 0;
int bad_keys = 0;
int bad_values = 0;
int correct = 0;
std::string value_space;
Iterator* iter = db_->NewIterator(ReadOptions());
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
uint64_t key;
Slice in(iter->key());
if (!ConsumeDecimalNumber(&in, &key) ||
!in.empty() ||
key < next_expected) {
bad_keys++;
continue;
}
missed += (key - next_expected);
next_expected = key + 1;
if (iter->value() != Value(key, &value_space)) {
bad_values++;
} else {
correct++;
}
}
delete iter;
fprintf(stderr,
"expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%d\n",
min_expected, max_expected, correct, bad_keys, bad_values, missed);
ASSERT_LE(min_expected, correct);
ASSERT_GE(max_expected, correct);
}
void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) {
// Pick file to corrupt
std::vector<std::string> filenames;
ASSERT_OK(env_.GetChildren(dbname_, &filenames));
uint64_t number;
LargeValueRef large_ref;
FileType type;
std::vector<std::string> candidates;
for (int i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &large_ref, &type) &&
type == filetype) {
candidates.push_back(dbname_ + "/" + filenames[i]);
}
}
ASSERT_TRUE(!candidates.empty()) << filetype;
std::string fname = candidates[rnd_.Uniform(candidates.size())];
struct stat sbuf;
if (stat(fname.c_str(), &sbuf) != 0) {
const char* msg = strerror(errno);
ASSERT_TRUE(false) << fname << ": " << msg;
}
if (offset < 0) {
// Relative to end of file; make it absolute
if (-offset > sbuf.st_size) {
offset = 0;
} else {
offset = sbuf.st_size + offset;
}
}
if (offset > sbuf.st_size) {
offset = sbuf.st_size;
}
if (offset + bytes_to_corrupt > sbuf.st_size) {
bytes_to_corrupt = sbuf.st_size - offset;
}
// Do it
std::string contents;
Status s = ReadFileToString(Env::Default(), fname, &contents);
ASSERT_TRUE(s.ok()) << s.ToString();
for (int i = 0; i < bytes_to_corrupt; i++) {
contents[i + offset] ^= 0x80;
}
s = WriteStringToFile(Env::Default(), contents, fname);
ASSERT_TRUE(s.ok()) << s.ToString();
}
uint64_t Property(const std::string& name) {
uint64_t result;
if (!db_->GetProperty(name, &result)) {
result = ~static_cast<uint64_t>(0);
}
return result;
}
// Return the ith key
Slice Key(int i, std::string* storage) {
char buf[100];
snprintf(buf, sizeof(buf), "%016d", i);
storage->assign(buf, strlen(buf));
return Slice(*storage);
}
// Return the value to associate with the specified key
Slice Value(int k, std::string* storage) {
Random r(k);
return test::RandomString(&r, kValueSize, storage);
}
};
TEST(CorruptionTest, Recovery) {
Build(10);
Check(10, 10);
Corrupt(kLogFile, 19, 1); // WriteBatch tag for first record
Corrupt(kLogFile, 2*kValueSize, 1); // Somewhere in second log record?
Reopen();
Check(8, 8);
}
TEST(CorruptionTest, RecoverWriteError) {
env_.writable_file_error_ = true;
Status s = TryReopen();
ASSERT_TRUE(!s.ok());
}
TEST(CorruptionTest, NewFileErrorDuringWrite) {
// Do enough writing to force minor compaction
env_.writable_file_error_ = true;
const int num = 3 + (Options().write_buffer_size / kValueSize);
std::string value_storage;
Status s;
for (int i = 0; s.ok() && i < num; i++) {
WriteBatch batch;
batch.Put("a", Value(100, &value_storage));
s = db_->Write(WriteOptions(), &batch);
}
ASSERT_TRUE(!s.ok());
ASSERT_GE(env_.num_writable_file_errors_, 1);
env_.writable_file_error_ = false;
Reopen();
}
TEST(CorruptionTest, TableFile) {
Build(100);
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_CompactMemTable();
dbi->TEST_CompactRange(0, "", "~");
dbi->TEST_CompactRange(1, "", "~");
Corrupt(kTableFile, 100, 1);
Check(99, 99);
}
TEST(CorruptionTest, TableFileIndexData) {
Build(10000); // Enough to build multiple Tables
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_CompactMemTable();
dbi->TEST_CompactRange(0, "", "~");
dbi->TEST_CompactRange(1, "", "~");
Corrupt(kTableFile, -1000, 500);
Reopen();
Check(5000, 9999);
}
TEST(CorruptionTest, MissingDescriptor) {
Build(1000);
RepairDB();
Reopen();
Check(1000, 1000);
}
TEST(CorruptionTest, SequenceNumberRecovery) {
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3"));
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4"));
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5"));
RepairDB();
Reopen();
std::string v;
ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
ASSERT_EQ("v5", v);
// Write something. If sequence number was not recovered properly,
// it will be hidden by an earlier write.
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6"));
ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
ASSERT_EQ("v6", v);
Reopen();
ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
ASSERT_EQ("v6", v);
}
TEST(CorruptionTest, LargeValueRecovery) {
Options options;
options.large_value_threshold = 10000;
Reopen(&options);
Random rnd(301);
std::string big;
ASSERT_OK(db_->Put(WriteOptions(),
"foo", test::RandomString(&rnd, 100000, &big)));
std::string v;
ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
ASSERT_EQ(big, v);
RepairDB();
Reopen();
ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
ASSERT_EQ(big, v);
Reopen();
ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
ASSERT_EQ(big, v);
}
TEST(CorruptionTest, CorruptedDescriptor) {
ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello"));
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_CompactMemTable();
dbi->TEST_CompactRange(0, "", "~");
Corrupt(kDescriptorFile, 0, 1000);
Status s = TryReopen();
ASSERT_TRUE(!s.ok());
RepairDB();
Reopen();
std::string v;
ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
ASSERT_EQ("hello", v);
}
TEST(CorruptionTest, CompactionInputError) {
Build(10);
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_CompactMemTable();
ASSERT_EQ(1, Property("leveldb.num-files-at-level0"));
Corrupt(kTableFile, 100, 1);
Check(9, 9);
// Force compactions by writing lots of values
Build(10000);
Check(10000, 10000);
dbi->TEST_CompactRange(0, "", "~");
ASSERT_EQ(0, Property("leveldb.num-files-at-level0"));
}
TEST(CorruptionTest, CompactionInputErrorParanoid) {
Options options;
options.paranoid_checks = true;
Reopen(&options);
Build(10);
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_CompactMemTable();
ASSERT_EQ(1, Property("leveldb.num-files-at-level0"));
Corrupt(kTableFile, 100, 1);
Check(9, 9);
// Write must eventually fail because of corrupted table
Status s;
std::string tmp1, tmp2;
for (int i = 0; i < 10000 && s.ok(); i++) {
s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2));
}
ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db";
}
TEST(CorruptionTest, UnrelatedKeys) {
Build(10);
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_CompactMemTable();
Corrupt(kTableFile, 100, 1);
std::string tmp1, tmp2;
ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2)));
std::string v;
ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
dbi->TEST_CompactMemTable();
ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
}
}
int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}

376
db/db_bench.cc Normal file
View File

@@ -0,0 +1,376 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <sys/types.h>
#include <stdio.h>
#include <stdlib.h>
#include "db/db_impl.h"
#include "db/version_set.h"
#include "include/cache.h"
#include "include/db.h"
#include "include/env.h"
#include "include/write_batch.h"
#include "util/histogram.h"
#include "util/random.h"
#include "util/testutil.h"
// Comma-separated list of operations to run in the specified order
// Actual benchmarks:
// writeseq -- write N values in sequential key order
// writerandom -- write N values in random key order
// writebig -- write N/1000 100K valuesin random order
// readseq -- read N values sequentially
// readrandom -- read N values in random order
// Meta operations:
// compact -- Compact the entire DB
// heapprofile -- Dump a heap profile (if supported by this port)
// sync -- switch to synchronous writes (not the default)
// nosync -- switch to asynchronous writes (the default)
// tenth -- divide N by 10 (i.e., following benchmarks are smaller)
// normal -- reset N back to its normal value (1000000)
static const char* FLAGS_benchmarks =
"writeseq,"
"writeseq,"
"writerandom,"
"sync,tenth,tenth,writerandom,nosync,normal,"
"readseq,"
"readrandom,"
"compact,"
"readseq,"
"readrandom,"
"writebig";
// Number of key/values to place in database
static int FLAGS_num = 1000000;
// Size of each value
static int FLAGS_value_size = 100;
// Arrange to generate values that shrink to this fraction of
// their original size after compression
static double FLAGS_compression_ratio = 0.25;
// Print histogram of operation timings
static bool FLAGS_histogram = false;
// Number of bytes to buffer in memtable before compacting
static int FLAGS_write_buffer_size = 1 << 20;
namespace leveldb {
// Helper for quickly generating random data.
namespace {
class RandomGenerator {
private:
std::string data_;
int pos_;
public:
RandomGenerator() {
// We use a limited amount of data over and over again and ensure
// that it is larger than the compression window (32KB), and also
// large enough to serve all typical value sizes we want to write.
Random rnd(301);
std::string piece;
while (data_.size() < 1048576) {
// Add a short fragment that is as compressible as specified
// by FLAGS_compression_ratio.
test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece);
data_.append(piece);
}
pos_ = 0;
}
Slice Generate(int len) {
if (pos_ + len > data_.size()) {
pos_ = 0;
assert(len < data_.size());
}
pos_ += len;
return Slice(data_.data() + pos_ - len, len);
}
};
}
class Benchmark {
private:
Cache* cache_;
DB* db_;
int num_;
bool sync_;
int heap_counter_;
double start_;
double last_op_finish_;
int64_t bytes_;
std::string message_;
Histogram hist_;
RandomGenerator gen_;
Random rand_;
// State kept for progress messages
int done_;
int next_report_; // When to report next
void Start() {
start_ = Env::Default()->NowMicros() * 1e-6;
bytes_ = 0;
message_.clear();
last_op_finish_ = start_;
hist_.Clear();
done_ = 0;
next_report_ = 100;
}
void FinishedSingleOp() {
if (FLAGS_histogram) {
double now = Env::Default()->NowMicros() * 1e-6;
double micros = (now - last_op_finish_) * 1e6;
hist_.Add(micros);
if (micros > 20000) {
fprintf(stderr, "long op: %.1f micros%30s\r", micros, "");
fflush(stderr);
}
last_op_finish_ = now;
}
done_++;
if (done_ >= next_report_) {
if (next_report_ < 1000) {
next_report_ += 100;
} else if (next_report_ < 10000) {
next_report_ += 1000;
} else if (next_report_ < 100000) {
next_report_ += 10000;
} else {
next_report_ += 100000;
}
fprintf(stderr, "... finished %d ops%30s\r", done_, "");
fflush(stderr);
}
}
void Stop(const Slice& name) {
double finish = Env::Default()->NowMicros() * 1e-6;
// Pretend at least one op was done in case we are running a benchmark
// that does nto call FinishedSingleOp().
if (done_ < 1) done_ = 1;
if (bytes_ > 0) {
char rate[100];
snprintf(rate, sizeof(rate), "%5.1f MB/s",
(bytes_ / 1048576.0) / (finish - start_));
if (!message_.empty()) {
message_.push_back(' ');
}
message_.append(rate);
}
fprintf(stdout, "%-12s : %10.3f micros/op;%s%s\n",
name.ToString().c_str(),
(finish - start_) * 1e6 / done_,
(message_.empty() ? "" : " "),
message_.c_str());
if (FLAGS_histogram) {
fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str());
}
fflush(stdout);
}
public:
enum Order { SEQUENTIAL, RANDOM };
Benchmark() : cache_(NewLRUCache(200<<20)),
db_(NULL),
num_(FLAGS_num),
sync_(false),
heap_counter_(0),
bytes_(0),
rand_(301) {
std::vector<std::string> files;
Env::Default()->GetChildren("/tmp/dbbench", &files);
for (int i = 0; i < files.size(); i++) {
if (Slice(files[i]).starts_with("heap-")) {
Env::Default()->DeleteFile("/tmp/dbbench/" + files[i]);
}
}
DestroyDB("/tmp/dbbench", Options());
}
~Benchmark() {
delete db_;
delete cache_;
}
void Run() {
Options options;
options.create_if_missing = true;
options.max_open_files = 10000;
options.block_cache = cache_;
options.write_buffer_size = FLAGS_write_buffer_size;
Start();
Status s = DB::Open(options, "/tmp/dbbench", &db_);
Stop("open");
if (!s.ok()) {
fprintf(stderr, "open error: %s\n", s.ToString().c_str());
exit(1);
}
const char* benchmarks = FLAGS_benchmarks;
while (benchmarks != NULL) {
const char* sep = strchr(benchmarks, ',');
Slice name;
if (sep == NULL) {
name = benchmarks;
benchmarks = NULL;
} else {
name = Slice(benchmarks, sep - benchmarks);
benchmarks = sep + 1;
}
Start();
if (name == Slice("writeseq")) {
Write(SEQUENTIAL, num_, FLAGS_value_size);
} else if (name == Slice("writerandom")) {
Write(RANDOM, num_, FLAGS_value_size);
} else if (name == Slice("writebig")) {
Write(RANDOM, num_ / 1000, 100 * 1000);
} else if (name == Slice("readseq")) {
Read(SEQUENTIAL);
} else if (name == Slice("readrandom")) {
Read(RANDOM);
} else if (name == Slice("compact")) {
Compact();
} else if (name == Slice("heapprofile")) {
HeapProfile();
} else if (name == Slice("sync")) {
sync_ = true;
} else if (name == Slice("nosync")) {
sync_ = false;
} else if (name == Slice("tenth")) {
num_ = num_ / 10;
} else if (name == Slice("normal")) {
num_ = FLAGS_num;
} else {
fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str());
}
Stop(name);
}
}
void Write(Order order, int num_entries, int value_size) {
WriteBatch batch;
Status s;
std::string val;
WriteOptions options;
options.sync = sync_;
for (int i = 0; i < num_entries; i++) {
const int k = (order == SEQUENTIAL) ? i : (rand_.Next() % FLAGS_num);
char key[100];
snprintf(key, sizeof(key), "%012d", k);
batch.Clear();
batch.Put(key, gen_.Generate(value_size));
s = db_->Write(options, &batch);
bytes_ += value_size + strlen(key);
if (!s.ok()) {
fprintf(stderr, "put error: %s\n", s.ToString().c_str());
exit(1);
}
FinishedSingleOp();
}
}
void Read(Order order) {
ReadOptions options;
if (order == SEQUENTIAL) {
Iterator* iter = db_->NewIterator(options);
int i = 0;
for (iter->SeekToFirst(); i < num_ && iter->Valid(); iter->Next()) {
bytes_ += iter->key().size() + iter->value().size();
FinishedSingleOp();
++i;
}
delete iter;
} else {
std::string value;
for (int i = 0; i < num_; i++) {
char key[100];
const int k = (order == SEQUENTIAL) ? i : (rand_.Next() % FLAGS_num);
snprintf(key, sizeof(key), "%012d", k);
db_->Get(options, key, &value);
FinishedSingleOp();
}
}
}
void Compact() {
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_CompactMemTable();
int max_level_with_files = 1;
for (int level = 1; level < config::kNumLevels; level++) {
uint64_t v;
char name[100];
snprintf(name, sizeof(name), "leveldb.num-files-at-level%d", level);
if (db_->GetProperty(name, &v) && v > 0) {
max_level_with_files = level;
}
}
for (int level = 0; level < max_level_with_files; level++) {
dbi->TEST_CompactRange(level, "", "~");
}
}
static void WriteToFile(void* arg, const char* buf, int n) {
reinterpret_cast<WritableFile*>(arg)->Append(Slice(buf, n));
}
void HeapProfile() {
char fname[100];
snprintf(fname, sizeof(fname), "/tmp/dbbench/heap-%04d", ++heap_counter_);
WritableFile* file;
Status s = Env::Default()->NewWritableFile(fname, &file);
if (!s.ok()) {
message_ = s.ToString();
return;
}
bool ok = port::GetHeapProfile(WriteToFile, file);
delete file;
if (!ok) {
message_ = "not supported";
Env::Default()->DeleteFile(fname);
}
}
};
}
int main(int argc, char** argv) {
for (int i = 1; i < argc; i++) {
double d;
int n;
char junk;
if (leveldb::Slice(argv[i]).starts_with("--benchmarks=")) {
FLAGS_benchmarks = argv[i] + strlen("--benchmarks=");
} else if (sscanf(argv[i], "--compression_ratio=%lf%c", &d, &junk) == 1) {
FLAGS_compression_ratio = d;
} else if (sscanf(argv[i], "--histogram=%d%c", &n, &junk) == 1 &&
(n == 0 || n == 1)) {
FLAGS_histogram = n;
} else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) {
FLAGS_num = n;
} else if (sscanf(argv[i], "--value_size=%d%c", &n, &junk) == 1) {
FLAGS_value_size = n;
} else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) {
FLAGS_write_buffer_size = n;
} else {
fprintf(stderr, "Invalid flag '%s'\n", argv[i]);
exit(1);
}
}
leveldb::Benchmark benchmark;
benchmark.Run();
return 0;
}

1195
db/db_impl.cc Normal file

File diff suppressed because it is too large Load Diff

192
db/db_impl.h Normal file
View File

@@ -0,0 +1,192 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_DB_DB_IMPL_H_
#define STORAGE_LEVELDB_DB_DB_IMPL_H_
#include <set>
#include "db/dbformat.h"
#include "db/log_writer.h"
#include "db/snapshot.h"
#include "include/db.h"
#include "include/env.h"
#include "port/port.h"
namespace leveldb {
class MemTable;
class TableCache;
class Version;
class VersionEdit;
class VersionSet;
class DBImpl : public DB {
public:
DBImpl(const Options& options, const std::string& dbname);
virtual ~DBImpl();
// Implementations of the DB interface
virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value);
virtual Status Delete(const WriteOptions&, const Slice& key);
virtual Status Write(const WriteOptions& options, WriteBatch* updates);
virtual Status Get(const ReadOptions& options,
const Slice& key,
std::string* value);
virtual Iterator* NewIterator(const ReadOptions&);
virtual const Snapshot* GetSnapshot();
virtual void ReleaseSnapshot(const Snapshot* snapshot);
virtual bool GetProperty(const Slice& property, uint64_t* value);
virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes);
// Extra methods (for testing) that are not in the public DB interface
// Compact any files in the named level that overlap [begin,end]
void TEST_CompactRange(
int level,
const std::string& begin,
const std::string& end);
// Force current memtable contents to be compacted.
Status TEST_CompactMemTable();
// Return an internal iterator over the current state of the database.
// The keys of this iterator are internal keys (see format.h).
// The returned iterator should be deleted when no longer needed.
Iterator* TEST_NewInternalIterator();
private:
friend class DB;
Iterator* NewInternalIterator(const ReadOptions&,
SequenceNumber* latest_snapshot);
Status NewDB();
// Recover the descriptor from persistent storage. May do a significant
// amount of work to recover recently logged updates. Any changes to
// be made to the descriptor are added to *edit.
Status Recover(VersionEdit* edit);
// Apply the specified updates and save the resulting descriptor to
// persistent storage. If cleanup_mem is non-NULL, arrange to
// delete it when all existing snapshots have gone away iff Install()
// returns OK.
Status Install(VersionEdit* edit,
uint64_t new_log_number,
MemTable* cleanup_mem);
void MaybeIgnoreError(Status* s) const;
// Delete any unneeded files and stale in-memory entries.
void DeleteObsoleteFiles();
// Called when an iterator over a particular version of the
// descriptor goes away.
static void Unref(void* arg1, void* arg2);
// Compact the in-memory write buffer to disk. Switches to a new
// log-file/memtable and writes a new descriptor iff successful.
Status CompactMemTable();
Status RecoverLogFile(uint64_t log_number,
VersionEdit* edit,
SequenceNumber* max_sequence);
Status WriteLevel0Table(MemTable* mem, VersionEdit* edit);
bool HasLargeValues(const WriteBatch& batch) const;
// Process data in "*updates" and return a status. "assigned_seq"
// is the sequence number assigned to the first mod in "*updates".
// If no large values are encountered, "*final" is set to "updates".
// If large values were encountered, registers the references of the
// large values with the VersionSet, writes the large values to
// files (if appropriate), and allocates a new WriteBatch with the
// large values replaced with indirect references and stores a
// pointer to the new WriteBatch in *final. If *final != updates on
// return, then the client should delete *final when no longer
// needed. Returns OK on success, and an appropriate error
// otherwise.
Status HandleLargeValues(SequenceNumber assigned_seq,
WriteBatch* updates,
WriteBatch** final);
// Helper routine for HandleLargeValues
void MaybeCompressLargeValue(
const Slice& raw_value,
Slice* file_bytes,
std::string* scratch,
LargeValueRef* ref);
struct CompactionState;
void MaybeScheduleCompaction();
static void BGWork(void* db);
void BackgroundCall();
void BackgroundCompaction();
void CleanupCompaction(CompactionState* compact);
Status DoCompactionWork(CompactionState* compact);
Status OpenCompactionOutputFile(CompactionState* compact);
Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input);
Status InstallCompactionResults(CompactionState* compact);
// Constant after construction
Env* const env_;
const InternalKeyComparator internal_comparator_;
const Options options_; // options_.comparator == &internal_comparator_
bool owns_info_log_;
const std::string dbname_;
// table_cache_ provides its own synchronization
TableCache* table_cache_;
// Lock over the persistent DB state. Non-NULL iff successfully acquired.
FileLock* db_lock_;
// State below is protected by mutex_
port::Mutex mutex_;
port::AtomicPointer shutting_down_;
port::CondVar bg_cv_; // Signalled when !bg_compaction_scheduled_
port::CondVar compacting_cv_; // Signalled when !compacting_
SequenceNumber last_sequence_;
MemTable* mem_;
WritableFile* logfile_;
log::Writer* log_;
uint64_t log_number_;
SnapshotList snapshots_;
// Set of table files to protect from deletion because they are
// part of ongoing compactions.
std::set<uint64_t> pending_outputs_;
// Has a background compaction been scheduled or is running?
bool bg_compaction_scheduled_;
// Is there a compaction running?
bool compacting_;
VersionSet* versions_;
// Have we encountered a background error in paranoid mode?
Status bg_error_;
// No copying allowed
DBImpl(const DBImpl&);
void operator=(const DBImpl&);
const Comparator* user_comparator() const {
return internal_comparator_.user_comparator();
}
};
// Sanitize db options. The caller should delete result.info_log if
// it is not equal to src.info_log.
extern Options SanitizeOptions(const std::string& db,
const InternalKeyComparator* icmp,
const Options& src);
}
#endif // STORAGE_LEVELDB_DB_DB_IMPL_H_

412
db/db_iter.cc Normal file
View File

@@ -0,0 +1,412 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/db_iter.h"
#include "db/filename.h"
#include "db/dbformat.h"
#include "include/env.h"
#include "include/iterator.h"
#include "port/port.h"
#include "util/logging.h"
#include "util/mutexlock.h"
namespace leveldb {
#if 0
static void DumpInternalIter(Iterator* iter) {
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
ParsedInternalKey k;
if (!ParseInternalKey(iter->key(), &k)) {
fprintf(stderr, "Corrupt '%s'\n", EscapeString(iter->key()).c_str());
} else {
fprintf(stderr, "@ '%s'\n", k.DebugString().c_str());
}
}
}
#endif
namespace {
// Memtables and sstables that make the DB representation contain
// (userkey,seq,type) => uservalue entries. DBIter
// combines multiple entries for the same userkey found in the DB
// representation into a single entry while accounting for sequence
// numbers, deletion markers, overwrites, etc.
class DBIter: public Iterator {
public:
DBIter(const std::string* dbname, Env* env,
const Comparator* cmp, Iterator* iter, SequenceNumber s)
: dbname_(dbname),
env_(env),
user_comparator_(cmp),
iter_(iter),
sequence_(s),
large_(NULL),
valid_(false) {
}
virtual ~DBIter() {
delete iter_;
delete large_;
}
virtual bool Valid() const { return valid_; }
virtual Slice key() const {
assert(valid_);
return key_;
}
virtual Slice value() const {
assert(valid_);
if (large_ == NULL) {
return value_;
} else {
MutexLock l(&large_->mutex);
if (!large_->produced) {
ReadIndirectValue();
}
return large_->value;
}
}
virtual void Next() {
assert(valid_);
// iter_ is already positioned past DBIter::key()
FindNextUserEntry();
}
virtual void Prev() {
assert(valid_);
bool ignored;
ScanUntilBeforeCurrentKey(&ignored);
FindPrevUserEntry();
}
virtual void Seek(const Slice& target) {
ParsedInternalKey ikey(target, sequence_, kValueTypeForSeek);
std::string tmp;
AppendInternalKey(&tmp, ikey);
iter_->Seek(tmp);
FindNextUserEntry();
}
virtual void SeekToFirst() {
iter_->SeekToFirst();
FindNextUserEntry();
}
virtual void SeekToLast();
virtual Status status() const {
if (status_.ok()) {
if (large_ != NULL && !large_->status.ok()) return large_->status;
return iter_->status();
} else {
return status_;
}
}
private:
void FindNextUserEntry();
void FindPrevUserEntry();
void SaveKey(const Slice& k) { key_.assign(k.data(), k.size()); }
void SaveValue(const Slice& v) {
if (value_.capacity() > v.size() + 1048576) {
std::string empty;
swap(empty, value_);
}
value_.assign(v.data(), v.size());
}
bool ParseKey(ParsedInternalKey* key);
void SkipPast(const Slice& k);
void ScanUntilBeforeCurrentKey(bool* found_live);
void ReadIndirectValue() const;
struct Large {
port::Mutex mutex;
std::string value;
bool produced;
Status status;
};
const std::string* const dbname_;
Env* const env_;
const Comparator* const user_comparator_;
// iter_ is positioned just past current entry for DBIter if valid_
Iterator* const iter_;
SequenceNumber const sequence_;
Status status_;
std::string key_; // Always a user key
std::string value_;
Large* large_; // Non-NULL if value is an indirect reference
bool valid_;
// No copying allowed
DBIter(const DBIter&);
void operator=(const DBIter&);
};
inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
if (!ParseInternalKey(iter_->key(), ikey)) {
status_ = Status::Corruption("corrupted internal key in DBIter");
return false;
} else {
return true;
}
}
void DBIter::FindNextUserEntry() {
if (large_ != NULL) {
if (status_.ok() && !large_->status.ok()) {
status_ = large_->status;
}
delete large_;
large_ = NULL;
}
while (iter_->Valid()) {
ParsedInternalKey ikey;
if (!ParseKey(&ikey)) {
// Skip past corrupted entry
iter_->Next();
continue;
}
if (ikey.sequence > sequence_) {
// Ignore entries newer than the snapshot
iter_->Next();
continue;
}
switch (ikey.type) {
case kTypeDeletion:
SaveKey(ikey.user_key); // Make local copy for use by SkipPast()
iter_->Next();
SkipPast(key_);
// Do not return deleted entries. Instead keep looping.
break;
case kTypeValue:
SaveKey(ikey.user_key);
SaveValue(iter_->value());
iter_->Next();
SkipPast(key_);
// Yield the value we just found.
valid_ = true;
return;
case kTypeLargeValueRef:
SaveKey(ikey.user_key);
// Save the large value ref as value_, and read it lazily on a call
// to value()
SaveValue(iter_->value());
large_ = new Large;
large_->produced = false;
iter_->Next();
SkipPast(key_);
// Yield the value we just found.
valid_ = true;
return;
}
}
valid_ = false;
key_.clear();
value_.clear();
assert(large_ == NULL);
}
void DBIter::SkipPast(const Slice& k) {
while (iter_->Valid()) {
ParsedInternalKey ikey;
// Note that if we cannot parse an internal key, we keep looping
// so that if we have a run like the following:
// <x,100,v> => value100
// <corrupted entry for user key x>
// <x,50,v> => value50
// we will skip over the corrupted entry as well as value50.
if (ParseKey(&ikey) && user_comparator_->Compare(ikey.user_key, k) != 0) {
break;
}
iter_->Next();
}
}
void DBIter::SeekToLast() {
// Position iter_ at the last uncorrupted user key and then
// let FindPrevUserEntry() do the heavy lifting to find
// a user key that is live.
iter_->SeekToLast();
ParsedInternalKey current;
while (iter_->Valid() && !ParseKey(&current)) {
iter_->Prev();
}
if (iter_->Valid()) {
SaveKey(current.user_key);
}
FindPrevUserEntry();
}
// Let X be the user key at which iter_ is currently positioned.
// Adjust DBIter to point at the last entry with a key <= X that
// has a live value.
void DBIter::FindPrevUserEntry() {
// Consider the following example:
//
// A@540
// A@400
//
// B@300
// B@200
// B@100 <- iter_
//
// C@301
// C@201
//
// The comments marked "(first iteration)" below relate what happens
// for the preceding example in the first iteration of the while loop
// below. There may be more than one iteration either if there are
// no live values for B, or if there is a corruption.
while (iter_->Valid()) {
std::string saved = key_;
bool found_live;
ScanUntilBeforeCurrentKey(&found_live);
// (first iteration) iter_ at A@400
if (found_live) {
// Step forward into range of entries with user key >= saved
if (!iter_->Valid()) {
iter_->SeekToFirst();
} else {
iter_->Next();
}
// (first iteration) iter_ at B@300
FindNextUserEntry(); // Sets key_ to the key of the next value it found
if (valid_ && user_comparator_->Compare(key_, saved) == 0) {
// (first iteration) iter_ at C@301
return;
}
// FindNextUserEntry() could not find any entries under the
// user key "saved". This is probably a corruption since
// ScanUntilBefore(saved) found a live value. So we skip
// backwards to an earlier key and ignore the corrupted
// entries for "saved".
//
// (first iteration) iter_ at C@301 and saved == "B"
key_ = saved;
bool ignored;
ScanUntilBeforeCurrentKey(&ignored);
// (first iteration) iter_ at A@400
}
}
valid_ = false;
key_.clear();
value_.clear();
}
void DBIter::ScanUntilBeforeCurrentKey(bool* found_live) {
*found_live = false;
if (!iter_->Valid()) {
iter_->SeekToLast();
}
while (iter_->Valid()) {
ParsedInternalKey current;
if (!ParseKey(&current)) {
iter_->Prev();
continue;
}
if (current.sequence > sequence_) {
// Ignore entries that are serialized after this read
iter_->Prev();
continue;
}
const int cmp = user_comparator_->Compare(current.user_key, key_);
if (cmp < 0) {
SaveKey(current.user_key);
return;
} else if (cmp == 0) {
switch (current.type) {
case kTypeDeletion:
*found_live = false;
break;
case kTypeValue:
case kTypeLargeValueRef:
*found_live = true;
break;
}
} else { // cmp > 0
*found_live = false;
}
iter_->Prev();
}
}
void DBIter::ReadIndirectValue() const {
assert(!large_->produced);
large_->produced = true;
LargeValueRef large_ref;
if (value_.size() != LargeValueRef::ByteSize()) {
large_->status = Status::Corruption("malformed large value reference");
return;
}
memcpy(large_ref.data, value_.data(), LargeValueRef::ByteSize());
std::string fname = LargeValueFileName(*dbname_, large_ref);
RandomAccessFile* file;
Status s = env_->NewRandomAccessFile(fname, &file);
if (s.ok()) {
uint64_t file_size = file->Size();
uint64_t value_size = large_ref.ValueSize();
large_->value.resize(value_size);
Slice result;
s = file->Read(0, file_size, &result,
const_cast<char*>(large_->value.data()));
if (s.ok()) {
if (result.size() == file_size) {
switch (large_ref.compression_type()) {
case kNoCompression: {
if (result.data() != large_->value.data()) {
large_->value.assign(result.data(), result.size());
}
break;
}
case kLightweightCompression: {
std::string uncompressed;
if (port::Lightweight_Uncompress(result.data(), result.size(),
&uncompressed) &&
uncompressed.size() == large_ref.ValueSize()) {
swap(uncompressed, large_->value);
} else {
s = Status::Corruption(
"Unable to read entire compressed large value file");
}
}
}
} else {
s = Status::Corruption("Unable to read entire large value file");
}
}
delete file; // Ignore errors on closing
}
if (!s.ok()) {
large_->value.clear();
large_->status = s;
}
}
} // anonymous namespace
Iterator* NewDBIterator(
const std::string* dbname,
Env* env,
const Comparator* user_key_comparator,
Iterator* internal_iter,
const SequenceNumber& sequence) {
return new DBIter(dbname, env, user_key_comparator, internal_iter, sequence);
}
}

26
db/db_iter.h Normal file
View File

@@ -0,0 +1,26 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_DB_DB_ITER_H_
#define STORAGE_LEVELDB_DB_DB_ITER_H_
#include <stdint.h>
#include "include/db.h"
#include "db/dbformat.h"
namespace leveldb {
// Return a new iterator that converts internal keys (yielded by
// "*internal_iter") that were live at the specified "sequence" number
// into appropriate user keys.
extern Iterator* NewDBIterator(
const std::string* dbname,
Env* env,
const Comparator* user_key_comparator,
Iterator* internal_iter,
const SequenceNumber& sequence);
}
#endif // STORAGE_LEVELDB_DB_DB_ITER_H_

963
db/db_test.cc Normal file
View File

@@ -0,0 +1,963 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "include/db.h"
#include "db/db_impl.h"
#include "db/filename.h"
#include "db/version_set.h"
#include "db/write_batch_internal.h"
#include "include/env.h"
#include "include/table.h"
#include "util/logging.h"
#include "util/testharness.h"
#include "util/testutil.h"
namespace leveldb {
static std::string RandomString(Random* rnd, int len) {
std::string r;
test::RandomString(rnd, len, &r);
return r;
}
class DBTest {
public:
std::string dbname_;
Env* env_;
DB* db_;
Options last_options_;
DBTest() : env_(Env::Default()) {
dbname_ = test::TmpDir() + "/db_test";
DestroyDB(dbname_, Options());
db_ = NULL;
Reopen();
}
~DBTest() {
delete db_;
DestroyDB(dbname_, Options());
}
DBImpl* dbfull() {
return reinterpret_cast<DBImpl*>(db_);
}
void Reopen(Options* options = NULL) {
ASSERT_OK(TryReopen(options));
}
void DestroyAndReopen(Options* options = NULL) {
delete db_;
db_ = NULL;
DestroyDB(dbname_, Options());
ASSERT_OK(TryReopen(options));
}
Status TryReopen(Options* options) {
delete db_;
db_ = NULL;
Options opts;
if (options != NULL) {
opts = *options;
} else {
opts.create_if_missing = true;
}
last_options_ = opts;
return DB::Open(opts, dbname_, &db_);
}
Status Put(const std::string& k, const std::string& v) {
WriteBatch batch;
batch.Put(k, v);
return db_->Write(WriteOptions(), &batch);
}
Status Delete(const std::string& k) {
WriteBatch batch;
batch.Delete(k);
return db_->Write(WriteOptions(), &batch);
}
std::string Get(const std::string& k, const Snapshot* snapshot = NULL) {
ReadOptions options;
options.snapshot = snapshot;
std::string result;
Status s = db_->Get(options, k, &result);
if (s.IsNotFound()) {
result = "NOT_FOUND";
} else if (!s.ok()) {
result = s.ToString();
}
return result;
}
std::string AllEntriesFor(const Slice& user_key) {
Iterator* iter = dbfull()->TEST_NewInternalIterator();
InternalKey target(user_key, kMaxSequenceNumber, kTypeValue);
iter->Seek(target.Encode());
std::string result;
if (!iter->status().ok()) {
result = iter->status().ToString();
} else {
result = "[ ";
bool first = true;
while (iter->Valid()) {
ParsedInternalKey ikey;
if (!ParseInternalKey(iter->key(), &ikey)) {
result += "CORRUPTED";
} else {
if (last_options_.comparator->Compare(
ikey.user_key, user_key) != 0) {
break;
}
if (!first) {
result += ", ";
}
first = false;
switch (ikey.type) {
case kTypeValue:
result += iter->value().ToString();
break;
case kTypeLargeValueRef:
result += "LARGEVALUE(" + EscapeString(iter->value()) + ")";
break;
case kTypeDeletion:
result += "DEL";
break;
}
}
iter->Next();
}
if (!first) {
result += " ";
}
result += "]";
}
delete iter;
return result;
}
int NumTableFilesAtLevel(int level) {
uint64_t val;
ASSERT_TRUE(
db_->GetProperty("leveldb.num-files-at-level" + NumberToString(level),
&val));
return val;
}
uint64_t Size(const Slice& start, const Slice& limit) {
Range r(start, limit);
uint64_t size;
db_->GetApproximateSizes(&r, 1, &size);
return size;
}
std::set<LargeValueRef> LargeValueFiles() const {
// Return the set of large value files that exist in the database
std::vector<std::string> filenames;
env_->GetChildren(dbname_, &filenames); // Ignoring errors on purpose
uint64_t number;
LargeValueRef large_ref;
FileType type;
std::set<LargeValueRef> live;
for (int i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &large_ref, &type) &&
type == kLargeValueFile) {
fprintf(stderr, " live: %s\n",
LargeValueRefToFilenameString(large_ref).c_str());
live.insert(large_ref);
}
}
fprintf(stderr, "Found %d live large value files\n", (int)live.size());
return live;
}
};
TEST(DBTest, Empty) {
ASSERT_TRUE(db_ != NULL);
ASSERT_EQ("NOT_FOUND", Get("foo"));
}
TEST(DBTest, ReadWrite) {
ASSERT_OK(Put("foo", "v1"));
ASSERT_EQ("v1", Get("foo"));
ASSERT_OK(Put("bar", "v2"));
ASSERT_OK(Put("foo", "v3"));
ASSERT_EQ("v3", Get("foo"));
ASSERT_EQ("v2", Get("bar"));
}
TEST(DBTest, PutDeleteGet) {
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
ASSERT_EQ("v1", Get("foo"));
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
ASSERT_EQ("v2", Get("foo"));
ASSERT_OK(db_->Delete(WriteOptions(), "foo"));
ASSERT_EQ("NOT_FOUND", Get("foo"));
}
TEST(DBTest, Recover) {
ASSERT_OK(Put("foo", "v1"));
ASSERT_OK(Put("baz", "v5"));
Reopen();
ASSERT_EQ("v1", Get("foo"));
ASSERT_EQ("v1", Get("foo"));
ASSERT_EQ("v5", Get("baz"));
ASSERT_OK(Put("bar", "v2"));
ASSERT_OK(Put("foo", "v3"));
Reopen();
ASSERT_EQ("v3", Get("foo"));
ASSERT_OK(Put("foo", "v4"));
ASSERT_EQ("v4", Get("foo"));
ASSERT_EQ("v2", Get("bar"));
ASSERT_EQ("v5", Get("baz"));
}
TEST(DBTest, RecoveryWithEmptyLog) {
ASSERT_OK(Put("foo", "v1"));
ASSERT_OK(Put("foo", "v2"));
Reopen();
Reopen();
ASSERT_OK(Put("foo", "v3"));
Reopen();
ASSERT_EQ("v3", Get("foo"));
}
static std::string Key(int i) {
char buf[100];
snprintf(buf, sizeof(buf), "key%06d", i);
return std::string(buf);
}
TEST(DBTest, MinorCompactionsHappen) {
Options options;
options.write_buffer_size = 10000;
Reopen(&options);
const int N = 100;
int starting_num_tables = NumTableFilesAtLevel(0);
for (int i = 0; i < N; i++) {
ASSERT_OK(Put(Key(i), Key(i) + std::string(1000, 'v')));
}
int ending_num_tables = NumTableFilesAtLevel(0);
ASSERT_GT(ending_num_tables, starting_num_tables);
for (int i = 0; i < N; i++) {
ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i)));
}
Reopen();
for (int i = 0; i < N; i++) {
ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i)));
}
}
TEST(DBTest, RecoverWithLargeLog) {
{
Options options;
options.large_value_threshold = 1048576;
Reopen(&options);
ASSERT_OK(Put("big1", std::string(200000, '1')));
ASSERT_OK(Put("big2", std::string(200000, '2')));
ASSERT_OK(Put("small3", std::string(10, '3')));
ASSERT_OK(Put("small4", std::string(10, '4')));
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
}
// Make sure that if we re-open with a small write buffer size that
// we flush table files in the middle of a large log file.
Options options;
options.write_buffer_size = 100000;
options.large_value_threshold = 1048576;
Reopen(&options);
ASSERT_EQ(NumTableFilesAtLevel(0), 3);
ASSERT_EQ(std::string(200000, '1'), Get("big1"));
ASSERT_EQ(std::string(200000, '2'), Get("big2"));
ASSERT_EQ(std::string(10, '3'), Get("small3"));
ASSERT_EQ(std::string(10, '4'), Get("small4"));
ASSERT_GT(NumTableFilesAtLevel(0), 1);
}
TEST(DBTest, CompactionsGenerateMultipleFiles) {
Options options;
options.write_buffer_size = 100000000; // Large write buffer
options.large_value_threshold = 1048576;
Reopen(&options);
Random rnd(301);
// Write 8MB (80 values, each 100K)
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
std::vector<std::string> values;
for (int i = 0; i < 80; i++) {
values.push_back(RandomString(&rnd, 100000));
ASSERT_OK(Put(Key(i), values[i]));
}
// Reopening moves updates to level-0
Reopen(&options);
dbfull()->TEST_CompactRange(0, "", Key(100000));
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
ASSERT_GT(NumTableFilesAtLevel(1), 1);
for (int i = 0; i < 80; i++) {
ASSERT_EQ(Get(Key(i)), values[i]);
}
}
static bool Between(uint64_t val, uint64_t low, uint64_t high) {
bool result = (val >= low) && (val <= high);
if (!result) {
fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n",
(unsigned long long)(val),
(unsigned long long)(low),
(unsigned long long)(high));
}
return result;
}
TEST(DBTest, ApproximateSizes) {
for (int test = 0; test < 2; test++) {
// test==0: default large_value_threshold
// test==1: 1 MB large_value_threshold
Options options;
options.large_value_threshold = (test == 0) ? 65536 : 1048576;
options.write_buffer_size = 100000000; // Large write buffer
options.compression = kNoCompression;
DestroyAndReopen();
ASSERT_TRUE(Between(Size("", "xyz"), 0, 0));
Reopen(&options);
ASSERT_TRUE(Between(Size("", "xyz"), 0, 0));
// Write 8MB (80 values, each 100K)
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
const int N = 80;
Random rnd(301);
for (int i = 0; i < N; i++) {
ASSERT_OK(Put(Key(i), RandomString(&rnd, 100000)));
}
if (test == 1) {
// 0 because GetApproximateSizes() does not account for memtable space for
// non-large values
ASSERT_TRUE(Between(Size("", Key(50)), 0, 0));
} else {
ASSERT_TRUE(Between(Size("", Key(50)), 100000*50, 100000*50 + 10000));
ASSERT_TRUE(Between(Size(Key(20), Key(30)),
100000*10, 100000*10 + 10000));
}
// Check sizes across recovery by reopening a few times
for (int run = 0; run < 3; run++) {
Reopen(&options);
for (int compact_start = 0; compact_start < N; compact_start += 10) {
for (int i = 0; i < N; i += 10) {
ASSERT_TRUE(Between(Size("", Key(i)), 100000*i, 100000*i + 10000));
ASSERT_TRUE(Between(Size("", Key(i)+".suffix"),
100000 * (i+1), 100000 * (i+1) + 10000));
ASSERT_TRUE(Between(Size(Key(i), Key(i+10)),
100000 * 10, 100000 * 10 + 10000));
}
ASSERT_TRUE(Between(Size("", Key(50)), 5000000, 5010000));
ASSERT_TRUE(Between(Size("", Key(50)+".suffix"), 5100000, 5110000));
dbfull()->TEST_CompactRange(0,
Key(compact_start),
Key(compact_start + 9));
}
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
ASSERT_GT(NumTableFilesAtLevel(1), 0);
}
}
}
TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
Options options;
options.large_value_threshold = 65536;
options.compression = kNoCompression;
Reopen();
Random rnd(301);
std::string big1 = RandomString(&rnd, 100000);
ASSERT_OK(Put(Key(0), RandomString(&rnd, 10000)));
ASSERT_OK(Put(Key(1), RandomString(&rnd, 10000)));
ASSERT_OK(Put(Key(2), big1));
ASSERT_OK(Put(Key(3), RandomString(&rnd, 10000)));
ASSERT_OK(Put(Key(4), big1));
ASSERT_OK(Put(Key(5), RandomString(&rnd, 10000)));
ASSERT_OK(Put(Key(6), RandomString(&rnd, 300000)));
ASSERT_OK(Put(Key(7), RandomString(&rnd, 10000)));
// Check sizes across recovery by reopening a few times
for (int run = 0; run < 3; run++) {
Reopen(&options);
ASSERT_TRUE(Between(Size("", Key(0)), 0, 0));
ASSERT_TRUE(Between(Size("", Key(1)), 10000, 11000));
ASSERT_TRUE(Between(Size("", Key(2)), 20000, 21000));
ASSERT_TRUE(Between(Size("", Key(3)), 120000, 121000));
ASSERT_TRUE(Between(Size("", Key(4)), 130000, 131000));
ASSERT_TRUE(Between(Size("", Key(5)), 230000, 231000));
ASSERT_TRUE(Between(Size("", Key(6)), 240000, 241000));
ASSERT_TRUE(Between(Size("", Key(7)), 540000, 541000));
ASSERT_TRUE(Between(Size("", Key(8)), 550000, 551000));
ASSERT_TRUE(Between(Size(Key(3), Key(5)), 110000, 111000));
dbfull()->TEST_CompactRange(0, Key(0), Key(100));
}
}
TEST(DBTest, IteratorPinsRef) {
Put("foo", "hello");
// Get iterator that will yield the current contents of the DB.
Iterator* iter = db_->NewIterator(ReadOptions());
// Write to force compactions
Put("foo", "newvalue1");
for (int i = 0; i < 100; i++) {
ASSERT_OK(Put(Key(i), Key(i) + std::string(100000, 'v'))); // 100K values
}
Put("foo", "newvalue2");
iter->SeekToFirst();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("foo", iter->key().ToString());
ASSERT_EQ("hello", iter->value().ToString());
iter->Next();
ASSERT_TRUE(!iter->Valid());
delete iter;
}
TEST(DBTest, Snapshot) {
Put("foo", "v1");
const Snapshot* s1 = db_->GetSnapshot();
Put("foo", "v2");
const Snapshot* s2 = db_->GetSnapshot();
Put("foo", "v3");
const Snapshot* s3 = db_->GetSnapshot();
Put("foo", "v4");
ASSERT_EQ("v1", Get("foo", s1));
ASSERT_EQ("v2", Get("foo", s2));
ASSERT_EQ("v3", Get("foo", s3));
ASSERT_EQ("v4", Get("foo"));
db_->ReleaseSnapshot(s3);
ASSERT_EQ("v1", Get("foo", s1));
ASSERT_EQ("v2", Get("foo", s2));
ASSERT_EQ("v4", Get("foo"));
db_->ReleaseSnapshot(s1);
ASSERT_EQ("v2", Get("foo", s2));
ASSERT_EQ("v4", Get("foo"));
db_->ReleaseSnapshot(s2);
ASSERT_EQ("v4", Get("foo"));
}
TEST(DBTest, HiddenValuesAreRemoved) {
Random rnd(301);
std::string big = RandomString(&rnd, 50000);
Put("foo", big);
Put("pastfoo", "v");
const Snapshot* snapshot = db_->GetSnapshot();
Put("foo", "tiny");
Put("pastfoo2", "v2"); // Advance sequence number one more
ASSERT_OK(dbfull()->TEST_CompactMemTable());
ASSERT_GT(NumTableFilesAtLevel(0), 0);
ASSERT_EQ(big, Get("foo", snapshot));
ASSERT_TRUE(Between(Size("", "pastfoo"), 50000, 60000));
db_->ReleaseSnapshot(snapshot);
ASSERT_EQ(AllEntriesFor("foo"), "[ tiny, " + big + " ]");
dbfull()->TEST_CompactRange(0, "", "x");
ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]");
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
ASSERT_GE(NumTableFilesAtLevel(1), 1);
dbfull()->TEST_CompactRange(1, "", "x");
ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]");
ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000));
}
TEST(DBTest, DeletionMarkers1) {
Put("foo", "v1");
ASSERT_OK(dbfull()->TEST_CompactMemTable());
dbfull()->TEST_CompactRange(0, "", "z");
dbfull()->TEST_CompactRange(1, "", "z");
ASSERT_EQ(NumTableFilesAtLevel(2), 1); // foo => v1 is now in level 2 file
Delete("foo");
Put("foo", "v2");
ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]");
ASSERT_OK(dbfull()->TEST_CompactMemTable());
ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]");
dbfull()->TEST_CompactRange(0, "", "z");
// DEL eliminated, but v1 remains because we aren't compacting that level
// (DEL can be eliminated because v2 hides v1).
ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]");
dbfull()->TEST_CompactRange(1, "", "z");
// Merging L1 w/ L2, so we are the base level for "foo", so DEL is removed.
// (as is v1).
ASSERT_EQ(AllEntriesFor("foo"), "[ v2 ]");
}
TEST(DBTest, DeletionMarkers2) {
Put("foo", "v1");
ASSERT_OK(dbfull()->TEST_CompactMemTable());
dbfull()->TEST_CompactRange(0, "", "z");
dbfull()->TEST_CompactRange(1, "", "z");
ASSERT_EQ(NumTableFilesAtLevel(2), 1); // foo => v1 is now in level 2 file
Delete("foo");
ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
ASSERT_OK(dbfull()->TEST_CompactMemTable());
ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
dbfull()->TEST_CompactRange(0, "", "z");
// DEL kept: L2 file overlaps
ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
dbfull()->TEST_CompactRange(1, "", "z");
// Merging L1 w/ L2, so we are the base level for "foo", so DEL is removed.
// (as is v1).
ASSERT_EQ(AllEntriesFor("foo"), "[ ]");
}
TEST(DBTest, ComparatorCheck) {
class NewComparator : public Comparator {
public:
virtual const char* Name() const { return "leveldb.NewComparator"; }
virtual int Compare(const Slice& a, const Slice& b) const {
return BytewiseComparator()->Compare(a, b);
}
virtual void FindShortestSeparator(std::string* s, const Slice& l) const {
BytewiseComparator()->FindShortestSeparator(s, l);
}
virtual void FindShortSuccessor(std::string* key) const {
BytewiseComparator()->FindShortSuccessor(key);
}
};
NewComparator cmp;
Options new_options;
new_options.comparator = &cmp;
Status s = TryReopen(&new_options);
ASSERT_TRUE(!s.ok());
ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos)
<< s.ToString();
}
static bool LargeValuesOK(DBTest* db,
const std::set<LargeValueRef>& expected) {
std::set<LargeValueRef> actual = db->LargeValueFiles();
if (actual.size() != expected.size()) {
fprintf(stderr, "Sets differ in size: %d vs %d\n",
(int)actual.size(), (int)expected.size());
return false;
}
for (std::set<LargeValueRef>::const_iterator it = expected.begin();
it != expected.end();
++it) {
if (actual.count(*it) != 1) {
fprintf(stderr, " key '%s' not found in actual set\n",
LargeValueRefToFilenameString(*it).c_str());
return false;
}
}
return true;
}
TEST(DBTest, LargeValues1) {
Options options;
options.large_value_threshold = 10000;
Reopen(&options);
Random rnd(301);
std::string big1;
test::CompressibleString(&rnd, 1.0, 100000, &big1); // Not compressible
std::set<LargeValueRef> expected;
ASSERT_OK(Put("big1", big1));
expected.insert(LargeValueRef::Make(big1, kNoCompression));
ASSERT_TRUE(LargeValuesOK(this, expected));
ASSERT_OK(Delete("big1"));
ASSERT_TRUE(LargeValuesOK(this, expected));
ASSERT_OK(dbfull()->TEST_CompactMemTable());
// No handling of deletion markers on memtable compactions, so big1 remains
ASSERT_TRUE(LargeValuesOK(this, expected));
dbfull()->TEST_CompactRange(0, "", "z");
expected.erase(LargeValueRef::Make(big1, kNoCompression));
ASSERT_TRUE(LargeValuesOK(this, expected));
}
TEST(DBTest, LargeValues2) {
Options options;
options.large_value_threshold = 10000;
Reopen(&options);
Random rnd(301);
std::string big1, big2;
test::CompressibleString(&rnd, 1.0, 20000, &big1); // Not compressible
test::CompressibleString(&rnd, 0.6, 40000, &big2); // Compressible
std::set<LargeValueRef> expected;
ASSERT_TRUE(LargeValuesOK(this, expected));
ASSERT_OK(Put("big1", big1));
expected.insert(LargeValueRef::Make(big1, kNoCompression));
ASSERT_EQ(big1, Get("big1"));
ASSERT_TRUE(LargeValuesOK(this, expected));
ASSERT_OK(Put("big2", big2));
ASSERT_EQ(big2, Get("big2"));
#if defined(LEVELDB_PLATFORM_POSIX) || defined(LEVELDB_PLATFORM_CHROMIUM)
// TODO(sanjay) Reenable after compression support is added
expected.insert(LargeValueRef::Make(big2, kNoCompression));
#else
expected.insert(LargeValueRef::Make(big2, kLightweightCompression));
#endif
ASSERT_TRUE(LargeValuesOK(this, expected));
ASSERT_OK(dbfull()->TEST_CompactMemTable());
ASSERT_TRUE(LargeValuesOK(this, expected));
dbfull()->TEST_CompactRange(0, "", "z");
ASSERT_TRUE(LargeValuesOK(this, expected));
ASSERT_OK(Put("big2", big2));
ASSERT_OK(Put("big2_b", big2));
ASSERT_EQ(big1, Get("big1"));
ASSERT_EQ(big2, Get("big2"));
ASSERT_EQ(big2, Get("big2_b"));
ASSERT_TRUE(LargeValuesOK(this, expected));
ASSERT_OK(Delete("big1"));
ASSERT_EQ("NOT_FOUND", Get("big1"));
ASSERT_TRUE(LargeValuesOK(this, expected));
ASSERT_OK(dbfull()->TEST_CompactMemTable());
ASSERT_TRUE(LargeValuesOK(this, expected));
dbfull()->TEST_CompactRange(0, "", "z");
expected.erase(LargeValueRef::Make(big1, kNoCompression));
ASSERT_TRUE(LargeValuesOK(this, expected));
dbfull()->TEST_CompactRange(1, "", "z");
ASSERT_OK(Delete("big2"));
ASSERT_EQ("NOT_FOUND", Get("big2"));
ASSERT_EQ(big2, Get("big2_b"));
ASSERT_OK(dbfull()->TEST_CompactMemTable());
ASSERT_TRUE(LargeValuesOK(this, expected));
dbfull()->TEST_CompactRange(0, "", "z");
ASSERT_TRUE(LargeValuesOK(this, expected));
// Make sure the large value refs survive a reload and compactions after
// the reload.
Reopen();
ASSERT_TRUE(LargeValuesOK(this, expected));
ASSERT_OK(Put("foo", "bar"));
ASSERT_OK(dbfull()->TEST_CompactMemTable());
dbfull()->TEST_CompactRange(0, "", "z");
ASSERT_TRUE(LargeValuesOK(this, expected));
}
TEST(DBTest, LargeValues3) {
// Make sure we don't compress values if
Options options;
options.large_value_threshold = 10000;
options.compression = kNoCompression;
Reopen(&options);
Random rnd(301);
std::string big1 = std::string(100000, 'x'); // Very compressible
std::set<LargeValueRef> expected;
ASSERT_OK(Put("big1", big1));
ASSERT_EQ(big1, Get("big1"));
expected.insert(LargeValueRef::Make(big1, kNoCompression));
ASSERT_TRUE(LargeValuesOK(this, expected));
}
TEST(DBTest, DBOpen_Options) {
std::string dbname = test::TmpDir() + "/db_options_test";
DestroyDB(dbname, Options());
// Does not exist, and create_if_missing == false: error
DB* db = NULL;
Options opts;
opts.create_if_missing = false;
Status s = DB::Open(opts, dbname, &db);
ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != NULL);
ASSERT_TRUE(db == NULL);
// Does not exist, and create_if_missing == true: OK
opts.create_if_missing = true;
s = DB::Open(opts, dbname, &db);
ASSERT_OK(s);
ASSERT_TRUE(db != NULL);
delete db;
db = NULL;
// Does exist, and error_if_exists == true: error
opts.create_if_missing = false;
opts.error_if_exists = true;
s = DB::Open(opts, dbname, &db);
ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != NULL);
ASSERT_TRUE(db == NULL);
// Does exist, and error_if_exists == false: OK
opts.create_if_missing = true;
opts.error_if_exists = false;
s = DB::Open(opts, dbname, &db);
ASSERT_OK(s);
ASSERT_TRUE(db != NULL);
delete db;
db = NULL;
}
class ModelDB: public DB {
public:
explicit ModelDB(const Options& options): options_(options) { }
~ModelDB() { }
virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v) {
return DB::Put(o, k, v);
}
virtual Status Delete(const WriteOptions& o, const Slice& key) {
return DB::Delete(o, key);
}
virtual Status Get(const ReadOptions& options,
const Slice& key, std::string* value) {
assert(false); // Not implemented
return Status::NotFound(key);
}
virtual Iterator* NewIterator(const ReadOptions& options) {
if (options.snapshot == NULL) {
KVMap* saved = new KVMap;
*saved = map_;
return new ModelIter(saved, true);
} else {
const KVMap* snapshot_state =
reinterpret_cast<const KVMap*>(options.snapshot->number_);
return new ModelIter(snapshot_state, false);
}
}
virtual const Snapshot* GetSnapshot() {
KVMap* saved = new KVMap;
*saved = map_;
return snapshots_.New(
reinterpret_cast<SequenceNumber>(saved));
}
virtual void ReleaseSnapshot(const Snapshot* snapshot) {
const KVMap* saved = reinterpret_cast<const KVMap*>(snapshot->number_);
delete saved;
snapshots_.Delete(snapshot);
}
virtual Status Write(const WriteOptions& options, WriteBatch* batch) {
assert(options.post_write_snapshot == NULL); // Not supported
for (WriteBatchInternal::Iterator it(*batch); !it.Done(); it.Next()) {
switch (it.op()) {
case kTypeValue:
map_[it.key().ToString()] = it.value().ToString();
break;
case kTypeLargeValueRef:
assert(false); // Should not occur
break;
case kTypeDeletion:
map_.erase(it.key().ToString());
break;
}
}
return Status::OK();
}
virtual bool GetProperty(const Slice& property, uint64_t* value) {
return false;
}
virtual void GetApproximateSizes(const Range* r, int n, uint64_t* sizes) {
for (int i = 0; i < n; i++) {
sizes[i] = 0;
}
}
private:
typedef std::map<std::string, std::string> KVMap;
class ModelIter: public Iterator {
public:
ModelIter(const KVMap* map, bool owned)
: map_(map), owned_(owned), iter_(map_->end()) {
}
~ModelIter() {
if (owned_) delete map_;
}
virtual bool Valid() const { return iter_ != map_->end(); }
virtual void SeekToFirst() { iter_ = map_->begin(); }
virtual void SeekToLast() {
if (map_->empty()) {
iter_ = map_->end();
} else {
iter_ = map_->find(map_->rbegin()->first);
}
}
virtual void Seek(const Slice& k) {
iter_ = map_->lower_bound(k.ToString());
}
virtual void Next() { ++iter_; }
virtual void Prev() { --iter_; }
virtual Slice key() const { return iter_->first; }
virtual Slice value() const { return iter_->second; }
virtual Status status() const { return Status::OK(); }
private:
const KVMap* const map_;
const bool owned_; // Do we own map_
KVMap::const_iterator iter_;
};
const Options options_;
KVMap map_;
SnapshotList snapshots_;
};
static std::string RandomKey(Random* rnd) {
int len = (rnd->OneIn(3)
? 1 // Short sometimes to encourage collisions
: (rnd->OneIn(100) ? rnd->Skewed(10) : rnd->Uniform(10)));
return test::RandomKey(rnd, len);
}
static bool CompareIterators(int step,
DB* model,
DB* db,
const Snapshot* model_snap,
const Snapshot* db_snap) {
ReadOptions options;
options.snapshot = model_snap;
Iterator* miter = model->NewIterator(options);
options.snapshot = db_snap;
Iterator* dbiter = db->NewIterator(options);
bool ok = true;
int count = 0;
for (miter->SeekToFirst(), dbiter->SeekToFirst();
ok && miter->Valid() && dbiter->Valid();
miter->Next(), dbiter->Next()) {
count++;
if (miter->key().compare(dbiter->key()) != 0) {
fprintf(stderr, "step %d: Key mismatch: '%s' vs. '%s'\n",
step,
EscapeString(miter->key()).c_str(),
EscapeString(dbiter->key()).c_str());
ok = false;
break;
}
if (miter->value().compare(dbiter->value()) != 0) {
fprintf(stderr, "step %d: Value mismatch for key '%s': '%s' vs. '%s'\n",
step,
EscapeString(miter->key()).c_str(),
EscapeString(miter->value()).c_str(),
EscapeString(miter->value()).c_str());
ok = false;
}
}
if (ok) {
if (miter->Valid() != dbiter->Valid()) {
fprintf(stderr, "step %d: Mismatch at end of iterators: %d vs. %d\n",
step, miter->Valid(), dbiter->Valid());
ok = false;
}
}
fprintf(stderr, "%d entries compared: ok=%d\n", count, ok);
delete miter;
delete dbiter;
return ok;
}
TEST(DBTest, Randomized) {
Random rnd(test::RandomSeed());
ModelDB model(last_options_);
const int N = 10000;
const Snapshot* model_snap = NULL;
const Snapshot* db_snap = NULL;
std::string k, v;
for (int step = 0; step < N; step++) {
if (step % 100 == 0) {
fprintf(stderr, "Step %d of %d\n", step, N);
}
int p = rnd.Uniform(100);
if (p < 45) { // Put
k = RandomKey(&rnd);
v = RandomString(&rnd,
rnd.OneIn(20)
? 100 + rnd.Uniform(100)
: rnd.Uniform(8));
ASSERT_OK(model.Put(WriteOptions(), k, v));
ASSERT_OK(db_->Put(WriteOptions(), k, v));
} else if (p < 90) { // Delete
k = RandomKey(&rnd);
ASSERT_OK(model.Delete(WriteOptions(), k));
ASSERT_OK(db_->Delete(WriteOptions(), k));
} else { // Multi-element batch
WriteBatch b;
const int num = rnd.Uniform(8);
for (int i = 0; i < num; i++) {
if (i == 0 || !rnd.OneIn(10)) {
k = RandomKey(&rnd);
} else {
// Periodically re-use the same key from the previous iter, so
// we have multiple entries in the write batch for the same key
}
if (rnd.OneIn(2)) {
v = RandomString(&rnd, rnd.Uniform(10));
b.Put(k, v);
} else {
b.Delete(k);
}
}
ASSERT_OK(model.Write(WriteOptions(), &b));
ASSERT_OK(db_->Write(WriteOptions(), &b));
}
if ((step % 100) == 0) {
ASSERT_TRUE(CompareIterators(step, &model, db_, NULL, NULL));
ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap));
// Save a snapshot from each DB this time that we'll use next
// time we compare things, to make sure the current state is
// preserved with the snapshot
if (model_snap != NULL) model.ReleaseSnapshot(model_snap);
if (db_snap != NULL) db_->ReleaseSnapshot(db_snap);
Reopen();
ASSERT_TRUE(CompareIterators(step, &model, db_, NULL, NULL));
model_snap = model.GetSnapshot();
db_snap = db_->GetSnapshot();
}
}
if (model_snap != NULL) model.ReleaseSnapshot(model_snap);
if (db_snap != NULL) db_->ReleaseSnapshot(db_snap);
}
}
int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}

152
db/dbformat.cc Normal file
View File

@@ -0,0 +1,152 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <stdio.h>
#include "db/dbformat.h"
#include "port/port.h"
#include "util/coding.h"
namespace leveldb {
static uint64_t PackSequenceAndType(uint64_t seq, ValueType t) {
assert(seq <= kMaxSequenceNumber);
assert(t <= kValueTypeForSeek);
return (seq << 8) | t;
}
void AppendInternalKey(std::string* result, const ParsedInternalKey& key) {
result->append(key.user_key.data(), key.user_key.size());
PutFixed64(result, PackSequenceAndType(key.sequence, key.type));
}
std::string ParsedInternalKey::DebugString() const {
char buf[50];
snprintf(buf, sizeof(buf), "' @ %llu : %d",
(unsigned long long) sequence,
int(type));
std::string result = "'";
result += user_key.ToString();
result += buf;
return result;
}
const char* InternalKeyComparator::Name() const {
return "leveldb.InternalKeyComparator";
}
int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const {
// Order by:
// increasing user key (according to user-supplied comparator)
// decreasing sequence number
// decreasing type (though sequence# should be enough to disambiguate)
int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
if (r == 0) {
const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8);
const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8);
if (anum > bnum) {
r = -1;
} else if (anum < bnum) {
r = +1;
}
}
return r;
}
void InternalKeyComparator::FindShortestSeparator(
std::string* start,
const Slice& limit) const {
// Attempt to shorten the user portion of the key
Slice user_start = ExtractUserKey(*start);
Slice user_limit = ExtractUserKey(limit);
std::string tmp(user_start.data(), user_start.size());
user_comparator_->FindShortestSeparator(&tmp, user_limit);
if (user_comparator_->Compare(*start, tmp) < 0) {
// User key has become larger. Tack on the earliest possible
// number to the shortened user key.
PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek));
assert(this->Compare(*start, tmp) < 0);
assert(this->Compare(tmp, limit) < 0);
start->swap(tmp);
}
}
void InternalKeyComparator::FindShortSuccessor(std::string* key) const {
Slice user_key = ExtractUserKey(*key);
std::string tmp(user_key.data(), user_key.size());
user_comparator_->FindShortSuccessor(&tmp);
if (user_comparator_->Compare(user_key, tmp) < 0) {
// User key has become larger. Tack on the earliest possible
// number to the shortened user key.
PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek));
assert(this->Compare(*key, tmp) < 0);
key->swap(tmp);
}
}
LargeValueRef LargeValueRef::Make(const Slice& value, CompressionType ctype) {
LargeValueRef result;
port::SHA1_Hash(value.data(), value.size(), &result.data[0]);
EncodeFixed64(&result.data[20], value.size());
result.data[28] = static_cast<unsigned char>(ctype);
return result;
}
std::string LargeValueRefToFilenameString(const LargeValueRef& h) {
assert(sizeof(h.data) == LargeValueRef::ByteSize());
assert(sizeof(h.data) == 29); // So we can hardcode the array size of buf
static const char tohex[] = "0123456789abcdef";
char buf[20*2];
for (int i = 0; i < 20; i++) {
buf[2*i] = tohex[(h.data[i] >> 4) & 0xf];
buf[2*i+1] = tohex[h.data[i] & 0xf];
}
std::string result = std::string(buf, sizeof(buf));
result += "-";
result += NumberToString(h.ValueSize());
result += "-";
result += NumberToString(static_cast<uint64_t>(h.compression_type()));
return result;
}
static uint32_t hexvalue(char c) {
if (c >= '0' && c <= '9') {
return c - '0';
} else if (c >= 'A' && c <= 'F') {
return 10 + c - 'A';
} else {
assert(c >= 'a' && c <= 'f');
return 10 + c - 'a';
}
}
bool FilenameStringToLargeValueRef(const Slice& s, LargeValueRef* h) {
Slice in = s;
if (in.size() < 40) {
return false;
}
for (int i = 0; i < 20; i++) {
if (!isxdigit(in[i*2]) || !isxdigit(in[i*2+1])) {
return false;
}
unsigned char c = (hexvalue(in[i*2])<<4) | hexvalue(in[i*2+1]);
h->data[i] = c;
}
in.remove_prefix(40);
uint64_t value_size, ctype;
if (ConsumeChar(&in, '-') &&
ConsumeDecimalNumber(&in, &value_size) &&
ConsumeChar(&in, '-') &&
ConsumeDecimalNumber(&in, &ctype) &&
in.empty() &&
(ctype <= kLightweightCompression)) {
EncodeFixed64(&h->data[20], value_size);
h->data[28] = static_cast<unsigned char>(ctype);
return true;
} else {
return false;
}
}
}

198
db/dbformat.h Normal file
View File

@@ -0,0 +1,198 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_DB_FORMAT_H_
#define STORAGE_LEVELDB_DB_FORMAT_H_
#include <stdio.h>
#include "include/comparator.h"
#include "include/db.h"
#include "include/slice.h"
#include "include/table_builder.h"
#include "util/coding.h"
#include "util/logging.h"
namespace leveldb {
class InternalKey;
// Value types encoded as the last component of internal keys.
// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
// data structures.
enum ValueType {
kTypeDeletion = 0x0,
kTypeValue = 0x1,
kTypeLargeValueRef = 0x2,
};
// kValueTypeForSeek defines the ValueType that should be passed when
// constructing a ParsedInternalKey object for seeking to a particular
// sequence number (since we sort sequence numbers in decreasing order
// and the value type is embedded as the low 8 bits in the sequence
// number in internal keys, we need to use the highest-numbered
// ValueType, not the lowest).
static const ValueType kValueTypeForSeek = kTypeLargeValueRef;
typedef uint64_t SequenceNumber;
// We leave eight bits empty at the bottom so a type and sequence#
// can be packed together into 64-bits.
static const SequenceNumber kMaxSequenceNumber =
((0x1ull << 56) - 1);
struct ParsedInternalKey {
Slice user_key;
SequenceNumber sequence;
ValueType type;
ParsedInternalKey() { } // Intentionally left uninitialized (for speed)
ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t)
: user_key(u), sequence(seq), type(t) { }
std::string DebugString() const;
};
// Return the length of the encoding of "key".
inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
return key.user_key.size() + 8;
}
// Append the serialization of "key" to *result.
extern void AppendInternalKey(std::string* result,
const ParsedInternalKey& key);
// Attempt to parse an internal key from "internal_key". On success,
// stores the parsed data in "*result", and returns true.
//
// On error, returns false, leaves "*result" in an undefined state.
extern bool ParseInternalKey(const Slice& internal_key,
ParsedInternalKey* result);
// Returns the user key portion of an internal key.
inline Slice ExtractUserKey(const Slice& internal_key) {
assert(internal_key.size() >= 8);
return Slice(internal_key.data(), internal_key.size() - 8);
}
inline ValueType ExtractValueType(const Slice& internal_key) {
assert(internal_key.size() >= 8);
const size_t n = internal_key.size();
uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
unsigned char c = num & 0xff;
return static_cast<ValueType>(c);
}
// A comparator for internal keys that uses a specified comparator for
// the user key portion and breaks ties by decreasing sequence number.
class InternalKeyComparator : public Comparator {
private:
const Comparator* user_comparator_;
public:
explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c) { }
virtual const char* Name() const;
virtual int Compare(const Slice& a, const Slice& b) const;
virtual void FindShortestSeparator(
std::string* start,
const Slice& limit) const;
virtual void FindShortSuccessor(std::string* key) const;
const Comparator* user_comparator() const { return user_comparator_; }
int Compare(const InternalKey& a, const InternalKey& b) const;
};
// Modules in this directory should keep internal keys wrapped inside
// the following class instead of plain strings so that we do not
// incorrectly use string comparisons instead of an InternalKeyComparator.
class InternalKey {
private:
std::string rep_;
public:
InternalKey() { } // Leave rep_ as empty to indicate it is invalid
InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) {
AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t));
}
void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); }
Slice Encode() const {
assert(!rep_.empty());
return rep_;
}
Slice user_key() const { return ExtractUserKey(rep_); }
void SetFrom(const ParsedInternalKey& p) {
rep_.clear();
AppendInternalKey(&rep_, p);
}
void Clear() { rep_.clear(); }
};
inline int InternalKeyComparator::Compare(
const InternalKey& a, const InternalKey& b) const {
return Compare(a.Encode(), b.Encode());
}
// LargeValueRef is a 160-bit hash value (20 bytes), plus an 8 byte
// uncompressed size, and a 1 byte CompressionType code. An
// encoded form of it is embedded in the filenames of large value
// files stored in the database, and the raw binary form is stored as
// the iter->value() result for values of type kTypeLargeValueRef in
// the table and log files that make up the database.
struct LargeValueRef {
char data[29];
// Initialize a large value ref for the given data
static LargeValueRef Make(const Slice& data,
CompressionType compression_type);
// Initialize a large value ref from a serialized, 29-byte reference value
static LargeValueRef FromRef(const Slice& ref) {
LargeValueRef result;
assert(ref.size() == sizeof(result.data));
memcpy(result.data, ref.data(), sizeof(result.data));
return result;
}
// Return the number of bytes in a LargeValueRef (not the
// number of bytes in the value referenced).
static size_t ByteSize() { return sizeof(LargeValueRef().data); }
// Return the number of bytes in the value referenced by "*this".
uint64_t ValueSize() const { return DecodeFixed64(&data[20]); }
CompressionType compression_type() const {
return static_cast<CompressionType>(data[28]);
}
bool operator==(const LargeValueRef& b) const {
return memcmp(data, b.data, sizeof(data)) == 0;
}
bool operator<(const LargeValueRef& b) const {
return memcmp(data, b.data, sizeof(data)) < 0;
}
};
// Convert the large value ref to a human-readable string suitable
// for embedding in a large value filename.
extern std::string LargeValueRefToFilenameString(const LargeValueRef& h);
// Parse the large value filename string in "input" and store it in
// "*h". If successful, returns true. Otherwise returns false.
extern bool FilenameStringToLargeValueRef(const Slice& in, LargeValueRef* ref);
inline bool ParseInternalKey(const Slice& internal_key,
ParsedInternalKey* result) {
const size_t n = internal_key.size();
if (n < 8) return false;
uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
unsigned char c = num & 0xff;
result->sequence = num >> 8;
result->type = static_cast<ValueType>(c);
result->user_key = Slice(internal_key.data(), n - 8);
return (c <= static_cast<unsigned char>(kTypeLargeValueRef));
}
}
#endif // STORAGE_LEVELDB_DB_FORMAT_H_

127
db/dbformat_test.cc Normal file
View File

@@ -0,0 +1,127 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/dbformat.h"
#include "util/logging.h"
#include "util/testharness.h"
namespace leveldb {
static std::string IKey(const std::string& user_key,
uint64_t seq,
ValueType vt) {
std::string encoded;
AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt));
return encoded;
}
static std::string Shorten(const std::string& s, const std::string& l) {
std::string result = s;
InternalKeyComparator(BytewiseComparator()).FindShortestSeparator(&result, l);
return result;
}
static std::string ShortSuccessor(const std::string& s) {
std::string result = s;
InternalKeyComparator(BytewiseComparator()).FindShortSuccessor(&result);
return result;
}
static void TestKey(const std::string& key,
uint64_t seq,
ValueType vt) {
std::string encoded = IKey(key, seq, vt);
Slice in(encoded);
ParsedInternalKey decoded("", 0, kTypeValue);
ASSERT_TRUE(ParseInternalKey(in, &decoded));
ASSERT_EQ(key, decoded.user_key.ToString());
ASSERT_EQ(seq, decoded.sequence);
ASSERT_EQ(vt, decoded.type);
ASSERT_TRUE(!ParseInternalKey(Slice("bar"), &decoded));
}
class FormatTest { };
TEST(FormatTest, InternalKey_EncodeDecode) {
const char* keys[] = { "", "k", "hello", "longggggggggggggggggggggg" };
const uint64_t seq[] = {
1, 2, 3,
(1ull << 8) - 1, 1ull << 8, (1ull << 8) + 1,
(1ull << 16) - 1, 1ull << 16, (1ull << 16) + 1,
(1ull << 32) - 1, 1ull << 32, (1ull << 32) + 1
};
for (int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) {
for (int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) {
TestKey(keys[k], seq[s], kTypeValue);
TestKey("hello", 1, kTypeDeletion);
}
}
}
TEST(FormatTest, InternalKeyShortSeparator) {
// When user keys are same
ASSERT_EQ(IKey("foo", 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue),
IKey("foo", 99, kTypeValue)));
ASSERT_EQ(IKey("foo", 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue),
IKey("foo", 101, kTypeValue)));
ASSERT_EQ(IKey("foo", 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue),
IKey("foo", 100, kTypeValue)));
ASSERT_EQ(IKey("foo", 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue),
IKey("foo", 100, kTypeDeletion)));
ASSERT_EQ(IKey("foo", 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue),
IKey("foo", 100, kTypeLargeValueRef)));
// When user keys are misordered
ASSERT_EQ(IKey("foo", 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue),
IKey("bar", 99, kTypeValue)));
// When user keys are different, but correctly ordered
ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
Shorten(IKey("foo", 100, kTypeValue),
IKey("hello", 200, kTypeValue)));
// When start user key is prefix of limit user key
ASSERT_EQ(IKey("foo", 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue),
IKey("foobar", 200, kTypeValue)));
// When limit user key is prefix of start user key
ASSERT_EQ(IKey("foobar", 100, kTypeValue),
Shorten(IKey("foobar", 100, kTypeValue),
IKey("foo", 200, kTypeValue)));
}
TEST(FormatTest, InternalKeyShortestSuccessor) {
ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
ShortSuccessor(IKey("foo", 100, kTypeValue)));
ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue),
ShortSuccessor(IKey("\xff\xff", 100, kTypeValue)));
}
TEST(FormatTest, SHA1) {
// Check that we are computing the same value as sha1.
// Note that the last two numbers are the length of the input and the
// compression type.
ASSERT_EQ("aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d-5-0", // SHA1, uncompr
LargeValueRefToFilenameString(
LargeValueRef::Make("hello", kNoCompression)));
ASSERT_EQ("aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d-5-1", // SHA1, lwcompr
LargeValueRefToFilenameString(
LargeValueRef::Make("hello", kLightweightCompression)));
}
}
int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}

154
db/filename.cc Normal file
View File

@@ -0,0 +1,154 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <ctype.h>
#include <stdio.h>
#include "db/filename.h"
#include "db/dbformat.h"
#include "include/env.h"
#include "util/logging.h"
namespace leveldb {
static std::string MakeFileName(const std::string& name, uint64_t number,
const char* suffix) {
char buf[100];
snprintf(buf, sizeof(buf), "/%06llu.%s",
static_cast<unsigned long long>(number),
suffix);
return name + buf;
}
std::string LogFileName(const std::string& name, uint64_t number) {
assert(number > 0);
return MakeFileName(name, number, "log");
}
std::string TableFileName(const std::string& name, uint64_t number) {
assert(number > 0);
return MakeFileName(name, number, "sst");
}
std::string LargeValueFileName(const std::string& name,
const LargeValueRef& large_ref) {
std::string result = name + "/";
result += LargeValueRefToFilenameString(large_ref);
result += ".val";
return result;
}
std::string DescriptorFileName(const std::string& dbname, uint64_t number) {
assert(number > 0);
char buf[100];
snprintf(buf, sizeof(buf), "/MANIFEST-%06llu",
static_cast<unsigned long long>(number));
return dbname + buf;
}
std::string CurrentFileName(const std::string& dbname) {
return dbname + "/CURRENT";
}
std::string LockFileName(const std::string& dbname) {
return dbname + "/LOCK";
}
std::string TempFileName(const std::string& dbname, uint64_t number) {
assert(number > 0);
return MakeFileName(dbname, number, "dbtmp");
}
std::string InfoLogFileName(const std::string& dbname) {
return dbname + "/LOG";
}
// Return the name of the old info log file for "dbname".
std::string OldInfoLogFileName(const std::string& dbname) {
return dbname + "/LOG.old";
}
// Owned filenames have the form:
// dbname/CURRENT
// dbname/LOCK
// dbname/LOG
// dbname/LOG.old
// dbname/MANIFEST-[0-9]+
// dbname/[0-9a-f]{20}-[0-9]+-[0-9]+.val
// dbname/[0-9]+.(log|sst)
bool ParseFileName(const std::string& fname,
uint64_t* number,
LargeValueRef* large_ref,
FileType* type) {
Slice rest(fname);
if (rest == "CURRENT") {
*number = 0;
*type = kCurrentFile;
} else if (rest == "LOCK") {
*number = 0;
*type = kDBLockFile;
} else if (rest == "LOG" || rest == "LOG.old") {
*number = 0;
*type = kInfoLogFile;
} else if (rest.size() >= 4 &&
Slice(rest.data() + rest.size() - 4, 4) == ".val") {
LargeValueRef h;
if (!FilenameStringToLargeValueRef(Slice(rest.data(), rest.size() - 4),
&h)) {
return false;
}
*large_ref = h;
*type = kLargeValueFile;
} else if (rest.starts_with("MANIFEST-")) {
rest.remove_prefix(strlen("MANIFEST-"));
uint64_t num;
if (!ConsumeDecimalNumber(&rest, &num)) {
return false;
}
if (!rest.empty()) {
return false;
}
*type = kDescriptorFile;
*number = num;
} else {
// Avoid strtoull() to keep filename format independent of the
// current locale
uint64_t num;
if (!ConsumeDecimalNumber(&rest, &num)) {
return false;
}
Slice suffix = rest;
if (suffix == Slice(".log")) {
*type = kLogFile;
} else if (suffix == Slice(".sst")) {
*type = kTableFile;
} else if (suffix == Slice(".dbtmp")) {
*type = kTempFile;
} else {
return false;
}
*number = num;
}
return true;
}
Status SetCurrentFile(Env* env, const std::string& dbname,
uint64_t descriptor_number) {
// Remove leading "dbname/" and add newline to manifest file name
std::string manifest = DescriptorFileName(dbname, descriptor_number);
Slice contents = manifest;
assert(contents.starts_with(dbname + "/"));
contents.remove_prefix(dbname.size() + 1);
std::string tmp = TempFileName(dbname, descriptor_number);
Status s = WriteStringToFile(env, contents.ToString() + "\n", tmp);
if (s.ok()) {
s = env->RenameFile(tmp, CurrentFileName(dbname));
}
if (!s.ok()) {
env->DeleteFile(tmp);
}
return s;
}
}

92
db/filename.h Normal file
View File

@@ -0,0 +1,92 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// File names used by DB code
#ifndef STORAGE_LEVELDB_DB_FILENAME_H_
#define STORAGE_LEVELDB_DB_FILENAME_H_
#include <stdint.h>
#include <string>
#include "include/slice.h"
#include "include/status.h"
#include "port/port.h"
namespace leveldb {
class Env;
struct LargeValueRef;
enum FileType {
kLogFile,
kDBLockFile,
kTableFile,
kLargeValueFile,
kDescriptorFile,
kCurrentFile,
kTempFile,
kInfoLogFile, // Either the current one, or an old one
};
// Return the name of the log file with the specified number
// in the db named by "dbname". The result will be prefixed with
// "dbname".
extern std::string LogFileName(const std::string& dbname, uint64_t number);
// Return the name of the sstable with the specified number
// in the db named by "dbname". The result will be prefixed with
// "dbname".
extern std::string TableFileName(const std::string& dbname, uint64_t number);
// Return the name of the large value file with the specified large
// value reference in the db named by "dbname". The result will be
// prefixed with "dbname".
extern std::string LargeValueFileName(const std::string& dbname,
const LargeValueRef& large_ref);
// Return the name of the descriptor file for the db named by
// "dbname" and the specified incarnation number. The result will be
// prefixed with "dbname".
extern std::string DescriptorFileName(const std::string& dbname,
uint64_t number);
// Return the name of the current file. This file contains the name
// of the current manifest file. The result will be prefixed with
// "dbname".
extern std::string CurrentFileName(const std::string& dbname);
// Return the name of the lock file for the db named by
// "dbname". The result will be prefixed with "dbname".
extern std::string LockFileName(const std::string& dbname);
// Return the name of a temporary file owned by the db named "dbname".
// The result will be prefixed with "dbname".
extern std::string TempFileName(const std::string& dbname, uint64_t number);
// Return the name of the info log file for "dbname".
extern std::string InfoLogFileName(const std::string& dbname);
// Return the name of the old info log file for "dbname".
extern std::string OldInfoLogFileName(const std::string& dbname);
// If filename is a leveldb file, store the type of the file in *type.
// If *type is kLargeValueFile, then the large value reference data
// from the filename is stored in "*large_ref. For all other types of
// files, the number encoded in the filename is stored in *number. If
// the filename was successfully parsed, returns true. Else return
// false.
extern bool ParseFileName(const std::string& filename,
uint64_t* number,
LargeValueRef* large_ref,
FileType* type);
// Make the CURRENT file point to the descriptor file with the
// specified number.
extern Status SetCurrentFile(Env* env, const std::string& dbname,
uint64_t descriptor_number);
}
#endif // STORAGE_LEVELDB_DB_FILENAME_H_

156
db/filename_test.cc Normal file
View File

@@ -0,0 +1,156 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/filename.h"
#include "db/dbformat.h"
#include "port/port.h"
#include "util/logging.h"
#include "util/testharness.h"
namespace leveldb {
class FileNameTest { };
TEST(FileNameTest, Parse) {
Slice db;
FileType type;
uint64_t number;
LargeValueRef large_ref;
// Successful parses
static struct {
const char* fname;
uint64_t number;
const char* large_ref;
FileType type;
} cases[] = {
{ "100.log", 100, "", kLogFile },
{ "0.log", 0, "", kLogFile },
{ "0.sst", 0, "", kTableFile },
{ "CURRENT", 0, "", kCurrentFile },
{ "LOCK", 0, "", kDBLockFile },
{ "MANIFEST-2", 2, "", kDescriptorFile },
{ "MANIFEST-7", 7, "", kDescriptorFile },
{ "LOG", 0, "", kInfoLogFile },
{ "LOG.old", 0, "", kInfoLogFile },
{ "18446744073709551615.log", 18446744073709551615ull, "",
kLogFile },
{ "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-1234-0.val", 0,
"2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-1234-0", kLargeValueFile },
{ "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-10000000000-0.val", 0,
"2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-10000000000-0",
kLargeValueFile },
};
for (int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
std::string f = cases[i].fname;
ASSERT_TRUE(ParseFileName(f, &number, &large_ref, &type)) << f;
ASSERT_EQ(cases[i].type, type) << f;
if (type == kLargeValueFile) {
ASSERT_EQ(cases[i].large_ref, LargeValueRefToFilenameString(large_ref))
<< f;
} else {
ASSERT_EQ(cases[i].number, number) << f;
}
}
// Errors
static const char* errors[] = {
"",
"foo",
"foo-dx-100.log",
".log",
"",
"manifest",
"CURREN",
"CURRENTX",
"MANIFES",
"MANIFEST",
"MANIFEST-",
"XMANIFEST-3",
"MANIFEST-3x",
"LOC",
"LOCKx",
"LO",
"LOGx",
"18446744073709551616.log",
"184467440737095516150.log",
"100",
"100.",
"100.lop",
"100.val",
".val",
"123456789012345678901234567890123456789-12340.val",
"1234567890123456789012345678901234567-123-0.val",
"12345678901234567890123456789012345678902-100-1-.val",
// Overflow on value size
"2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-100000000000000000000-1.val",
// '03.val' is a bad compression type
"2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-100000-3.val" };
for (int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) {
std::string f = errors[i];
ASSERT_TRUE(!ParseFileName(f, &number, &large_ref, &type)) << f;
};
}
TEST(FileNameTest, Construction) {
uint64_t number;
FileType type;
LargeValueRef large_ref;
std::string fname;
fname = CurrentFileName("foo");
ASSERT_EQ("foo/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
ASSERT_EQ(0, number);
ASSERT_EQ(kCurrentFile, type);
fname = LockFileName("foo");
ASSERT_EQ("foo/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
ASSERT_EQ(0, number);
ASSERT_EQ(kDBLockFile, type);
fname = LogFileName("foo", 192);
ASSERT_EQ("foo/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
ASSERT_EQ(192, number);
ASSERT_EQ(kLogFile, type);
fname = TableFileName("bar", 200);
ASSERT_EQ("bar/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
ASSERT_EQ(200, number);
ASSERT_EQ(kTableFile, type);
fname = DescriptorFileName("bar", 100);
ASSERT_EQ("bar/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
ASSERT_EQ(100, number);
ASSERT_EQ(kDescriptorFile, type);
fname = TempFileName("tmp", 999);
ASSERT_EQ("tmp/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
ASSERT_EQ(999, number);
ASSERT_EQ(kTempFile, type);
for (int i = 0; i <= kLightweightCompression; i++) {
CompressionType ctype = static_cast<CompressionType>(i);
std::string value = "abcdef";
LargeValueRef real_large_ref = LargeValueRef::Make(Slice(value), ctype);
fname = LargeValueFileName("tmp", real_large_ref);
ASSERT_EQ("tmp/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
ASSERT_TRUE(real_large_ref == large_ref);
ASSERT_EQ(kLargeValueFile, type);
ASSERT_EQ(large_ref.compression_type(), ctype);
}
}
}
int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}

35
db/log_format.h Normal file
View File

@@ -0,0 +1,35 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Log format information shared by reader and writer.
// See ../doc/log_format.txt for more detail.
#ifndef STORAGE_LEVELDB_DB_LOG_FORMAT_H_
#define STORAGE_LEVELDB_DB_LOG_FORMAT_H_
namespace leveldb {
namespace log {
enum RecordType {
// Zero is reserved for preallocated files
kZeroType = 0,
kFullType = 1,
// For fragments
kFirstType = 2,
kMiddleType = 3,
kLastType = 4,
};
static const int kMaxRecordType = kLastType;
static const int kBlockSize = 32768;
// Header is checksum (4 bytes), type (1 byte), length (2 bytes).
static const int kHeaderSize = 4 + 1 + 2;
}
}
#endif // STORAGE_LEVELDB_DB_LOG_FORMAT_H_

172
db/log_reader.cc Normal file
View File

@@ -0,0 +1,172 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/log_reader.h"
#include <stdint.h>
#include "include/env.h"
#include "util/coding.h"
#include "util/crc32c.h"
namespace leveldb {
namespace log {
Reader::Reporter::~Reporter() {
}
Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum)
: file_(file),
reporter_(reporter),
checksum_(checksum),
backing_store_(new char[kBlockSize]),
buffer_(),
eof_(false) {
}
Reader::~Reader() {
delete[] backing_store_;
}
bool Reader::ReadRecord(Slice* record, std::string* scratch) {
scratch->clear();
record->clear();
bool in_fragmented_record = false;
Slice fragment;
while (true) {
switch (ReadPhysicalRecord(&fragment)) {
case kFullType:
if (in_fragmented_record) {
ReportDrop(scratch->size(), "partial record without end");
}
scratch->clear();
*record = fragment;
return true;
case kFirstType:
if (in_fragmented_record) {
ReportDrop(scratch->size(), "partial record without end");
}
scratch->assign(fragment.data(), fragment.size());
in_fragmented_record = true;
break;
case kMiddleType:
if (!in_fragmented_record) {
ReportDrop(fragment.size(), "missing start of fragmented record");
} else {
scratch->append(fragment.data(), fragment.size());
}
break;
case kLastType:
if (!in_fragmented_record) {
ReportDrop(fragment.size(), "missing start of fragmented record");
} else {
scratch->append(fragment.data(), fragment.size());
*record = Slice(*scratch);
return true;
}
break;
case kEof:
if (in_fragmented_record) {
ReportDrop(scratch->size(), "partial record without end");
scratch->clear();
}
return false;
case kBadRecord:
if (in_fragmented_record) {
ReportDrop(scratch->size(), "error in middle of record");
in_fragmented_record = false;
scratch->clear();
}
break;
default:
ReportDrop(
(fragment.size() + (in_fragmented_record ? scratch->size() : 0)),
"unknown record type");
in_fragmented_record = false;
scratch->clear();
break;
}
}
return false;
}
void Reader::ReportDrop(size_t bytes, const char* reason) {
if (reporter_ != NULL) {
reporter_->Corruption(bytes, Status::Corruption(reason));
}
}
unsigned int Reader::ReadPhysicalRecord(Slice* result) {
while (true) {
if (buffer_.size() <= kHeaderSize) {
if (!eof_) {
// Last read was a full read, so this is a trailer to skip
buffer_.clear();
Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
if (!status.ok()) {
if (reporter_ != NULL) {
reporter_->Corruption(kBlockSize, status);
}
buffer_.clear();
eof_ = true;
return kEof;
} else if (buffer_.size() < kBlockSize) {
eof_ = true;
}
continue;
} else if (buffer_.size() == 0) {
// End of file
return kEof;
} else if (buffer_.size() < kHeaderSize) {
ReportDrop(buffer_.size(), "truncated record at end of file");
buffer_.clear();
return kEof;
} else {
// We have a trailing zero-length record. Fall through and check it.
}
}
// Parse the header
const char* header = buffer_.data();
const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
const unsigned int type = header[6];
const uint32_t length = a | (b << 8);
if (kHeaderSize + length > buffer_.size()) {
ReportDrop(buffer_.size(), "bad record length");
buffer_.clear();
return kBadRecord;
}
// Check crc
if (checksum_) {
if (type == kZeroType && length == 0) {
// Skip zero length record
buffer_.remove_prefix(kHeaderSize + length);
return kBadRecord;
}
uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
uint32_t actual_crc = crc32c::Value(header + 6, 1 + length);
if (actual_crc != expected_crc) {
ReportDrop(length, "checksum mismatch");
buffer_.remove_prefix(kHeaderSize + length);
return kBadRecord;
}
}
buffer_.remove_prefix(kHeaderSize + length);
*result = Slice(header + kHeaderSize, length);
return type;
}
}
}
}

75
db/log_reader.h Normal file
View File

@@ -0,0 +1,75 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_DB_LOG_READER_H_
#define STORAGE_LEVELDB_DB_LOG_READER_H_
#include "db/log_format.h"
#include "include/slice.h"
#include "include/status.h"
namespace leveldb {
class SequentialFile;
namespace log {
class Reader {
public:
// Interface for reporting errors.
class Reporter {
public:
virtual ~Reporter();
// Some corruption was detected. "size" is the approximate number
// of bytes dropped due to the corruption.
virtual void Corruption(size_t bytes, const Status& status) = 0;
};
// Create a reader that will return log records from "*file".
// "*file" must remain live while this Reader is in use.
//
// If "reporter" is non-NULL, it is notified whenever some data is
// dropped due to a detected corruption. "*reporter" must remain
// live while this Reader is in use.
//
// If "checksum" is true, verify checksums if available.
Reader(SequentialFile* file, Reporter* reporter, bool checksum);
~Reader();
// Read the next record into *record. Returns true if read
// successfully, false if we hit end of the input. May use
// "*scratch" as temporary storage. The contents filled in *record
// will only be valid until the next mutating operation on this
// reader or the next mutation to *scratch.
bool ReadRecord(Slice* record, std::string* scratch);
private:
SequentialFile* const file_;
Reporter* const reporter_;
bool const checksum_;
char* const backing_store_;
Slice buffer_;
bool eof_; // Last Read() indicated EOF by returning < kBlockSize
// Extend record types with the following special values
enum {
kEof = kMaxRecordType + 1,
kBadRecord = kMaxRecordType + 2
};
// Return type, or one of the preceding special values
unsigned int ReadPhysicalRecord(Slice* result);
void ReportDrop(size_t bytes, const char* reason);
// No copying allowed
Reader(const Reader&);
void operator=(const Reader&);
};
}
}
#endif // STORAGE_LEVELDB_DB_LOG_READER_H_

361
db/log_test.cc Normal file
View File

@@ -0,0 +1,361 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/log_reader.h"
#include "db/log_writer.h"
#include "include/env.h"
#include "util/coding.h"
#include "util/crc32c.h"
#include "util/random.h"
#include "util/testharness.h"
namespace leveldb {
namespace log {
// Construct a string of the specified length made out of the supplied
// partial string.
static std::string BigString(const std::string& partial_string, size_t n) {
std::string result;
while (result.size() < n) {
result.append(partial_string);
}
result.resize(n);
return result;
}
// Construct a string from a number
static std::string NumberString(int n) {
char buf[50];
snprintf(buf, sizeof(buf), "%d.", n);
return std::string(buf);
}
// Return a skewed potentially long string
static std::string RandomSkewedString(int i, Random* rnd) {
return BigString(NumberString(i), rnd->Skewed(17));
}
class LogTest {
private:
class StringDest : public WritableFile {
public:
std::string contents_;
virtual Status Close() { return Status::OK(); }
virtual Status Flush() { return Status::OK(); }
virtual Status Sync() { return Status::OK(); }
virtual Status Append(const Slice& slice) {
contents_.append(slice.data(), slice.size());
return Status::OK();
}
};
class StringSource : public SequentialFile {
public:
Slice contents_;
bool force_error_;
bool returned_partial_;
StringSource() : force_error_(false), returned_partial_(false) { }
virtual Status Read(size_t n, Slice* result, char* scratch) {
ASSERT_TRUE(!returned_partial_) << "must not Read() after eof/error";
ASSERT_EQ(kBlockSize, n);
if (force_error_) {
force_error_ = false;
returned_partial_ = true;
return Status::Corruption("read error");
}
if (contents_.size() < n) {
n = contents_.size();
returned_partial_ = true;
}
*result = Slice(contents_.data(), n);
contents_.remove_prefix(n);
return Status::OK();
}
};
class ReportCollector : public Reader::Reporter {
public:
size_t dropped_bytes_;
std::string message_;
ReportCollector() : dropped_bytes_(0) { }
virtual void Corruption(size_t bytes, const Status& status) {
dropped_bytes_ += bytes;
message_.append(status.ToString());
}
};
StringDest dest_;
StringSource source_;
ReportCollector report_;
bool reading_;
Writer writer_;
Reader reader_;
public:
LogTest() : reading_(false),
writer_(&dest_),
reader_(&source_, &report_, true/*checksum*/) {
}
void Write(const std::string& msg) {
ASSERT_TRUE(!reading_) << "Write() after starting to read";
writer_.AddRecord(Slice(msg));
}
size_t WrittenBytes() const {
return dest_.contents_.size();
}
std::string Read() {
if (!reading_) {
reading_ = true;
source_.contents_ = Slice(dest_.contents_);
}
std::string scratch;
Slice record;
if (reader_.ReadRecord(&record, &scratch)) {
return record.ToString();
} else {
return "EOF";
}
}
void IncrementByte(int offset, int delta) {
dest_.contents_[offset] += delta;
}
void SetByte(int offset, char new_byte) {
dest_.contents_[offset] = new_byte;
}
void ShrinkSize(int bytes) {
dest_.contents_.resize(dest_.contents_.size() - bytes);
}
void FixChecksum(int header_offset, int len) {
// Compute crc of type/len/data
uint32_t crc = crc32c::Value(&dest_.contents_[header_offset+6], 1 + len);
crc = crc32c::Mask(crc);
EncodeFixed32(&dest_.contents_[header_offset], crc);
}
void ForceError() {
source_.force_error_ = true;
}
size_t DroppedBytes() const {
return report_.dropped_bytes_;
}
// Returns OK iff recorded error message contains "msg"
std::string MatchError(const std::string& msg) const {
if (report_.message_.find(msg) == std::string::npos) {
return report_.message_;
} else {
return "OK";
}
}
};
TEST(LogTest, Empty) {
ASSERT_EQ("EOF", Read());
}
TEST(LogTest, ReadWrite) {
Write("foo");
Write("bar");
Write("");
Write("xxxx");
ASSERT_EQ("foo", Read());
ASSERT_EQ("bar", Read());
ASSERT_EQ("", Read());
ASSERT_EQ("xxxx", Read());
ASSERT_EQ("EOF", Read());
ASSERT_EQ("EOF", Read()); // Make sure reads at eof work
}
TEST(LogTest, ManyBlocks) {
for (int i = 0; i < 100000; i++) {
Write(NumberString(i));
}
for (int i = 0; i < 100000; i++) {
ASSERT_EQ(NumberString(i), Read());
}
ASSERT_EQ("EOF", Read());
}
TEST(LogTest, Fragmentation) {
Write("small");
Write(BigString("medium", 50000));
Write(BigString("large", 100000));
ASSERT_EQ("small", Read());
ASSERT_EQ(BigString("medium", 50000), Read());
ASSERT_EQ(BigString("large", 100000), Read());
ASSERT_EQ("EOF", Read());
}
TEST(LogTest, MarginalTrailer) {
// Make a trailer that is exactly the same length as an empty record.
const int n = kBlockSize - 2*kHeaderSize;
Write(BigString("foo", n));
ASSERT_EQ(kBlockSize - kHeaderSize, WrittenBytes());
Write("");
Write("bar");
ASSERT_EQ(BigString("foo", n), Read());
ASSERT_EQ("", Read());
ASSERT_EQ("bar", Read());
ASSERT_EQ("EOF", Read());
}
TEST(LogTest, ShortTrailer) {
const int n = kBlockSize - 2*kHeaderSize + 4;
Write(BigString("foo", n));
ASSERT_EQ(kBlockSize - kHeaderSize + 4, WrittenBytes());
Write("");
Write("bar");
ASSERT_EQ(BigString("foo", n), Read());
ASSERT_EQ("", Read());
ASSERT_EQ("bar", Read());
ASSERT_EQ("EOF", Read());
}
TEST(LogTest, AlignedEof) {
const int n = kBlockSize - 2*kHeaderSize + 4;
Write(BigString("foo", n));
ASSERT_EQ(kBlockSize - kHeaderSize + 4, WrittenBytes());
ASSERT_EQ(BigString("foo", n), Read());
ASSERT_EQ("EOF", Read());
}
TEST(LogTest, RandomRead) {
const int N = 500;
Random write_rnd(301);
for (int i = 0; i < N; i++) {
Write(RandomSkewedString(i, &write_rnd));
}
Random read_rnd(301);
for (int i = 0; i < N; i++) {
ASSERT_EQ(RandomSkewedString(i, &read_rnd), Read());
}
ASSERT_EQ("EOF", Read());
}
// Tests of all the error paths in log_reader.cc follow:
TEST(LogTest, ReadError) {
Write("foo");
ForceError();
ASSERT_EQ("EOF", Read());
ASSERT_EQ(kBlockSize, DroppedBytes());
ASSERT_EQ("OK", MatchError("read error"));
}
TEST(LogTest, BadRecordType) {
Write("foo");
// Type is stored in header[6]
IncrementByte(6, 100);
FixChecksum(0, 3);
ASSERT_EQ("EOF", Read());
ASSERT_EQ(3, DroppedBytes());
ASSERT_EQ("OK", MatchError("unknown record type"));
}
TEST(LogTest, TruncatedTrailingRecord) {
Write("foo");
ShrinkSize(4); // Drop all payload as well as a header byte
ASSERT_EQ("EOF", Read());
ASSERT_EQ(kHeaderSize - 1, DroppedBytes());
ASSERT_EQ("OK", MatchError("truncated record at end of file"));
}
TEST(LogTest, BadLength) {
Write("foo");
ShrinkSize(1);
ASSERT_EQ("EOF", Read());
ASSERT_EQ(kHeaderSize + 2, DroppedBytes());
ASSERT_EQ("OK", MatchError("bad record length"));
}
TEST(LogTest, ChecksumMismatch) {
Write("foo");
IncrementByte(0, 10);
ASSERT_EQ("EOF", Read());
ASSERT_EQ(3, DroppedBytes());
ASSERT_EQ("OK", MatchError("checksum mismatch"));
}
TEST(LogTest, UnexpectedMiddleType) {
Write("foo");
SetByte(6, kMiddleType);
FixChecksum(0, 3);
ASSERT_EQ("EOF", Read());
ASSERT_EQ(3, DroppedBytes());
ASSERT_EQ("OK", MatchError("missing start"));
}
TEST(LogTest, UnexpectedLastType) {
Write("foo");
SetByte(6, kLastType);
FixChecksum(0, 3);
ASSERT_EQ("EOF", Read());
ASSERT_EQ(3, DroppedBytes());
ASSERT_EQ("OK", MatchError("missing start"));
}
TEST(LogTest, UnexpectedFullType) {
Write("foo");
Write("bar");
SetByte(6, kFirstType);
FixChecksum(0, 3);
ASSERT_EQ("bar", Read());
ASSERT_EQ("EOF", Read());
ASSERT_EQ(3, DroppedBytes());
ASSERT_EQ("OK", MatchError("partial record without end"));
}
TEST(LogTest, UnexpectedFirstType) {
Write("foo");
Write(BigString("bar", 100000));
SetByte(6, kFirstType);
FixChecksum(0, 3);
ASSERT_EQ(BigString("bar", 100000), Read());
ASSERT_EQ("EOF", Read());
ASSERT_EQ(3, DroppedBytes());
ASSERT_EQ("OK", MatchError("partial record without end"));
}
TEST(LogTest, ErrorJoinsRecords) {
// Consider two fragmented records:
// first(R1) last(R1) first(R2) last(R2)
// where the middle two fragments disappear. We do not want
// first(R1),last(R2) to get joined and returned as a valid record.
// Write records that span two blocks
Write(BigString("foo", kBlockSize));
Write(BigString("bar", kBlockSize));
Write("correct");
// Wipe the middle block
for (int offset = kBlockSize; offset < 2*kBlockSize; offset++) {
SetByte(offset, 'x');
}
ASSERT_EQ("correct", Read());
ASSERT_EQ("EOF", Read());
const int dropped = DroppedBytes();
ASSERT_LE(dropped, 2*kBlockSize + 100);
ASSERT_GE(dropped, 2*kBlockSize);
}
}
}
int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}

101
db/log_writer.cc Normal file
View File

@@ -0,0 +1,101 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/log_writer.h"
#include <stdint.h>
#include "include/env.h"
#include "util/coding.h"
#include "util/crc32c.h"
namespace leveldb {
namespace log {
Writer::Writer(WritableFile* dest)
: dest_(dest),
block_offset_(0) {
for (int i = 0; i <= kMaxRecordType; i++) {
char t = static_cast<char>(i);
type_crc_[i] = crc32c::Value(&t, 1);
}
}
Writer::~Writer() {
}
Status Writer::AddRecord(const Slice& slice) {
const char* ptr = slice.data();
size_t left = slice.size();
// Fragment the record if necessary and emit it. Note that if slice
// is empty, we still want to iterate once to emit a single
// zero-length record
Status s;
do {
const int leftover = kBlockSize - block_offset_;
assert(leftover >= 0);
if (leftover <= kHeaderSize) {
// Switch to a new block
if (leftover > 0) {
// Fill the trailer
dest_->Append(Slice("\x00\x00\x00\x00\x00\x00\x00", leftover));
}
block_offset_ = 0;
}
// Invariant: we never leave <= kHeaderSize bytes in a block.
const int avail = kBlockSize - block_offset_ - kHeaderSize;
assert(avail > 0);
const size_t fragment_length = (left < avail) ? left : avail;
RecordType type;
const bool begin = (ptr == slice.data());
const bool end = (left == fragment_length);
if (begin && end) {
type = kFullType;
} else if (begin) {
type = kFirstType;
} else if (end) {
type = kLastType;
} else {
type = kMiddleType;
}
s = EmitPhysicalRecord(type, ptr, fragment_length);
ptr += fragment_length;
left -= fragment_length;
} while (s.ok() && left > 0);
return s;
}
Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) {
assert(n <= 0xffff); // Must fit in two bytes
assert(block_offset_ + kHeaderSize + n <= kBlockSize);
// Format the header
char buf[kHeaderSize];
buf[4] = static_cast<char>(n & 0xff);
buf[5] = static_cast<char>(n >> 8);
buf[6] = static_cast<char>(t);
// Compute the crc of the record type and the payload.
uint32_t crc = crc32c::Extend(type_crc_[t], ptr, n);
crc = crc32c::Mask(crc); // Adjust for storage
EncodeFixed32(buf, crc);
// Write the header and the payload
Status s = dest_->Append(Slice(buf, kHeaderSize));
if (s.ok()) {
s = dest_->Append(Slice(ptr, n));
if (s.ok()) {
s = dest_->Flush();
}
}
block_offset_ += kHeaderSize + n;
return s;
}
}
}

48
db/log_writer.h Normal file
View File

@@ -0,0 +1,48 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_DB_LOG_WRITER_H_
#define STORAGE_LEVELDB_DB_LOG_WRITER_H_
#include <stdint.h>
#include "db/log_format.h"
#include "include/slice.h"
#include "include/status.h"
namespace leveldb {
class WritableFile;
namespace log {
class Writer {
public:
// Create a writer that will append data to "*dest".
// "*dest" must be initially empty.
// "*dest" must remain live while this Writer is in use.
explicit Writer(WritableFile* dest);
~Writer();
Status AddRecord(const Slice& slice);
private:
WritableFile* dest_;
int block_offset_; // Current offset in block
// crc32c values for all supported record types. These are
// pre-computed to reduce the overhead of computing the crc of the
// record type stored in the header.
uint32_t type_crc_[kMaxRecordType + 1];
Status EmitPhysicalRecord(RecordType type, const char* ptr, size_t length);
// No copying allowed
Writer(const Writer&);
void operator=(const Writer&);
};
}
}
#endif // STORAGE_LEVELDB_DB_LOG_WRITER_H_

109
db/memtable.cc Normal file
View File

@@ -0,0 +1,109 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/memtable.h"
#include "db/dbformat.h"
#include "include/comparator.h"
#include "include/env.h"
#include "include/iterator.h"
#include "util/coding.h"
namespace leveldb {
static Slice GetLengthPrefixedSlice(const char* data) {
uint32_t len;
const char* p = data;
p = GetVarint32Ptr(p, p + 5, &len); // +5: we assume "p" is not corrupted
return Slice(p, len);
}
MemTable::MemTable(const InternalKeyComparator& cmp)
: comparator_(cmp),
table_(comparator_, &arena_) {
}
MemTable::~MemTable() {
}
size_t MemTable::ApproximateMemoryUsage() { return arena_.MemoryUsage(); }
int MemTable::KeyComparator::operator()(const char* aptr, const char* bptr)
const {
// Internal keys are encoded as length-prefixed strings.
Slice a = GetLengthPrefixedSlice(aptr);
Slice b = GetLengthPrefixedSlice(bptr);
return comparator.Compare(a, b);
}
// Encode a suitable internal key target for "target" and return it.
// Uses *scratch as scratch space, and the returned pointer will point
// into this scratch space.
static const char* EncodeKey(std::string* scratch, const Slice& target) {
scratch->clear();
PutVarint32(scratch, target.size());
scratch->append(target.data(), target.size());
return scratch->data();
}
class MemTableIterator: public Iterator {
public:
explicit MemTableIterator(MemTable::Table* table) {
iter_ = new MemTable::Table::Iterator(table);
}
virtual ~MemTableIterator() { delete iter_; }
virtual bool Valid() const { return iter_->Valid(); }
virtual void Seek(const Slice& k) { iter_->Seek(EncodeKey(&tmp_, k)); }
virtual void SeekToFirst() { iter_->SeekToFirst(); }
virtual void SeekToLast() { iter_->SeekToLast(); }
virtual void Next() { iter_->Next(); }
virtual void Prev() { iter_->Prev(); }
virtual Slice key() const { return GetLengthPrefixedSlice(iter_->key()); }
virtual Slice value() const {
Slice key_slice = GetLengthPrefixedSlice(iter_->key());
return GetLengthPrefixedSlice(key_slice.data() + key_slice.size());
}
virtual Status status() const { return Status::OK(); }
private:
MemTable::Table::Iterator* iter_;
std::string tmp_; // For passing to EncodeKey
// No copying allowed
MemTableIterator(const MemTableIterator&);
void operator=(const MemTableIterator&);
};
Iterator* MemTable::NewIterator() {
return new MemTableIterator(&table_);
}
void MemTable::Add(SequenceNumber s, ValueType type,
const Slice& key,
const Slice& value) {
// Format of an entry is concatenation of:
// key_size : varint32 of internal_key.size()
// key bytes : char[internal_key.size()]
// value_size : varint32 of value.size()
// value bytes : char[value.size()]
size_t key_size = key.size();
size_t val_size = value.size();
size_t internal_key_size = key_size + 8;
const size_t encoded_len =
VarintLength(internal_key_size) + internal_key_size +
VarintLength(val_size) + val_size;
char* buf = arena_.Allocate(encoded_len);
char* p = EncodeVarint32(buf, internal_key_size);
memcpy(p, key.data(), key_size);
p += key_size;
EncodeFixed64(p, (s << 8) | type);
p += 8;
p = EncodeVarint32(p, val_size);
memcpy(p, value.data(), val_size);
assert((p + val_size) - buf == encoded_len);
table_.Insert(buf);
}
}

69
db/memtable.h Normal file
View File

@@ -0,0 +1,69 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_DB_MEMTABLE_H_
#define STORAGE_LEVELDB_DB_MEMTABLE_H_
#include <string>
#include "include/db.h"
#include "db/dbformat.h"
#include "db/skiplist.h"
#include "util/arena.h"
namespace leveldb {
class InternalKeyComparator;
class Mutex;
class MemTableIterator;
class MemTable {
public:
explicit MemTable(const InternalKeyComparator& comparator);
~MemTable();
// Returns an estimate of the number of bytes of data in use by this
// data structure.
//
// REQUIRES: external synchronization to prevent simultaneous
// operations on the same MemTable.
size_t ApproximateMemoryUsage();
// Return an iterator that yields the contents of the memtable.
//
// The caller must ensure that the underlying MemTable remains live
// while the returned iterator is live. The keys returned by this
// iterator are internal keys encoded by AppendInternalKey in the
// db/format.{h,cc} module.
Iterator* NewIterator();
// Add an entry into memtable that maps key to value at the
// specified sequence number and with the specified type.
// Typically value will be empty if type==kTypeDeletion.
void Add(SequenceNumber seq, ValueType type,
const Slice& key,
const Slice& value);
private:
struct KeyComparator {
const InternalKeyComparator comparator;
explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { }
int operator()(const char* a, const char* b) const;
};
friend class MemTableIterator;
friend class MemTableBackwardIterator;
typedef SkipList<const char*, KeyComparator> Table;
KeyComparator comparator_;
Arena arena_;
Table table_;
// No copying allowed
MemTable(const MemTable&);
void operator=(const MemTable&);
};
}
#endif // STORAGE_LEVELDB_DB_MEMTABLE_H_

396
db/repair.cc Normal file
View File

@@ -0,0 +1,396 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// We recover the contents of the descriptor from the other files we find.
// (1) Any log files are first converted to tables
// (2) We scan every table to compute
// (a) smallest/largest for the table
// (b) large value refs from the table
// (c) largest sequence number in the table
// (3) We generate descriptor contents:
// - log number is set to zero
// - next-file-number is set to 1 + largest file number we found
// - last-sequence-number is set to largest sequence# found across
// all tables (see 2c)
// - compaction pointers are cleared
// - every table file is added at level 0
//
// Possible optimization 1:
// (a) Compute total size and use to pick appropriate max-level M
// (b) Sort tables by largest sequence# in the table
// (c) For each table: if it overlaps earlier table, place in level-0,
// else place in level-M.
// Possible optimization 2:
// Store per-table metadata (smallest, largest, largest-seq#,
// large-value-refs, ...) in the table's meta section to speed up
// ScanTable.
#include "db/builder.h"
#include "db/db_impl.h"
#include "db/dbformat.h"
#include "db/filename.h"
#include "db/log_reader.h"
#include "db/log_writer.h"
#include "db/memtable.h"
#include "db/table_cache.h"
#include "db/version_edit.h"
#include "db/write_batch_internal.h"
#include "include/comparator.h"
#include "include/db.h"
#include "include/env.h"
namespace leveldb {
namespace {
class Repairer {
public:
Repairer(const std::string& dbname, const Options& options)
: dbname_(dbname),
env_(options.env),
icmp_(options.comparator),
options_(SanitizeOptions(dbname, &icmp_, options)),
owns_info_log_(options_.info_log != options.info_log),
next_file_number_(1) {
// TableCache can be small since we expect each table to be opened once.
table_cache_ = new TableCache(dbname_, &options_, 10);
}
~Repairer() {
delete table_cache_;
if (owns_info_log_) {
delete options_.info_log;
}
}
Status Run() {
Status status = FindFiles();
if (status.ok()) {
ConvertLogFilesToTables();
ExtractMetaData();
status = WriteDescriptor();
}
if (status.ok()) {
unsigned long long bytes = 0;
for (int i = 0; i < tables_.size(); i++) {
bytes += tables_[i].meta.file_size;
}
Log(env_, options_.info_log,
"**** Repaired leveldb %s; "
"recovered %d files; %llu bytes. "
"Some data may have been lost. "
"****",
dbname_.c_str(),
static_cast<int>(tables_.size()),
bytes);
}
return status;
}
private:
struct TableInfo {
FileMetaData meta;
SequenceNumber max_sequence;
};
std::string const dbname_;
Env* const env_;
InternalKeyComparator const icmp_;
Options const options_;
bool owns_info_log_;
TableCache* table_cache_;
VersionEdit edit_;
std::vector<std::string> manifests_;
std::vector<uint64_t> table_numbers_;
std::vector<uint64_t> logs_;
std::vector<TableInfo> tables_;
uint64_t next_file_number_;
Status FindFiles() {
std::vector<std::string> filenames;
Status status = env_->GetChildren(dbname_, &filenames);
if (!status.ok()) {
return status;
}
if (filenames.empty()) {
return Status::IOError(dbname_, "repair found no files");
}
uint64_t number;
LargeValueRef large_ref;
FileType type;
for (int i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &large_ref, &type)) {
if (type == kLargeValueFile) {
// Will be picked up when we process a Table that points to it
} else if (type == kDescriptorFile) {
manifests_.push_back(filenames[i]);
} else {
if (number + 1 > next_file_number_) {
next_file_number_ = number + 1;
}
if (type == kLogFile) {
logs_.push_back(number);
} else if (type == kTableFile) {
table_numbers_.push_back(number);
} else {
// Ignore other files
}
}
}
}
return status;
}
void ConvertLogFilesToTables() {
for (int i = 0; i < logs_.size(); i++) {
std::string logname = LogFileName(dbname_, logs_[i]);
Status status = ConvertLogToTable(logs_[i]);
if (!status.ok()) {
Log(env_, options_.info_log, "Log #%llu: ignoring conversion error: %s",
(unsigned long long) logs_[i],
status.ToString().c_str());
}
ArchiveFile(logname);
}
}
Status ConvertLogToTable(uint64_t log) {
struct LogReporter : public log::Reader::Reporter {
Env* env;
WritableFile* info_log;
uint64_t lognum;
virtual void Corruption(size_t bytes, const Status& s) {
// We print error messages for corruption, but continue repairing.
Log(env, info_log, "Log #%llu: dropping %d bytes; %s",
(unsigned long long) lognum,
static_cast<int>(bytes),
s.ToString().c_str());
}
};
// Open the log file
std::string logname = LogFileName(dbname_, log);
SequentialFile* lfile;
Status status = env_->NewSequentialFile(logname, &lfile);
if (!status.ok()) {
return status;
}
// Create the log reader.
LogReporter reporter;
reporter.env = env_;
reporter.info_log = options_.info_log;
reporter.lognum = log;
// We intentially make log::Reader do checksumming so that
// corruptions cause entire commits to be skipped instead of
// propagating bad information (like overly large sequence
// numbers).
log::Reader reader(lfile, &reporter, false/*do not checksum*/);
// Read all the records and add to a memtable
std::string scratch;
Slice record;
WriteBatch batch;
MemTable mem(icmp_);
int counter = 0;
while (reader.ReadRecord(&record, &scratch)) {
if (record.size() < 12) {
reporter.Corruption(
record.size(), Status::Corruption("log record too small"));
continue;
}
WriteBatchInternal::SetContents(&batch, record);
status = WriteBatchInternal::InsertInto(&batch, &mem);
if (status.ok()) {
counter += WriteBatchInternal::Count(&batch);
} else {
Log(env_, options_.info_log, "Log #%llu: ignoring %s",
(unsigned long long) log,
status.ToString().c_str());
status = Status::OK(); // Keep going with rest of file
}
}
delete lfile;
// We ignore any version edits generated by the conversion to a Table
// since ExtractMetaData() will also generate edits.
VersionEdit skipped;
FileMetaData meta;
meta.number = next_file_number_++;
Iterator* iter = mem.NewIterator();
status = BuildTable(dbname_, env_, options_, table_cache_, iter,
&meta, &skipped);
delete iter;
if (status.ok()) {
if (meta.file_size > 0) {
table_numbers_.push_back(meta.number);
}
}
Log(env_, options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s",
(unsigned long long) log,
counter,
(unsigned long long) meta.number,
status.ToString().c_str());
return status;
}
void ExtractMetaData() {
std::vector<TableInfo> kept;
for (int i = 0; i < table_numbers_.size(); i++) {
TableInfo t;
t.meta.number = table_numbers_[i];
Status status = ScanTable(&t);
if (!status.ok()) {
std::string fname = TableFileName(dbname_, table_numbers_[i]);
Log(env_, options_.info_log, "Table #%llu: ignoring %s",
(unsigned long long) table_numbers_[i],
status.ToString().c_str());
ArchiveFile(fname);
} else {
tables_.push_back(t);
}
}
}
Status ScanTable(TableInfo* t) {
std::string fname = TableFileName(dbname_, t->meta.number);
int counter = 0;
Status status = env_->GetFileSize(fname, &t->meta.file_size);
if (status.ok()) {
Iterator* iter = table_cache_->NewIterator(
ReadOptions(), t->meta.number);
bool empty = true;
ParsedInternalKey parsed;
t->max_sequence = 0;
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
Slice key = iter->key();
if (!ParseInternalKey(key, &parsed)) {
Log(env_, options_.info_log, "Table #%llu: unparsable key %s",
(unsigned long long) t->meta.number,
EscapeString(key).c_str());
continue;
}
counter++;
if (empty) {
empty = false;
t->meta.smallest.DecodeFrom(key);
}
t->meta.largest.DecodeFrom(key);
if (parsed.sequence > t->max_sequence) {
t->max_sequence = parsed.sequence;
}
if (ExtractValueType(key) == kTypeLargeValueRef) {
if (iter->value().size() != LargeValueRef::ByteSize()) {
Log(env_, options_.info_log, "Table #%llu: bad large value ref",
(unsigned long long) t->meta.number);
} else {
edit_.AddLargeValueRef(LargeValueRef::FromRef(iter->value()),
t->meta.number,
key);
}
}
}
if (!iter->status().ok()) {
status = iter->status();
}
delete iter;
}
Log(env_, options_.info_log, "Table #%llu: %d entries %s",
(unsigned long long) t->meta.number,
counter,
status.ToString().c_str());
return status;
}
Status WriteDescriptor() {
std::string tmp = TempFileName(dbname_, 1);
WritableFile* file;
Status status = env_->NewWritableFile(tmp, &file);
if (!status.ok()) {
return status;
}
SequenceNumber max_sequence = 0;
for (int i = 0; i < tables_.size(); i++) {
if (max_sequence < tables_[i].max_sequence) {
max_sequence = tables_[i].max_sequence;
}
}
edit_.SetComparatorName(icmp_.user_comparator()->Name());
edit_.SetLogNumber(0);
edit_.SetNextFile(next_file_number_);
edit_.SetLastSequence(max_sequence);
for (int i = 0; i < tables_.size(); i++) {
// TODO(opt): separate out into multiple levels
const TableInfo& t = tables_[i];
edit_.AddFile(0, t.meta.number, t.meta.file_size,
t.meta.smallest, t.meta.largest);
}
//fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str());
{
log::Writer log(file);
std::string record;
edit_.EncodeTo(&record);
status = log.AddRecord(record);
}
if (status.ok()) {
status = file->Close();
}
delete file;
file = NULL;
if (!status.ok()) {
env_->DeleteFile(tmp);
} else {
// Discard older manifests
for (int i = 0; i < manifests_.size(); i++) {
ArchiveFile(dbname_ + "/" + manifests_[i]);
}
// Install new manifest
status = env_->RenameFile(tmp, DescriptorFileName(dbname_, 1));
if (status.ok()) {
status = SetCurrentFile(env_, dbname_, 1);
} else {
env_->DeleteFile(tmp);
}
}
return status;
}
void ArchiveFile(const std::string& fname) {
// Move into another directory. E.g., for
// dir/foo
// rename to
// dir/lost/foo
const char* slash = strrchr(fname.c_str(), '/');
std::string new_dir;
if (slash != NULL) {
new_dir.assign(fname.data(), slash - fname.data());
}
new_dir.append("/lost");
env_->CreateDir(new_dir); // Ignore error
std::string new_file = new_dir;
new_file.append("/");
new_file.append((slash == NULL) ? fname.c_str() : slash + 1);
Status s = env_->RenameFile(fname, new_file);
Log(env_, options_.info_log, "Archiving %s: %s\n",
fname.c_str(), s.ToString().c_str());
}
};
}
Status RepairDB(const std::string& dbname, const Options& options) {
Repairer repairer(dbname, options);
return repairer.Run();
}
}

378
db/skiplist.h Normal file
View File

@@ -0,0 +1,378 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Thread safety
// -------------
//
// Writes require external synchronization, most likely a mutex.
// Reads require a guarantee that the SkipList will not be destroyed
// while the read is in progress. Apart from that, reads progress
// without any internal locking or synchronization.
//
// Invariants:
//
// (1) Allocated nodes are never deleted until the SkipList is
// destroyed. This is trivially guaranteed by the code since we
// never delete any skip list nodes.
//
// (2) The contents of a Node except for the next/prev pointers are
// immutable after the Node has been linked into the SkipList.
// Only Insert() modifies the list, and it is careful to initialize
// a node and use release-stores to publish the nodes in one or
// more lists.
//
// ... prev vs. next pointer ordering ...
#include <assert.h>
#include <stdlib.h>
#include "port/port.h"
#include "util/arena.h"
#include "util/random.h"
namespace leveldb {
class Arena;
template<typename Key, class Comparator>
class SkipList {
private:
struct Node;
public:
// Create a new SkipList object that will use "cmp" for comparing keys,
// and will allocate memory using "*arena". Objects allocated in the arena
// must remain allocated for the lifetime of the skiplist object.
explicit SkipList(Comparator cmp, Arena* arena);
// Insert key into the list.
// REQUIRES: nothing that compares equal to key is currently in the list.
void Insert(const Key& key);
// Returns true iff an entry that compares equal to key is in the list.
bool Contains(const Key& key) const;
// Iteration over the contents of a skip list
class Iterator {
public:
// Initialize an iterator over the specified list.
// The returned iterator is not valid.
explicit Iterator(const SkipList* list);
// Returns true iff the iterator is positioned at a valid node.
bool Valid() const;
// Returns the key at the current position.
// REQUIRES: Valid()
const Key& key() const;
// Advances to the next position.
// REQUIRES: Valid()
void Next();
// Advances to the previous position.
// REQUIRES: Valid()
void Prev();
// Advance to the first entry with a key >= target
void Seek(const Key& target);
// Position at the first entry in list.
// Final state of iterator is Valid() iff list is not empty.
void SeekToFirst();
// Position at the last entry in list.
// Final state of iterator is Valid() iff list is not empty.
void SeekToLast();
private:
const SkipList* list_;
Node* node_;
// Intentionally copyable
};
private:
enum { kMaxHeight = 12 };
// Immutable after construction
Comparator const compare_;
Arena* const arena_; // Arena used for allocations of nodes
Node* const head_;
// Modified only by Insert(). Read racily by readers, but stale
// values are ok.
port::AtomicPointer max_height_; // Height of the entire list
inline int GetMaxHeight() const {
return reinterpret_cast<intptr_t>(max_height_.NoBarrier_Load());
}
// Read/written only by Insert().
Random rnd_;
Node* NewNode(const Key& key, int height);
int RandomHeight();
bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }
// Return true if key is greater than the data stored in "n"
bool KeyIsAfterNode(const Key& key, Node* n) const;
// Return the earliest node that comes at or after key.
// Return NULL if there is no such node.
//
// If prev is non-NULL, fills prev[level] with pointer to previous
// node at "level" for every level in [0..max_height_-1].
Node* FindGreaterOrEqual(const Key& key, Node** prev) const;
// Return the latest node with a key < key.
// Return head_ if there is no such node.
Node* FindLessThan(const Key& key) const;
// Return the last node in the list.
// Return head_ if list is empty.
Node* FindLast() const;
// No copying allowed
SkipList(const SkipList&);
void operator=(const SkipList&);
};
// Implementation details follow
template<typename Key, class Comparator>
struct SkipList<Key,Comparator>::Node {
explicit Node(const Key& k) : key(k) { }
Key const key;
// Accessors/mutators for links. Wrapped in methods so we can
// add the appropriate barriers as necessary.
Node* Next(int n) {
assert(n >= 0);
// Use an 'acquire load' so that we observe a fully initialized
// version of the returned Node.
return reinterpret_cast<Node*>(next_[n].Acquire_Load());
}
void SetNext(int n, Node* x) {
assert(n >= 0);
// Use a 'release store' so that anybody who reads through this
// pointer observes a fully initialized version of the inserted node.
next_[n].Release_Store(x);
}
// No-barrier variants that can be safely used in a few locations.
Node* NoBarrier_Next(int n) {
assert(n >= 0);
return reinterpret_cast<Node*>(next_[n].NoBarrier_Load());
}
void NoBarrier_SetNext(int n, Node* x) {
assert(n >= 0);
next_[n].NoBarrier_Store(x);
}
private:
// Array of length equal to the node height. next_[0] is lowest level link.
port::AtomicPointer next_[1];
};
template<typename Key, class Comparator>
typename SkipList<Key,Comparator>::Node*
SkipList<Key,Comparator>::NewNode(const Key& key, int height) {
char* mem = arena_->AllocateAligned(
sizeof(Node) + sizeof(port::AtomicPointer) * (height - 1));
return new (mem) Node(key);
}
template<typename Key, class Comparator>
inline SkipList<Key,Comparator>::Iterator::Iterator(const SkipList* list) {
list_ = list;
node_ = NULL;
}
template<typename Key, class Comparator>
inline bool SkipList<Key,Comparator>::Iterator::Valid() const {
return node_ != NULL;
}
template<typename Key, class Comparator>
inline const Key& SkipList<Key,Comparator>::Iterator::key() const {
assert(Valid());
return node_->key;
}
template<typename Key, class Comparator>
inline void SkipList<Key,Comparator>::Iterator::Next() {
assert(Valid());
node_ = node_->Next(0);
}
template<typename Key, class Comparator>
inline void SkipList<Key,Comparator>::Iterator::Prev() {
// Instead of using explicit "prev" links, we just search for the
// last node that falls before key.
assert(Valid());
node_ = list_->FindLessThan(node_->key);
if (node_ == list_->head_) {
node_ = NULL;
}
}
template<typename Key, class Comparator>
inline void SkipList<Key,Comparator>::Iterator::Seek(const Key& target) {
node_ = list_->FindGreaterOrEqual(target, NULL);
}
template<typename Key, class Comparator>
inline void SkipList<Key,Comparator>::Iterator::SeekToFirst() {
node_ = list_->head_->Next(0);
}
template<typename Key, class Comparator>
inline void SkipList<Key,Comparator>::Iterator::SeekToLast() {
node_ = list_->FindLast();
if (node_ == list_->head_) {
node_ = NULL;
}
}
template<typename Key, class Comparator>
int SkipList<Key,Comparator>::RandomHeight() {
// Increase height with probability 1 in kBranching
static const unsigned int kBranching = 4;
int height = 1;
while (height < kMaxHeight && ((rnd_.Next() % kBranching) == 0)) {
height++;
}
assert(height > 0);
assert(height <= kMaxHeight);
return height;
}
template<typename Key, class Comparator>
bool SkipList<Key,Comparator>::KeyIsAfterNode(const Key& key, Node* n) const {
// NULL n is considered infinite
return (n != NULL) && (compare_(n->key, key) < 0);
}
template<typename Key, class Comparator>
typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindGreaterOrEqual(const Key& key, Node** prev)
const {
Node* x = head_;
int level = GetMaxHeight() - 1;
while (true) {
Node* next = x->Next(level);
if (KeyIsAfterNode(key, next)) {
// Keep searching in this list
x = next;
} else {
if (prev != NULL) prev[level] = x;
if (level == 0) {
return next;
} else {
// Switch to next list
level--;
}
}
}
}
template<typename Key, class Comparator>
typename SkipList<Key,Comparator>::Node*
SkipList<Key,Comparator>::FindLessThan(const Key& key) const {
Node* x = head_;
int level = GetMaxHeight() - 1;
while (true) {
assert(x == head_ || compare_(x->key, key) < 0);
Node* next = x->Next(level);
if (next == NULL || compare_(next->key, key) >= 0) {
if (level == 0) {
return x;
} else {
// Switch to next list
level--;
}
} else {
x = next;
}
}
}
template<typename Key, class Comparator>
typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindLast()
const {
Node* x = head_;
int level = GetMaxHeight() - 1;
while (true) {
Node* next = x->Next(level);
if (next == NULL) {
if (level == 0) {
return x;
} else {
// Switch to next list
level--;
}
} else {
x = next;
}
}
}
template<typename Key, class Comparator>
SkipList<Key,Comparator>::SkipList(Comparator cmp, Arena* arena)
: compare_(cmp),
arena_(arena),
head_(NewNode(0 /* any key will do */, kMaxHeight)),
max_height_(reinterpret_cast<void*>(1)),
rnd_(0xdeadbeef) {
for (int i = 0; i < kMaxHeight; i++) {
head_->SetNext(i, NULL);
}
}
template<typename Key, class Comparator>
void SkipList<Key,Comparator>::Insert(const Key& key) {
// TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual()
// here since Insert() is externally synchronized.
Node* prev[kMaxHeight];
Node* x = FindGreaterOrEqual(key, prev);
// Our data structure does not allow duplicate insertion
assert(x == NULL || !Equal(key, x->key));
int height = RandomHeight();
if (height > GetMaxHeight()) {
for (int i = GetMaxHeight(); i < height; i++) {
prev[i] = head_;
}
//fprintf(stderr, "Change height from %d to %d\n", max_height_, height);
// It is ok to mutate max_height_ without any synchronization
// with concurrent readers. A concurrent reader that observes
// the new value of max_height_ will see either the old value of
// new level pointers from head_ (NULL), or a new value set in
// the loop below. In the former case the reader will
// immediately drop to the next level since NULL sorts after all
// keys. In the latter case the reader will use the new node.
max_height_.NoBarrier_Store(reinterpret_cast<void*>(height));
}
x = NewNode(key, height);
for (int i = 0; i < height; i++) {
// NoBarrier_SetNext() suffices since we will add a barrier when
// we publish a pointer to "x" in prev[i].
x->NoBarrier_SetNext(i, prev[i]->NoBarrier_Next(i));
prev[i]->SetNext(i, x);
}
}
template<typename Key, class Comparator>
bool SkipList<Key,Comparator>::Contains(const Key& key) const {
Node* x = FindGreaterOrEqual(key, NULL);
if (x != NULL && Equal(key, x->key)) {
return true;
} else {
return false;
}
}
}

378
db/skiplist_test.cc Normal file
View File

@@ -0,0 +1,378 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/skiplist.h"
#include <set>
#include "include/env.h"
#include "util/arena.h"
#include "util/hash.h"
#include "util/random.h"
#include "util/testharness.h"
namespace leveldb {
typedef uint64_t Key;
struct Comparator {
int operator()(const Key& a, const Key& b) const {
if (a < b) {
return -1;
} else if (a > b) {
return +1;
} else {
return 0;
}
}
};
class SkipTest { };
TEST(SkipTest, Empty) {
Arena arena;
Comparator cmp;
SkipList<Key, Comparator> list(cmp, &arena);
ASSERT_TRUE(!list.Contains(10));
SkipList<Key, Comparator>::Iterator iter(&list);
ASSERT_TRUE(!iter.Valid());
iter.SeekToFirst();
ASSERT_TRUE(!iter.Valid());
iter.Seek(100);
ASSERT_TRUE(!iter.Valid());
iter.SeekToLast();
ASSERT_TRUE(!iter.Valid());
}
TEST(SkipTest, InsertAndLookup) {
const int N = 2000;
const int R = 5000;
Random rnd(1000);
std::set<Key> keys;
Arena arena;
Comparator cmp;
SkipList<Key, Comparator> list(cmp, &arena);
for (int i = 0; i < N; i++) {
Key key = rnd.Next() % R;
if (keys.insert(key).second) {
list.Insert(key);
}
}
for (int i = 0; i < R; i++) {
if (list.Contains(i)) {
ASSERT_EQ(keys.count(i), 1);
} else {
ASSERT_EQ(keys.count(i), 0);
}
}
// Simple iterator tests
{
SkipList<Key, Comparator>::Iterator iter(&list);
ASSERT_TRUE(!iter.Valid());
iter.Seek(0);
ASSERT_TRUE(iter.Valid());
ASSERT_EQ(*(keys.begin()), iter.key());
iter.SeekToFirst();
ASSERT_TRUE(iter.Valid());
ASSERT_EQ(*(keys.begin()), iter.key());
iter.SeekToLast();
ASSERT_TRUE(iter.Valid());
ASSERT_EQ(*(keys.rbegin()), iter.key());
}
// Forward iteration test
for (int i = 0; i < R; i++) {
SkipList<Key, Comparator>::Iterator iter(&list);
iter.Seek(i);
// Compare against model iterator
std::set<Key>::iterator model_iter = keys.lower_bound(i);
for (int j = 0; j < 3; j++) {
if (model_iter == keys.end()) {
ASSERT_TRUE(!iter.Valid());
break;
} else {
ASSERT_TRUE(iter.Valid());
ASSERT_EQ(*model_iter, iter.key());
++model_iter;
iter.Next();
}
}
}
// Backward iteration test
{
SkipList<Key, Comparator>::Iterator iter(&list);
iter.SeekToLast();
// Compare against model iterator
for (std::set<Key>::reverse_iterator model_iter = keys.rbegin();
model_iter != keys.rend();
++model_iter) {
ASSERT_TRUE(iter.Valid());
ASSERT_EQ(*model_iter, iter.key());
iter.Prev();
}
ASSERT_TRUE(!iter.Valid());
}
}
// We want to make sure that with a single writer and multiple
// concurrent readers (with no synchronization other than when a
// reader's iterator is created), the reader always observes all the
// data that was present in the skip list when the iterator was
// constructor. Because insertions are happening concurrently, we may
// also observe new values that were inserted since the iterator was
// constructed, but we should never miss any values that were present
// at iterator construction time.
//
// We generate multi-part keys:
// <key,gen,hash>
// where:
// key is in range [0..K-1]
// gen is a generation number for key
// hash is hash(key,gen)
//
// The insertion code picks a random key, sets gen to be 1 + the last
// generation number inserted for that key, and sets hash to Hash(key,gen).
//
// At the beginning of a read, we snapshot the last inserted
// generation number for each key. We then iterate, including random
// calls to Next() and Seek(). For every key we encounter, we
// check that it is either expected given the initial snapshot or has
// been concurrently added since the iterator started.
class ConcurrentTest {
private:
static const uint32_t K = 4;
static uint64_t key(Key key) { return (key >> 40); }
static uint64_t gen(Key key) { return (key >> 8) & 0xffffffffu; }
static uint64_t hash(Key key) { return key & 0xff; }
static uint64_t HashNumbers(uint64_t k, uint64_t g) {
uint64_t data[2] = { k, g };
return Hash(reinterpret_cast<char*>(data), sizeof(data), 0);
}
static Key MakeKey(uint64_t k, uint64_t g) {
assert(sizeof(Key) == sizeof(uint64_t));
assert(k <= K); // We sometimes pass K to seek to the end of the skiplist
assert(g <= 0xffffffffu);
return ((k << 40) | (g << 8) | (HashNumbers(k, g) & 0xff));
}
static bool IsValidKey(Key k) {
return hash(k) == (HashNumbers(key(k), gen(k)) & 0xff);
}
static Key RandomTarget(Random* rnd) {
switch (rnd->Next() % 10) {
case 0:
// Seek to beginning
return MakeKey(0, 0);
case 1:
// Seek to end
return MakeKey(K, 0);
default:
// Seek to middle
return MakeKey(rnd->Next() % K, 0);
}
}
// Per-key generation
struct State {
port::AtomicPointer generation[K];
void Set(int k, intptr_t v) {
generation[k].Release_Store(reinterpret_cast<void*>(v));
}
intptr_t Get(int k) {
return reinterpret_cast<intptr_t>(generation[k].Acquire_Load());
}
State() {
for (int k = 0; k < K; k++) {
Set(k, 0);
}
}
};
// Current state of the test
State current_;
Arena arena_;
// SkipList is not protected by mu_. We just use a single writer
// thread to modify it.
SkipList<Key, Comparator> list_;
public:
ConcurrentTest() : list_(Comparator(), &arena_) { }
// REQUIRES: External synchronization
void WriteStep(Random* rnd) {
const uint32_t k = rnd->Next() % K;
const intptr_t g = current_.Get(k) + 1;
const Key key = MakeKey(k, g);
list_.Insert(key);
current_.Set(k, g);
}
void ReadStep(Random* rnd) {
// Remember the initial committed state of the skiplist.
State initial_state;
for (int k = 0; k < K; k++) {
initial_state.Set(k, current_.Get(k));
}
Key pos = RandomTarget(rnd);
SkipList<Key, Comparator>::Iterator iter(&list_);
iter.Seek(pos);
while (true) {
Key current;
if (!iter.Valid()) {
current = MakeKey(K, 0);
} else {
current = iter.key();
ASSERT_TRUE(IsValidKey(current)) << std::hex << current;
}
ASSERT_LE(pos, current) << "should not go backwards";
// Verify that everything in [pos,current) was not present in
// initial_state.
while (pos < current) {
ASSERT_LT(key(pos), K) << std::hex << pos;
// Note that generation 0 is never inserted, so it is ok if
// <*,0,*> is missing.
ASSERT_TRUE((gen(pos) == 0) ||
(gen(pos) > initial_state.Get(key(pos)))
) << "key: " << key(pos)
<< "; gen: " << gen(pos)
<< "; initgen: "
<< initial_state.Get(key(pos));
// Advance to next key in the valid key space
if (key(pos) < key(current)) {
pos = MakeKey(key(pos) + 1, 0);
} else {
pos = MakeKey(key(pos), gen(pos) + 1);
}
}
if (!iter.Valid()) {
break;
}
if (rnd->Next() % 2) {
iter.Next();
pos = MakeKey(key(pos), gen(pos) + 1);
} else {
Key new_target = RandomTarget(rnd);
if (new_target > pos) {
pos = new_target;
iter.Seek(new_target);
}
}
}
}
};
const uint32_t ConcurrentTest::K;
// Simple test that does single-threaded testing of the ConcurrentTest
// scaffolding.
TEST(SkipTest, ConcurrentWithoutThreads) {
ConcurrentTest test;
Random rnd(test::RandomSeed());
for (int i = 0; i < 10000; i++) {
test.ReadStep(&rnd);
test.WriteStep(&rnd);
}
}
class TestState {
public:
ConcurrentTest t_;
int seed_;
port::AtomicPointer quit_flag_;
enum ReaderState {
STARTING,
RUNNING,
DONE
};
explicit TestState(int s)
: seed_(s),
quit_flag_(NULL),
state_(STARTING),
state_cv_(&mu_) {}
void Wait(ReaderState s) {
mu_.Lock();
while (state_ != s) {
state_cv_.Wait();
}
mu_.Unlock();
}
void Change(ReaderState s) {
mu_.Lock();
state_ = s;
state_cv_.Signal();
mu_.Unlock();
}
private:
port::Mutex mu_;
ReaderState state_;
port::CondVar state_cv_;
};
static void ConcurrentReader(void* arg) {
TestState* state = reinterpret_cast<TestState*>(arg);
Random rnd(state->seed_);
int64_t reads = 0;
state->Change(TestState::RUNNING);
while (!state->quit_flag_.Acquire_Load()) {
state->t_.ReadStep(&rnd);
++reads;
}
state->Change(TestState::DONE);
}
static void RunConcurrent(int run) {
const int seed = test::RandomSeed() + (run * 100);
Random rnd(seed);
const int N = 1000;
const int kSize = 1000;
for (int i = 0; i < N; i++) {
if ((i % 100) == 0) {
fprintf(stderr, "Run %d of %d\n", i, N);
}
TestState state(seed + 1);
Env::Default()->Schedule(ConcurrentReader, &state);
state.Wait(TestState::RUNNING);
for (int i = 0; i < kSize; i++) {
state.t_.WriteStep(&rnd);
}
state.quit_flag_.Release_Store(&state); // Any non-NULL arg will do
state.Wait(TestState::DONE);
}
}
TEST(SkipTest, Concurrent1) { RunConcurrent(1); }
TEST(SkipTest, Concurrent2) { RunConcurrent(2); }
TEST(SkipTest, Concurrent3) { RunConcurrent(3); }
TEST(SkipTest, Concurrent4) { RunConcurrent(4); }
TEST(SkipTest, Concurrent5) { RunConcurrent(5); }
}
int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}

66
db/snapshot.h Normal file
View File

@@ -0,0 +1,66 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_DB_SNAPSHOT_H_
#define STORAGE_LEVELDB_DB_SNAPSHOT_H_
#include "include/db.h"
namespace leveldb {
class SnapshotList;
// Snapshots are kept in a doubly-linked list in the DB.
// Each Snapshot corresponds to a particular sequence number.
class Snapshot {
public:
SequenceNumber number_; // const after creation
private:
friend class SnapshotList;
// Snapshot is kept in a doubly-linked circular list
Snapshot* prev_;
Snapshot* next_;
SnapshotList* list_; // just for sanity checks
};
class SnapshotList {
public:
SnapshotList() {
list_.prev_ = &list_;
list_.next_ = &list_;
}
bool empty() const { return list_.next_ == &list_; }
Snapshot* oldest() const { assert(!empty()); return list_.next_; }
Snapshot* newest() const { assert(!empty()); return list_.prev_; }
const Snapshot* New(SequenceNumber seq) {
Snapshot* s = new Snapshot;
s->number_ = seq;
s->list_ = this;
s->next_ = &list_;
s->prev_ = list_.prev_;
s->prev_->next_ = s;
s->next_->prev_ = s;
return s;
}
void Delete(const Snapshot* s) {
assert(s->list_ == this);
s->prev_->next_ = s->next_;
s->next_->prev_ = s->prev_;
delete s;
}
private:
// Dummy head of doubly-linked list of snapshots
Snapshot list_;
};
}
#endif // STORAGE_LEVELDB_DB_SNAPSHOT_H_

94
db/table_cache.cc Normal file
View File

@@ -0,0 +1,94 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/table_cache.h"
#include "db/filename.h"
#include "include/env.h"
#include "include/table.h"
#include "util/coding.h"
namespace leveldb {
struct TableAndFile {
RandomAccessFile* file;
Table* table;
};
static void DeleteEntry(const Slice& key, void* value) {
TableAndFile* tf = reinterpret_cast<TableAndFile*>(value);
delete tf->table;
delete tf->file;
delete tf;
}
static void UnrefEntry(void* arg1, void* arg2) {
Cache* cache = reinterpret_cast<Cache*>(arg1);
Cache::Handle* h = reinterpret_cast<Cache::Handle*>(arg2);
cache->Release(h);
}
TableCache::TableCache(const std::string& dbname,
const Options* options,
int entries)
: env_(options->env),
dbname_(dbname),
options_(options),
cache_(NewLRUCache(entries)) {
}
TableCache::~TableCache() {
delete cache_;
}
Iterator* TableCache::NewIterator(const ReadOptions& options,
uint64_t file_number,
Table** tableptr) {
if (tableptr != NULL) {
*tableptr = NULL;
}
char buf[sizeof(file_number)];
EncodeFixed64(buf, file_number);
Slice key(buf, sizeof(buf));
Cache::Handle* handle = cache_->Lookup(key);
if (handle == NULL) {
std::string fname = TableFileName(dbname_, file_number);
RandomAccessFile* file = NULL;
Table* table = NULL;
Status s = env_->NewRandomAccessFile(fname, &file);
if (s.ok()) {
s = Table::Open(*options_, file, &table);
}
if (!s.ok()) {
assert(table == NULL);
delete file;
// We do not cache error results so that if the error is transient,
// or somebody repairs the file, we recover automatically.
return NewErrorIterator(s);
}
TableAndFile* tf = new TableAndFile;
tf->file = file;
tf->table = table;
handle = cache_->Insert(key, tf, 1, &DeleteEntry);
}
Table* table = reinterpret_cast<TableAndFile*>(cache_->Value(handle))->table;
Iterator* result = table->NewIterator(options);
result->RegisterCleanup(&UnrefEntry, cache_, handle);
if (tableptr != NULL) {
*tableptr = table;
}
return result;
}
void TableCache::Evict(uint64_t file_number) {
char buf[sizeof(file_number)];
EncodeFixed64(buf, file_number);
cache_->Erase(Slice(buf, sizeof(buf)));
}
}

49
db/table_cache.h Normal file
View File

@@ -0,0 +1,49 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Thread-safe (provides internal synchronization)
#ifndef STORAGE_LEVELDB_DB_TABLE_CACHE_H_
#define STORAGE_LEVELDB_DB_TABLE_CACHE_H_
#include <string>
#include <stdint.h>
#include "db/dbformat.h"
#include "include/cache.h"
#include "include/table.h"
#include "port/port.h"
namespace leveldb {
class Env;
class TableCache {
public:
TableCache(const std::string& dbname, const Options* options, int entries);
~TableCache();
// Get an iterator for the specified file number and return it. If
// "tableptr" is non-NULL, also sets "*tableptr" to point to the
// Table object underlying the returned iterator, or NULL if no
// Table object underlies the returned iterator. The returned
// "*tableptr" object is owned by the cache and should not be
// deleted, and is valid for as long as the returned iterator is
// live.
Iterator* NewIterator(const ReadOptions& options,
uint64_t file_number,
Table** tableptr = NULL);
// Evict any entry for the specified file number
void Evict(uint64_t file_number);
private:
Env* const env_;
const std::string dbname_;
const Options* options_;
Cache* cache_;
};
}
#endif // STORAGE_LEVELDB_DB_TABLE_CACHE_H_

282
db/version_edit.cc Normal file
View File

@@ -0,0 +1,282 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/version_edit.h"
#include "db/version_set.h"
#include "util/coding.h"
namespace leveldb {
// Tag numbers for serialized VersionEdit. These numbers are written to
// disk and should not be changed.
enum Tag {
kComparator = 1,
kLogNumber = 2,
kNextFileNumber = 3,
kLastSequence = 4,
kCompactPointer = 5,
kDeletedFile = 6,
kNewFile = 7,
kLargeValueRef = 8,
};
void VersionEdit::Clear() {
comparator_.clear();
log_number_ = 0;
last_sequence_ = 0;
next_file_number_ = 0;
has_comparator_ = false;
has_log_number_ = false;
has_next_file_number_ = false;
has_last_sequence_ = false;
deleted_files_.clear();
new_files_.clear();
large_refs_added_.clear();
}
void VersionEdit::EncodeTo(std::string* dst) const {
if (has_comparator_) {
PutVarint32(dst, kComparator);
PutLengthPrefixedSlice(dst, comparator_);
}
if (has_log_number_) {
PutVarint32(dst, kLogNumber);
PutVarint64(dst, log_number_);
}
if (has_next_file_number_) {
PutVarint32(dst, kNextFileNumber);
PutVarint64(dst, next_file_number_);
}
if (has_last_sequence_) {
PutVarint32(dst, kLastSequence);
PutVarint64(dst, last_sequence_);
}
for (int i = 0; i < compact_pointers_.size(); i++) {
PutVarint32(dst, kCompactPointer);
PutVarint32(dst, compact_pointers_[i].first); // level
PutLengthPrefixedSlice(dst, compact_pointers_[i].second.Encode());
}
for (DeletedFileSet::const_iterator iter = deleted_files_.begin();
iter != deleted_files_.end();
++iter) {
PutVarint32(dst, kDeletedFile);
PutVarint32(dst, iter->first); // level
PutVarint64(dst, iter->second); // file number
}
for (int i = 0; i < new_files_.size(); i++) {
const FileMetaData& f = new_files_[i].second;
PutVarint32(dst, kNewFile);
PutVarint32(dst, new_files_[i].first); // level
PutVarint64(dst, f.number);
PutVarint64(dst, f.file_size);
PutLengthPrefixedSlice(dst, f.smallest.Encode());
PutLengthPrefixedSlice(dst, f.largest.Encode());
}
for (int i = 0; i < large_refs_added_.size(); i++) {
const VersionEdit::Large& l = large_refs_added_[i];
PutVarint32(dst, kLargeValueRef);
PutLengthPrefixedSlice(dst,
Slice(l.large_ref.data, LargeValueRef::ByteSize()));
PutVarint64(dst, l.fnum);
PutLengthPrefixedSlice(dst, l.internal_key.Encode());
}
}
static bool GetInternalKey(Slice* input, InternalKey* dst) {
Slice str;
if (GetLengthPrefixedSlice(input, &str)) {
dst->DecodeFrom(str);
return true;
} else {
return false;
}
}
static bool GetLevel(Slice* input, int* level) {
uint32_t v;
if (GetVarint32(input, &v) &&
v < config::kNumLevels) {
*level = v;
return true;
} else {
return false;
}
}
Status VersionEdit::DecodeFrom(const Slice& src) {
Clear();
Slice input = src;
const char* msg = NULL;
uint32_t tag;
// Temporary storage for parsing
int level;
uint64_t number;
FileMetaData f;
Slice str;
Large large;
InternalKey key;
while (msg == NULL && GetVarint32(&input, &tag)) {
switch (tag) {
case kComparator:
if (GetLengthPrefixedSlice(&input, &str)) {
comparator_ = str.ToString();
has_comparator_ = true;
} else {
msg = "comparator name";
}
break;
case kLogNumber:
if (GetVarint64(&input, &log_number_)) {
has_log_number_ = true;
} else {
msg = "log number";
}
break;
case kNextFileNumber:
if (GetVarint64(&input, &next_file_number_)) {
has_next_file_number_ = true;
} else {
msg = "next file number";
}
break;
case kLastSequence:
if (GetVarint64(&input, &last_sequence_)) {
has_last_sequence_ = true;
} else {
msg = "last sequence number";
}
break;
case kCompactPointer:
if (GetLevel(&input, &level) &&
GetInternalKey(&input, &key)) {
compact_pointers_.push_back(std::make_pair(level, key));
} else {
msg = "compaction pointer";
}
break;
case kDeletedFile:
if (GetLevel(&input, &level) &&
GetVarint64(&input, &number)) {
deleted_files_.insert(std::make_pair(level, number));
} else {
msg = "deleted file";
}
break;
case kNewFile:
if (GetLevel(&input, &level) &&
GetVarint64(&input, &f.number) &&
GetVarint64(&input, &f.file_size) &&
GetInternalKey(&input, &f.smallest) &&
GetInternalKey(&input, &f.largest)) {
new_files_.push_back(std::make_pair(level, f));
} else {
msg = "new-file entry";
}
break;
case kLargeValueRef:
if (GetLengthPrefixedSlice(&input, &str) &&
(str.size() == LargeValueRef::ByteSize()) &&
GetVarint64(&input, &large.fnum) &&
GetInternalKey(&input, &large.internal_key)) {
large.large_ref = LargeValueRef::FromRef(str);
large_refs_added_.push_back(large);
} else {
msg = "large ref";
}
break;
default:
msg = "unknown tag";
break;
}
}
if (msg == NULL && !input.empty()) {
msg = "invalid tag";
}
Status result;
if (msg != NULL) {
result = Status::Corruption("VersionEdit", msg);
}
return result;
}
std::string VersionEdit::DebugString() const {
std::string r;
r.append("VersionEdit {");
if (has_comparator_) {
r.append("\n Comparator: ");
r.append(comparator_);
}
if (has_log_number_) {
r.append("\n LogNumber: ");
AppendNumberTo(&r, log_number_);
}
if (has_next_file_number_) {
r.append("\n NextFile: ");
AppendNumberTo(&r, next_file_number_);
}
if (has_last_sequence_) {
r.append("\n LastSeq: ");
AppendNumberTo(&r, last_sequence_);
}
for (int i = 0; i < compact_pointers_.size(); i++) {
r.append("\n CompactPointer: ");
AppendNumberTo(&r, compact_pointers_[i].first);
r.append(" '");
AppendEscapedStringTo(&r, compact_pointers_[i].second.Encode());
r.append("'");
}
for (DeletedFileSet::const_iterator iter = deleted_files_.begin();
iter != deleted_files_.end();
++iter) {
r.append("\n DeleteFile: ");
AppendNumberTo(&r, iter->first);
r.append(" ");
AppendNumberTo(&r, iter->second);
}
for (int i = 0; i < new_files_.size(); i++) {
const FileMetaData& f = new_files_[i].second;
r.append("\n AddFile: ");
AppendNumberTo(&r, new_files_[i].first);
r.append(" ");
AppendNumberTo(&r, f.number);
r.append(" ");
AppendNumberTo(&r, f.file_size);
r.append(" '");
AppendEscapedStringTo(&r, f.smallest.Encode());
r.append("' .. '");
AppendEscapedStringTo(&r, f.largest.Encode());
r.append("'");
}
for (int i = 0; i < large_refs_added_.size(); i++) {
const VersionEdit::Large& l = large_refs_added_[i];
r.append("\n LargeRef: ");
AppendNumberTo(&r, l.fnum);
r.append(" ");
r.append(LargeValueRefToFilenameString(l.large_ref));
r.append(" '");
AppendEscapedStringTo(&r, l.internal_key.Encode());
r.append("'");
}
r.append("\n}\n");
return r;
}
}

118
db/version_edit.h Normal file
View File

@@ -0,0 +1,118 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_DB_VERSION_EDIT_H_
#define STORAGE_LEVELDB_DB_VERSION_EDIT_H_
#include <set>
#include <utility>
#include <vector>
#include "db/dbformat.h"
namespace leveldb {
class VersionSet;
struct FileMetaData {
int refs;
uint64_t number;
uint64_t file_size; // File size in bytes
InternalKey smallest; // Smallest internal key served by table
InternalKey largest; // Largest internal key served by table
FileMetaData() : refs(0), file_size(0) { }
};
class VersionEdit {
public:
VersionEdit() { Clear(); }
~VersionEdit() { }
void Clear();
void SetComparatorName(const Slice& name) {
has_comparator_ = true;
comparator_ = name.ToString();
}
void SetLogNumber(uint64_t num) {
has_log_number_ = true;
log_number_ = num;
}
void SetNextFile(uint64_t num) {
has_next_file_number_ = true;
next_file_number_ = num;
}
void SetLastSequence(SequenceNumber seq) {
has_last_sequence_ = true;
last_sequence_ = seq;
}
void SetCompactPointer(int level, const InternalKey& key) {
compact_pointers_.push_back(std::make_pair(level, key));
}
// Add the specified file at the specified number.
// REQUIRES: This version has not been saved (see VersionSet::SaveTo)
// REQUIRES: "smallest" and "largest" are smallest and largest keys in file
void AddFile(int level, uint64_t file,
uint64_t file_size,
const InternalKey& smallest,
const InternalKey& largest) {
FileMetaData f;
f.number = file;
f.file_size = file_size;
f.smallest = smallest;
f.largest = largest;
new_files_.push_back(std::make_pair(level, f));
}
// Delete the specified "file" from the specified "level".
void DeleteFile(int level, uint64_t file) {
deleted_files_.insert(std::make_pair(level, file));
}
// Record that a large value with the specified large_ref was
// written to the output file numbered "fnum"
void AddLargeValueRef(const LargeValueRef& large_ref,
uint64_t fnum,
const Slice& internal_key) {
large_refs_added_.resize(large_refs_added_.size() + 1);
Large* large = &(large_refs_added_.back());
large->large_ref = large_ref;
large->fnum = fnum;
large->internal_key.DecodeFrom(internal_key);
}
void EncodeTo(std::string* dst) const;
Status DecodeFrom(const Slice& src);
std::string DebugString() const;
private:
friend class VersionSet;
typedef std::set< std::pair<int, uint64_t> > DeletedFileSet;
std::string comparator_;
uint64_t log_number_;
uint64_t next_file_number_;
SequenceNumber last_sequence_;
bool has_comparator_;
bool has_log_number_;
bool has_next_file_number_;
bool has_last_sequence_;
std::vector< std::pair<int, InternalKey> > compact_pointers_;
DeletedFileSet deleted_files_;
std::vector< std::pair<int, FileMetaData> > new_files_;
struct Large {
LargeValueRef large_ref;
uint64_t fnum;
InternalKey internal_key;
};
std::vector<Large> large_refs_added_;
};
}
#endif // STORAGE_LEVELDB_DB_VERSION_EDIT_H_

50
db/version_edit_test.cc Normal file
View File

@@ -0,0 +1,50 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/version_edit.h"
#include "util/testharness.h"
namespace leveldb {
static void TestEncodeDecode(const VersionEdit& edit) {
std::string encoded, encoded2;
edit.EncodeTo(&encoded);
VersionEdit parsed;
Status s = parsed.DecodeFrom(encoded);
ASSERT_TRUE(s.ok()) << s.ToString();
parsed.EncodeTo(&encoded2);
ASSERT_EQ(encoded, encoded2);
}
class VersionEditTest { };
TEST(VersionEditTest, EncodeDecode) {
static const uint64_t kBig = 1ull << 50;
VersionEdit edit;
for (int i = 0; i < 4; i++) {
TestEncodeDecode(edit);
edit.AddFile(3, kBig + 300 + i, kBig + 400 + i,
InternalKey("foo", kBig + 500 + i, kTypeLargeValueRef),
InternalKey("zoo", kBig + 600 + i, kTypeDeletion));
edit.DeleteFile(4, kBig + 700 + i);
edit.AddLargeValueRef(LargeValueRef::Make("big", kNoCompression),
kBig + 800 + i, "foobar");
edit.AddLargeValueRef(LargeValueRef::Make("big2", kLightweightCompression),
kBig + 801 + i, "baz");
edit.SetCompactPointer(i, InternalKey("x", kBig + 900 + i, kTypeValue));
}
edit.SetComparatorName("foo");
edit.SetLogNumber(kBig + 100);
edit.SetNextFile(kBig + 200);
edit.SetLastSequence(kBig + 1000);
TestEncodeDecode(edit);
}
}
int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}

1003
db/version_set.cc Normal file

File diff suppressed because it is too large Load Diff

290
db/version_set.h Normal file
View File

@@ -0,0 +1,290 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// The representation of a DBImpl consists of a set of Versions. The
// newest version is called "current". Older versions may be kept
// around to provide a consistent view to live iterators.
//
// Each Version keeps track of a set of Table files per level. The
// entire set of versions is maintained in a VersionSet.
//
// Version,VersionSet are thread-compatible, but require external
// synchronization on all accesses.
#ifndef STORAGE_LEVELDB_DB_VERSION_SET_H_
#define STORAGE_LEVELDB_DB_VERSION_SET_H_
#include <map>
#include <set>
#include <vector>
#include "db/dbformat.h"
#include "db/version_edit.h"
#include "port/port.h"
namespace leveldb {
// Grouping of constants. We may want to make some of these
// parameters set via options.
namespace config {
static const int kNumLevels = 7;
}
namespace log { class Writer; }
class Compaction;
class Iterator;
class MemTable;
class TableBuilder;
class TableCache;
class Version;
class VersionSet;
class WritableFile;
class Version {
public:
// Append to *iters a sequence of iterators that will
// yield the contents of this Version when merged together.
// REQUIRES: This version has been saved (see VersionSet::SaveTo)
void AddIterators(const ReadOptions&, std::vector<Iterator*>* iters);
// Reference count management (so Versions do not disappear out from
// under live iterators)
void Ref();
void Unref();
// Return a human readable string that describes this version's contents.
std::string DebugString() const;
private:
friend class Compaction;
friend class VersionSet;
class LevelFileNumIterator;
Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const;
VersionSet* vset_; // VersionSet to which this Version belongs
Version* next_; // Next version in linked list
int refs_; // Number of live refs to this version
MemTable* cleanup_mem_; // NULL, or table to delete when version dropped
// List of files per level
std::vector<FileMetaData*> files_[config::kNumLevels];
// Level that should be compacted next and its compaction score.
// Score < 1 means compaction is not strictly needed. These fields
// are initialized by Finalize().
double compaction_score_;
int compaction_level_;
explicit Version(VersionSet* vset)
: vset_(vset), next_(NULL), refs_(0),
cleanup_mem_(NULL),
compaction_score_(-1),
compaction_level_(-1) {
}
~Version();
// No copying allowed
Version(const Version&);
void operator=(const Version&);
};
class VersionSet {
public:
VersionSet(const std::string& dbname,
const Options* options,
TableCache* table_cache,
const InternalKeyComparator*);
~VersionSet();
// Apply *edit to the current version to form a new descriptor that
// is both saved to persistent state and installed as the new
// current version. Iff Apply() returns OK, arrange to delete
// cleanup_mem (if cleanup_mem != NULL) when it is no longer needed
// by older versions.
Status LogAndApply(VersionEdit* edit, MemTable* cleanup_mem);
// Recover the last saved descriptor from persistent storage.
Status Recover(uint64_t* log_number, SequenceNumber* last_sequence);
// Save current contents to *log
Status WriteSnapshot(log::Writer* log);
// Return the current version.
Version* current() const { return current_; }
// Return the current manifest file number
uint64_t ManifestFileNumber() const { return manifest_file_number_; }
// Allocate and return a new file number
uint64_t NewFileNumber() { return next_file_number_++; }
// Return the number of Table files at the specified level.
int NumLevelFiles(int level) const;
// Pick level and inputs for a new compaction.
// Returns NULL if there is no compaction to be done.
// Otherwise returns a pointer to a heap-allocated object that
// describes the compaction. Caller should delete the result.
Compaction* PickCompaction();
// Return a compaction object for compacting the range [begin,end] in
// the specified level. Returns NULL if there is nothing in that
// level that overlaps the specified range. Caller should delete
// the result.
Compaction* CompactRange(
int level,
const InternalKey& begin,
const InternalKey& end);
// Create an iterator that reads over the compaction inputs for "*c".
// The caller should delete the iterator when no longer needed.
Iterator* MakeInputIterator(Compaction* c);
// Returns true iff some level needs a compaction.
bool NeedsCompaction() const { return current_->compaction_score_ >= 1; }
// Add all files listed in any live version to *live.
// May also mutate some internal state.
void AddLiveFiles(std::set<uint64_t>* live);
// Return the approximate offset in the database of the data for
// "key" as of version "v".
uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key);
// Register a reference to a large value with the specified
// large_ref from the specified file number. Returns "true" if this
// is the first recorded reference to the "large_ref" value in the
// database, and false otherwise.
bool RegisterLargeValueRef(const LargeValueRef& large_ref,
uint64_t filenum,
const InternalKey& internal_key);
// Cleanup the large value reference state by eliminating any
// references from files that are not includes in either "live_tables"
// or "log_file".
void CleanupLargeValueRefs(const std::set<uint64_t>& live_tables,
uint64_t log_file_num);
// Returns true if a large value with the given reference is live.
bool LargeValueIsLive(const LargeValueRef& large_ref);
private:
class Builder;
friend class Compaction;
friend class Version;
Status Finalize(Version* v);
// Delete any old versions that are no longer needed.
void MaybeDeleteOldVersions();
struct BySmallestKey;
Status SortLevel(Version* v, uint64_t level);
void GetOverlappingInputs(
int level,
const InternalKey& begin,
const InternalKey& end,
std::vector<FileMetaData*>* inputs);
void GetRange(const std::vector<FileMetaData*>& inputs,
InternalKey* smallest,
InternalKey* largest);
Env* const env_;
const std::string dbname_;
const Options* const options_;
TableCache* const table_cache_;
const InternalKeyComparator icmp_;
uint64_t next_file_number_;
uint64_t manifest_file_number_;
// Opened lazily
WritableFile* descriptor_file_;
log::Writer* descriptor_log_;
// Versions are kept in a singly linked list that is never empty
Version* current_; // Pointer to the last (newest) list entry
Version* oldest_; // Pointer to the first (oldest) list entry
// Map from large value reference to the set of <file numbers,internal_key>
// values containing references to the value. We keep the
// internal key as a std::string rather than as an InternalKey because
// we want to be able to easily use a set.
typedef std::set<std::pair<uint64_t, std::string> > LargeReferencesSet;
typedef std::map<LargeValueRef, LargeReferencesSet> LargeValueMap;
LargeValueMap large_value_refs_;
// Per-level key at which the next compaction at that level should start.
// Either an empty string, or a valid InternalKey.
std::string compact_pointer_[config::kNumLevels];
// No copying allowed
VersionSet(const VersionSet&);
void operator=(const VersionSet&);
};
// A Compaction encapsulates information about a compaction.
class Compaction {
public:
~Compaction();
// Return the level that is being compacted. Inputs from "level"
// and "level+1" will be merged to produce a set of "level+1" files.
int level() const { return level_; }
// Return the object that holds the edits to the descriptor done
// by this compaction.
VersionEdit* edit() { return &edit_; }
// "which" must be either 0 or 1
int num_input_files(int which) const { return inputs_[which].size(); }
// Return the ith input file at "level()+which" ("which" must be 0 or 1).
FileMetaData* input(int which, int i) const { return inputs_[which][i]; }
// Maximum size of files to build during this compaction.
uint64_t MaxOutputFileSize() const { return max_output_file_size_; }
// Add all inputs to this compaction as delete operations to *edit.
void AddInputDeletions(VersionEdit* edit);
// Returns true if the information we have available guarantees that
// the compaction is producing data in "level+1" for which no data exists
// in levels greater than "level+1".
bool IsBaseLevelForKey(const Slice& user_key);
// Release the input version for the compaction, once the compaction
// is successful.
void ReleaseInputs();
private:
friend class Version;
friend class VersionSet;
explicit Compaction(int level);
int level_;
uint64_t max_output_file_size_;
Version* input_version_;
VersionEdit edit_;
// Each compaction reads inputs from "level_" and "level_+1"
std::vector<FileMetaData*> inputs_[2]; // The two sets of inputs
// State for implementing IsBaseLevelForKey
// level_ptrs_ holds indices into input_version_->levels_: our state
// is that we are positioned at one of the file ranges for each
// higher level than the ones involved in this compaction (i.e. for
// all L >= level_ + 2).
int level_ptrs_[config::kNumLevels];
};
}
#endif // STORAGE_LEVELDB_DB_VERSION_SET_H_

164
db/write_batch.cc Normal file
View File

@@ -0,0 +1,164 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// WriteBatch::rep_ :=
// sequence: fixed64
// count: fixed32
// data: record[count]
// record :=
// kTypeValue varstring varstring |
// kTypeLargeValueRef varstring varstring |
// kTypeDeletion varstring
// varstring :=
// len: varint32
// data: uint8[len]
#include "include/write_batch.h"
#include "include/db.h"
#include "db/dbformat.h"
#include "db/memtable.h"
#include "db/write_batch_internal.h"
#include "util/coding.h"
namespace leveldb {
WriteBatch::WriteBatch() {
Clear();
}
WriteBatch::~WriteBatch() { }
void WriteBatch::Clear() {
rep_.clear();
rep_.resize(12);
}
int WriteBatchInternal::Count(const WriteBatch* b) {
return DecodeFixed32(b->rep_.data() + 8);
}
void WriteBatchInternal::SetCount(WriteBatch* b, int n) {
EncodeFixed32(&b->rep_[8], n);
}
SequenceNumber WriteBatchInternal::Sequence(const WriteBatch* b) {
return SequenceNumber(DecodeFixed64(b->rep_.data()));
}
void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) {
EncodeFixed64(&b->rep_[0], seq);
}
void WriteBatch::Put(const Slice& key, const Slice& value) {
WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
rep_.push_back(static_cast<char>(kTypeValue));
PutLengthPrefixedSlice(&rep_, key);
PutLengthPrefixedSlice(&rep_, value);
}
void WriteBatchInternal::PutLargeValueRef(WriteBatch* b,
const Slice& key,
const LargeValueRef& large_ref) {
WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
b->rep_.push_back(static_cast<char>(kTypeLargeValueRef));
PutLengthPrefixedSlice(&b->rep_, key);
PutLengthPrefixedSlice(&b->rep_,
Slice(large_ref.data, sizeof(large_ref.data)));
}
void WriteBatch::Delete(const Slice& key) {
WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
rep_.push_back(static_cast<char>(kTypeDeletion));
PutLengthPrefixedSlice(&rep_, key);
}
Status WriteBatchInternal::InsertInto(const WriteBatch* b,
MemTable* memtable) {
const int count = WriteBatchInternal::Count(b);
int found = 0;
Iterator it(*b);
for (; !it.Done(); it.Next()) {
switch (it.op()) {
case kTypeDeletion:
memtable->Add(it.sequence_number(), kTypeDeletion, it.key(), Slice());
break;
case kTypeValue:
memtable->Add(it.sequence_number(), kTypeValue, it.key(), it.value());
break;
case kTypeLargeValueRef:
memtable->Add(it.sequence_number(), kTypeLargeValueRef,
it.key(), it.value());
break;
}
found++;
}
if (!it.status().ok()) {
return it.status();
} else if (found != count) {
return Status::Corruption("wrong count in WriteBatch");
}
return Status::OK();
}
void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) {
assert(contents.size() >= 12);
b->rep_.assign(contents.data(), contents.size());
}
WriteBatchInternal::Iterator::Iterator(const WriteBatch& batch)
: input_(WriteBatchInternal::Contents(&batch)),
done_(false) {
if (input_.size() < 12) {
done_ = true;
} else {
seq_ = WriteBatchInternal::Sequence(&batch),
input_.remove_prefix(12);
GetNextEntry();
}
}
void WriteBatchInternal::Iterator::Next() {
assert(!done_);
seq_++;
GetNextEntry();
}
void WriteBatchInternal::Iterator::GetNextEntry() {
if (input_.empty()) {
done_ = true;
return;
}
char tag = input_[0];
input_.remove_prefix(1);
switch (tag) {
case kTypeValue:
case kTypeLargeValueRef:
if (GetLengthPrefixedSlice(&input_, &key_) &&
GetLengthPrefixedSlice(&input_, &value_)) {
op_ = static_cast<ValueType>(tag);
} else {
status_ = Status::Corruption("bad WriteBatch Put");
done_ = true;
input_.clear();
}
break;
case kTypeDeletion:
if (GetLengthPrefixedSlice(&input_, &key_)) {
op_ = kTypeDeletion;
} else {
status_ = Status::Corruption("bad WriteBatch Delete");
done_ = true;
input_.clear();
}
break;
default:
status_ = Status::Corruption("unknown WriteBatch tag");
done_ = true;
input_.clear();
break;
}
}
}

73
db/write_batch_internal.h Normal file
View File

@@ -0,0 +1,73 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
#define STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
#include "include/write_batch.h"
namespace leveldb {
// WriteBatchInternal provides static methods for manipulating a
// WriteBatch that we don't want in the public WriteBatch interface.
class WriteBatchInternal {
public:
static void PutLargeValueRef(WriteBatch* batch,
const Slice& key,
const LargeValueRef& large_ref);
// Return the number of entries in the batch.
static int Count(const WriteBatch* batch);
// Set the count for the number of entries in the batch.
static void SetCount(WriteBatch* batch, int n);
// Return the seqeunce number for the start of this batch.
static SequenceNumber Sequence(const WriteBatch* batch);
// Store the specified number as the seqeunce number for the start of
// this batch.
static void SetSequence(WriteBatch* batch, SequenceNumber seq);
static Slice Contents(const WriteBatch* batch) {
return Slice(batch->rep_);
}
static size_t ByteSize(const WriteBatch* batch) {
return batch->rep_.size();
}
static void SetContents(WriteBatch* batch, const Slice& contents);
static Status InsertInto(const WriteBatch* batch, MemTable* memtable);
// Iterate over the contents of a write batch.
class Iterator {
public:
explicit Iterator(const WriteBatch& batch);
bool Done() const { return done_; }
void Next();
ValueType op() const { return op_; }
const Slice& key() const { return key_; }
const Slice& value() const { return value_; }
SequenceNumber sequence_number() const { return seq_; }
Status status() const { return status_; }
private:
void GetNextEntry();
Slice input_;
bool done_;
ValueType op_;
Slice key_;
Slice value_;
SequenceNumber seq_;
Status status_;
};
};
}
#endif // STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_

110
db/write_batch_test.cc Normal file
View File

@@ -0,0 +1,110 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "include/db.h"
#include "db/memtable.h"
#include "db/write_batch_internal.h"
#include "include/env.h"
#include "util/logging.h"
#include "util/testharness.h"
namespace leveldb {
static std::string PrintContents(WriteBatch* b) {
InternalKeyComparator cmp(BytewiseComparator());
MemTable mem(cmp);
std::string state;
Status s = WriteBatchInternal::InsertInto(b, &mem);
Iterator* iter = mem.NewIterator();
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
ParsedInternalKey ikey;
ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey));
switch (ikey.type) {
case kTypeValue:
state.append("Put(");
state.append(ikey.user_key.ToString());
state.append(", ");
state.append(iter->value().ToString());
state.append(")");
break;
case kTypeLargeValueRef:
state.append("PutRef(");
state.append(ikey.user_key.ToString());
state.append(", ");
state.append(iter->value().ToString());
state.append(")");
break;
case kTypeDeletion:
state.append("Delete(");
state.append(ikey.user_key.ToString());
state.append(")");
break;
}
state.append("@");
state.append(NumberToString(ikey.sequence));
}
delete iter;
if (!s.ok()) {
state.append("ParseError()");
}
return state;
}
class WriteBatchTest { };
TEST(WriteBatchTest, Empty) {
WriteBatch batch;
ASSERT_EQ("", PrintContents(&batch));
ASSERT_EQ(0, WriteBatchInternal::Count(&batch));
}
TEST(WriteBatchTest, Multiple) {
WriteBatch batch;
batch.Put(Slice("foo"), Slice("bar"));
batch.Delete(Slice("box"));
batch.Put(Slice("baz"), Slice("boo"));
WriteBatchInternal::SetSequence(&batch, 100);
ASSERT_EQ(100, WriteBatchInternal::Sequence(&batch));
ASSERT_EQ(3, WriteBatchInternal::Count(&batch));
ASSERT_EQ("Put(baz, boo)@102"
"Delete(box)@101"
"Put(foo, bar)@100",
PrintContents(&batch));
}
TEST(WriteBatchTest, PutIndirect) {
WriteBatch batch;
batch.Put(Slice("baz"), Slice("boo"));
LargeValueRef h;
for (int i = 0; i < LargeValueRef::ByteSize(); i++) {
h.data[i] = (i < 20) ? 'a' : 'b';
}
WriteBatchInternal::PutLargeValueRef(&batch, Slice("foo"), h);
WriteBatchInternal::SetSequence(&batch, 100);
ASSERT_EQ(100, WriteBatchInternal::Sequence(&batch));
ASSERT_EQ(2, WriteBatchInternal::Count(&batch));
ASSERT_EQ("Put(baz, boo)@100"
"PutRef(foo, aaaaaaaaaaaaaaaaaaaabbbbbbbbb)@101",
PrintContents(&batch));
}
TEST(WriteBatchTest, Corruption) {
WriteBatch batch;
batch.Put(Slice("foo"), Slice("bar"));
batch.Delete(Slice("box"));
WriteBatchInternal::SetSequence(&batch, 200);
Slice contents = WriteBatchInternal::Contents(&batch);
WriteBatchInternal::SetContents(&batch,
Slice(contents.data(),contents.size()-1));
ASSERT_EQ("Put(foo, bar)@200"
"ParseError()",
PrintContents(&batch));
}
}
int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}