Initial checkin.

git-svn-id: https://leveldb.googlecode.com/svn/trunk@2 62dab493-f737-651d-591e-8d6aee1b9529
2025-12-06 17:27:55 +00:00 · 2011-03-18 22:37:00 +00:00
parent 54f1fd7eef
commit f67e15e50f
118 changed files with 19180 additions and 0 deletions
--- a/db/builder.cc
+++ b/db/builder.cc
@@ -0,0 +1,97 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/builder.h"
+
+#include "db/filename.h"
+#include "db/dbformat.h"
+#include "db/table_cache.h"
+#include "db/version_edit.h"
+#include "include/db.h"
+#include "include/env.h"
+#include "include/iterator.h"
+
+namespace leveldb {
+
+Status BuildTable(const std::string& dbname,
+                  Env* env,
+                  const Options& options,
+                  TableCache* table_cache,
+                  Iterator* iter,
+                  FileMetaData* meta,
+                  VersionEdit* edit) {
+  Status s;
+  meta->file_size = 0;
+  iter->SeekToFirst();
+
+  std::string fname = TableFileName(dbname, meta->number);
+  if (iter->Valid()) {
+    WritableFile* file;
+    s = env->NewWritableFile(fname, &file);
+    if (!s.ok()) {
+      return s;
+    }
+
+    TableBuilder* builder = new TableBuilder(options, file);
+    meta->smallest.DecodeFrom(iter->key());
+    for (; iter->Valid(); iter->Next()) {
+      Slice key = iter->key();
+      meta->largest.DecodeFrom(key);
+      if (ExtractValueType(key) == kTypeLargeValueRef) {
+        if (iter->value().size() != LargeValueRef::ByteSize()) {
+          s = Status::Corruption("invalid indirect reference hash value (L0)");
+          break;
+        }
+        edit->AddLargeValueRef(LargeValueRef::FromRef(iter->value()),
+                               meta->number,
+                               iter->key());
+      }
+      builder->Add(key, iter->value());
+    }
+
+    // Finish and check for builder errors
+    if (s.ok()) {
+      s = builder->Finish();
+      if (s.ok()) {
+        meta->file_size = builder->FileSize();
+        assert(meta->file_size > 0);
+      }
+    } else {
+      builder->Abandon();
+    }
+    delete builder;
+
+    // Finish and check for file errors
+    if (s.ok()) {
+      s = file->Sync();
+    }
+    if (s.ok()) {
+      s = file->Close();
+    }
+    delete file;
+    file = NULL;
+
+    if (s.ok()) {
+      // Verify that the table is usable
+      Iterator* it = table_cache->NewIterator(ReadOptions(), meta->number);
+      s = it->status();
+      delete it;
+    }
+  }
+
+  // Check for input iterator errors
+  if (!iter->status().ok()) {
+    s = iter->status();
+  }
+
+  if (s.ok() && meta->file_size > 0) {
+    edit->AddFile(0, meta->number, meta->file_size,
+                  meta->smallest, meta->largest);
+  } else {
+    env->DeleteFile(fname);
+  }
+  return s;
+}
+
+}
--- a/db/builder.h
+++ b/db/builder.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_BUILDER_H_
+#define STORAGE_LEVELDB_DB_BUILDER_H_
+
+#include "include/status.h"
+
+namespace leveldb {
+
+struct Options;
+struct FileMetaData;
+
+class Env;
+class Iterator;
+class TableCache;
+class VersionEdit;
+
+// Build a Table file from the contents of *iter.  The generated file
+// will be named according to meta->number.  On success, the rest of
+// *meta will be filled with metadata about the generated table, and
+// large value refs and the added file information will be added to
+// *edit.  If no data is present in *iter, meta->file_size will be set
+// to zero, and no Table file will be produced.
+extern Status BuildTable(const std::string& dbname,
+                         Env* env,
+                         const Options& options,
+                         TableCache* table_cache,
+                         Iterator* iter,
+                         FileMetaData* meta,
+                         VersionEdit* edit);
+
+}
+
+#endif  // STORAGE_LEVELDB_DB_BUILDER_H_
--- a/db/corruption_test.cc
+++ b/db/corruption_test.cc
@@ -0,0 +1,366 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "include/db.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include "include/env.h"
+#include "include/table.h"
+#include "include/write_batch.h"
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/version_set.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace leveldb {
+
+static const int kValueSize = 1000;
+
+class CorruptionTest {
+ public:
+  test::ErrorEnv env_;
+  Random rnd_;
+  std::string dbname_;
+  Options options_;
+  DB* db_;
+
+  CorruptionTest() : rnd_(test::RandomSeed()) {
+    options_.env = &env_;
+    dbname_ = test::TmpDir() + "/db_test";
+    DestroyDB(dbname_, options_);
+
+    db_ = NULL;
+    options_.create_if_missing = true;
+    Reopen();
+    options_.create_if_missing = false;
+  }
+
+  ~CorruptionTest() {
+     delete db_;
+     DestroyDB(dbname_, Options());
+  }
+
+  Status TryReopen(Options* options = NULL) {
+    delete db_;
+    db_ = NULL;
+    Options opt = (options ? *options : options_);
+    opt.env = &env_;
+    return DB::Open(opt, dbname_, &db_);
+  }
+
+  void Reopen(Options* options = NULL) {
+    ASSERT_OK(TryReopen(options));
+  }
+
+  void RepairDB() {
+    delete db_;
+    db_ = NULL;
+    ASSERT_OK(::leveldb::RepairDB(dbname_, options_));
+  }
+
+  void Build(int n) {
+    std::string key_space, value_space;
+    WriteBatch batch;
+    for (int i = 0; i < n; i++) {
+      //if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n);
+      Slice key = Key(i, &key_space);
+      batch.Clear();
+      batch.Put(key, Value(i, &value_space));
+      ASSERT_OK(db_->Write(WriteOptions(), &batch));
+    }
+  }
+
+  void Check(int min_expected, int max_expected) {
+    int next_expected = 0;
+    int missed = 0;
+    int bad_keys = 0;
+    int bad_values = 0;
+    int correct = 0;
+    std::string value_space;
+    Iterator* iter = db_->NewIterator(ReadOptions());
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      uint64_t key;
+      Slice in(iter->key());
+      if (!ConsumeDecimalNumber(&in, &key) ||
+          !in.empty() ||
+          key < next_expected) {
+        bad_keys++;
+        continue;
+      }
+      missed += (key - next_expected);
+      next_expected = key + 1;
+      if (iter->value() != Value(key, &value_space)) {
+        bad_values++;
+      } else {
+        correct++;
+      }
+    }
+    delete iter;
+
+    fprintf(stderr,
+            "expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%d\n",
+            min_expected, max_expected, correct, bad_keys, bad_values, missed);
+    ASSERT_LE(min_expected, correct);
+    ASSERT_GE(max_expected, correct);
+  }
+
+  void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) {
+    // Pick file to corrupt
+    std::vector<std::string> filenames;
+    ASSERT_OK(env_.GetChildren(dbname_, &filenames));
+    uint64_t number;
+    LargeValueRef large_ref;
+    FileType type;
+    std::vector<std::string> candidates;
+    for (int i = 0; i < filenames.size(); i++) {
+      if (ParseFileName(filenames[i], &number, &large_ref, &type) &&
+          type == filetype) {
+        candidates.push_back(dbname_ + "/" + filenames[i]);
+      }
+    }
+    ASSERT_TRUE(!candidates.empty()) << filetype;
+    std::string fname = candidates[rnd_.Uniform(candidates.size())];
+
+    struct stat sbuf;
+    if (stat(fname.c_str(), &sbuf) != 0) {	
+      const char* msg = strerror(errno);	
+      ASSERT_TRUE(false) << fname << ": " << msg;	
+    }	
+
+    if (offset < 0) {	
+      // Relative to end of file; make it absolute	
+      if (-offset > sbuf.st_size) {	
+        offset = 0;	
+      } else {	
+        offset = sbuf.st_size + offset;	
+      }
+    }
+    if (offset > sbuf.st_size) {
+      offset = sbuf.st_size;
+    }
+    if (offset + bytes_to_corrupt > sbuf.st_size) {
+      bytes_to_corrupt = sbuf.st_size - offset;
+    }
+
+    // Do it
+    std::string contents;
+    Status s = ReadFileToString(Env::Default(), fname, &contents);
+    ASSERT_TRUE(s.ok()) << s.ToString();
+    for (int i = 0; i < bytes_to_corrupt; i++) {
+      contents[i + offset] ^= 0x80;
+    }
+    s = WriteStringToFile(Env::Default(), contents, fname);
+    ASSERT_TRUE(s.ok()) << s.ToString();
+  }
+
+  uint64_t Property(const std::string& name) {
+    uint64_t result;
+    if (!db_->GetProperty(name, &result)) {
+      result = ~static_cast<uint64_t>(0);
+    }
+    return result;
+  }
+
+  // Return the ith key
+  Slice Key(int i, std::string* storage) {
+    char buf[100];
+    snprintf(buf, sizeof(buf), "%016d", i);
+    storage->assign(buf, strlen(buf));
+    return Slice(*storage);
+  }
+
+  // Return the value to associate with the specified key
+  Slice Value(int k, std::string* storage) {
+    Random r(k);
+    return test::RandomString(&r, kValueSize, storage);
+  }
+};
+
+TEST(CorruptionTest, Recovery) {
+  Build(10);
+  Check(10, 10);
+  Corrupt(kLogFile, 19, 1);      // WriteBatch tag for first record
+  Corrupt(kLogFile, 2*kValueSize, 1);  // Somewhere in second log record?
+  Reopen();
+  Check(8, 8);
+}
+
+TEST(CorruptionTest, RecoverWriteError) {
+  env_.writable_file_error_ = true;
+  Status s = TryReopen();
+  ASSERT_TRUE(!s.ok());
+}
+
+TEST(CorruptionTest, NewFileErrorDuringWrite) {
+  // Do enough writing to force minor compaction
+  env_.writable_file_error_ = true;
+  const int num = 3 + (Options().write_buffer_size / kValueSize);
+  std::string value_storage;
+  Status s;
+  for (int i = 0; s.ok() && i < num; i++) {
+    WriteBatch batch;
+    batch.Put("a", Value(100, &value_storage));
+    s = db_->Write(WriteOptions(), &batch);
+  }
+  ASSERT_TRUE(!s.ok());
+  ASSERT_GE(env_.num_writable_file_errors_, 1);
+  env_.writable_file_error_ = false;
+  Reopen();
+}
+
+TEST(CorruptionTest, TableFile) {
+  Build(100);
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+  dbi->TEST_CompactMemTable();
+  dbi->TEST_CompactRange(0, "", "~");
+  dbi->TEST_CompactRange(1, "", "~");
+
+  Corrupt(kTableFile, 100, 1);
+  Check(99, 99);
+}
+
+TEST(CorruptionTest, TableFileIndexData) {
+  Build(10000);  // Enough to build multiple Tables
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+  dbi->TEST_CompactMemTable();
+  dbi->TEST_CompactRange(0, "", "~");
+  dbi->TEST_CompactRange(1, "", "~");
+
+  Corrupt(kTableFile, -1000, 500);
+  Reopen();
+  Check(5000, 9999);
+}
+
+TEST(CorruptionTest, MissingDescriptor) {
+  Build(1000);
+  RepairDB();
+  Reopen();
+  Check(1000, 1000);
+}
+
+TEST(CorruptionTest, SequenceNumberRecovery) {
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3"));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4"));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5"));
+  RepairDB();
+  Reopen();
+  std::string v;
+  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+  ASSERT_EQ("v5", v);
+  // Write something.  If sequence number was not recovered properly,
+  // it will be hidden by an earlier write.
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6"));
+  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+  ASSERT_EQ("v6", v);
+  Reopen();
+  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+  ASSERT_EQ("v6", v);
+}
+
+TEST(CorruptionTest, LargeValueRecovery) {
+  Options options;
+  options.large_value_threshold = 10000;
+  Reopen(&options);
+
+  Random rnd(301);
+  std::string big;
+  ASSERT_OK(db_->Put(WriteOptions(),
+                     "foo", test::RandomString(&rnd, 100000, &big)));
+  std::string v;
+  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+  ASSERT_EQ(big, v);
+
+  RepairDB();
+  Reopen();
+  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+  ASSERT_EQ(big, v);
+
+  Reopen();
+  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+  ASSERT_EQ(big, v);
+}
+
+TEST(CorruptionTest, CorruptedDescriptor) {
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello"));
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+  dbi->TEST_CompactMemTable();
+  dbi->TEST_CompactRange(0, "", "~");
+
+  Corrupt(kDescriptorFile, 0, 1000);
+  Status s = TryReopen();
+  ASSERT_TRUE(!s.ok());
+
+  RepairDB();
+  Reopen();
+  std::string v;
+  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+  ASSERT_EQ("hello", v);
+}
+
+TEST(CorruptionTest, CompactionInputError) {
+  Build(10);
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+  dbi->TEST_CompactMemTable();
+  ASSERT_EQ(1, Property("leveldb.num-files-at-level0"));
+
+  Corrupt(kTableFile, 100, 1);
+  Check(9, 9);
+
+  // Force compactions by writing lots of values
+  Build(10000);
+  Check(10000, 10000);
+  dbi->TEST_CompactRange(0, "", "~");
+  ASSERT_EQ(0, Property("leveldb.num-files-at-level0"));
+}
+
+TEST(CorruptionTest, CompactionInputErrorParanoid) {
+  Options options;
+  options.paranoid_checks = true;
+  Reopen(&options);
+
+  Build(10);
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+  dbi->TEST_CompactMemTable();
+  ASSERT_EQ(1, Property("leveldb.num-files-at-level0"));
+
+  Corrupt(kTableFile, 100, 1);
+  Check(9, 9);
+
+  // Write must eventually fail because of corrupted table
+  Status s;
+  std::string tmp1, tmp2;
+  for (int i = 0; i < 10000 && s.ok(); i++) {
+    s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2));
+  }
+  ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db";
+}
+
+TEST(CorruptionTest, UnrelatedKeys) {
+  Build(10);
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+  dbi->TEST_CompactMemTable();
+  Corrupt(kTableFile, 100, 1);
+
+  std::string tmp1, tmp2;
+  ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2)));
+  std::string v;
+  ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
+  ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
+  dbi->TEST_CompactMemTable();
+  ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
+  ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
+}
+
+}
+
+int main(int argc, char** argv) {
+  return leveldb::test::RunAllTests();
+}
--- a/db/db_bench.cc
+++ b/db/db_bench.cc
@@ -0,0 +1,376 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "db/db_impl.h"
+#include "db/version_set.h"
+#include "include/cache.h"
+#include "include/db.h"
+#include "include/env.h"
+#include "include/write_batch.h"
+#include "util/histogram.h"
+#include "util/random.h"
+#include "util/testutil.h"
+
+// Comma-separated list of operations to run in the specified order
+//   Actual benchmarks:
+//      writeseq    -- write N values in sequential key order
+//      writerandom -- write N values in random key order
+//      writebig    -- write N/1000 100K valuesin random order
+//      readseq     -- read N values sequentially
+//      readrandom  -- read N values in random order
+//   Meta operations:
+//      compact     -- Compact the entire DB
+//      heapprofile -- Dump a heap profile (if supported by this port)
+//      sync        -- switch to synchronous writes (not the default)
+//      nosync      -- switch to asynchronous writes (the default)
+//      tenth       -- divide N by 10 (i.e., following benchmarks are smaller)
+//      normal      -- reset N back to its normal value (1000000)
+static const char* FLAGS_benchmarks =
+    "writeseq,"
+    "writeseq,"
+    "writerandom,"
+    "sync,tenth,tenth,writerandom,nosync,normal,"
+    "readseq,"
+    "readrandom,"
+    "compact,"
+    "readseq,"
+    "readrandom,"
+    "writebig";
+
+// Number of key/values to place in database
+static int FLAGS_num = 1000000;
+
+// Size of each value
+static int FLAGS_value_size = 100;
+
+// Arrange to generate values that shrink to this fraction of
+// their original size after compression
+static double FLAGS_compression_ratio = 0.25;
+
+// Print histogram of operation timings
+static bool FLAGS_histogram = false;
+
+// Number of bytes to buffer in memtable before compacting
+static int FLAGS_write_buffer_size = 1 << 20;
+
+namespace leveldb {
+
+// Helper for quickly generating random data.
+namespace {
+class RandomGenerator {
+ private:
+  std::string data_;
+  int pos_;
+
+ public:
+  RandomGenerator() {
+    // We use a limited amount of data over and over again and ensure
+    // that it is larger than the compression window (32KB), and also
+    // large enough to serve all typical value sizes we want to write.
+    Random rnd(301);
+    std::string piece;
+    while (data_.size() < 1048576) {
+      // Add a short fragment that is as compressible as specified
+      // by FLAGS_compression_ratio.
+      test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece);
+      data_.append(piece);
+    }
+    pos_ = 0;
+  }
+
+  Slice Generate(int len) {
+    if (pos_ + len > data_.size()) {
+      pos_ = 0;
+      assert(len < data_.size());
+    }
+    pos_ += len;
+    return Slice(data_.data() + pos_ - len, len);
+  }
+};
+}
+
+class Benchmark {
+ private:
+  Cache* cache_;
+  DB* db_;
+  int num_;
+  bool sync_;
+  int heap_counter_;
+  double start_;
+  double last_op_finish_;
+  int64_t bytes_;
+  std::string message_;
+  Histogram hist_;
+  RandomGenerator gen_;
+  Random rand_;
+
+  // State kept for progress messages
+  int done_;
+  int next_report_;     // When to report next
+
+  void Start() {
+    start_ = Env::Default()->NowMicros() * 1e-6;
+    bytes_ = 0;
+    message_.clear();
+    last_op_finish_ = start_;
+    hist_.Clear();
+    done_ = 0;
+    next_report_ = 100;
+  }
+
+  void FinishedSingleOp() {
+    if (FLAGS_histogram) {
+      double now = Env::Default()->NowMicros() * 1e-6;
+      double micros = (now - last_op_finish_) * 1e6;
+      hist_.Add(micros);
+      if (micros > 20000) {
+        fprintf(stderr, "long op: %.1f micros%30s\r", micros, "");
+        fflush(stderr);
+      }
+      last_op_finish_ = now;
+    }
+
+    done_++;
+    if (done_ >= next_report_) {
+      if (next_report_ < 1000) {
+        next_report_ += 100;
+      } else if (next_report_ < 10000) {
+        next_report_ += 1000;
+      } else if (next_report_ < 100000) {
+        next_report_ += 10000;
+      } else {
+        next_report_ += 100000;
+      }
+      fprintf(stderr, "... finished %d ops%30s\r", done_, "");
+      fflush(stderr);
+    }
+  }
+
+  void Stop(const Slice& name) {
+    double finish = Env::Default()->NowMicros() * 1e-6;
+
+    // Pretend at least one op was done in case we are running a benchmark
+    // that does nto call FinishedSingleOp().
+    if (done_ < 1) done_ = 1;
+
+    if (bytes_ > 0) {
+      char rate[100];
+      snprintf(rate, sizeof(rate), "%5.1f MB/s",
+               (bytes_ / 1048576.0) / (finish - start_));
+      if (!message_.empty()) {
+        message_.push_back(' ');
+      }
+      message_.append(rate);
+    }
+
+    fprintf(stdout, "%-12s : %10.3f micros/op;%s%s\n",
+            name.ToString().c_str(),
+            (finish - start_) * 1e6 / done_,
+            (message_.empty() ? "" : " "),
+            message_.c_str());
+    if (FLAGS_histogram) {
+      fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str());
+    }
+    fflush(stdout);
+  }
+
+ public:
+  enum Order { SEQUENTIAL, RANDOM };
+
+  Benchmark() : cache_(NewLRUCache(200<<20)),
+                db_(NULL),
+                num_(FLAGS_num),
+                sync_(false),
+                heap_counter_(0),
+                bytes_(0),
+                rand_(301) {
+    std::vector<std::string> files;
+    Env::Default()->GetChildren("/tmp/dbbench", &files);
+    for (int i = 0; i < files.size(); i++) {
+      if (Slice(files[i]).starts_with("heap-")) {
+        Env::Default()->DeleteFile("/tmp/dbbench/" + files[i]);
+      }
+    }
+    DestroyDB("/tmp/dbbench", Options());
+  }
+
+  ~Benchmark() {
+    delete db_;
+    delete cache_;
+  }
+
+  void Run() {
+    Options options;
+    options.create_if_missing = true;
+    options.max_open_files = 10000;
+    options.block_cache = cache_;
+    options.write_buffer_size = FLAGS_write_buffer_size;
+
+    Start();
+    Status s = DB::Open(options, "/tmp/dbbench", &db_);
+    Stop("open");
+    if (!s.ok()) {
+      fprintf(stderr, "open error: %s\n", s.ToString().c_str());
+      exit(1);
+    }
+
+    const char* benchmarks = FLAGS_benchmarks;
+    while (benchmarks != NULL) {
+      const char* sep = strchr(benchmarks, ',');
+      Slice name;
+      if (sep == NULL) {
+        name = benchmarks;
+        benchmarks = NULL;
+      } else {
+        name = Slice(benchmarks, sep - benchmarks);
+        benchmarks = sep + 1;
+      }
+
+      Start();
+      if (name == Slice("writeseq")) {
+        Write(SEQUENTIAL, num_, FLAGS_value_size);
+      } else if (name == Slice("writerandom")) {
+        Write(RANDOM, num_, FLAGS_value_size);
+      } else if (name == Slice("writebig")) {
+        Write(RANDOM, num_ / 1000, 100 * 1000);
+      } else if (name == Slice("readseq")) {
+        Read(SEQUENTIAL);
+      } else if (name == Slice("readrandom")) {
+        Read(RANDOM);
+      } else if (name == Slice("compact")) {
+        Compact();
+      } else if (name == Slice("heapprofile")) {
+        HeapProfile();
+      } else if (name == Slice("sync")) {
+        sync_ = true;
+      } else if (name == Slice("nosync")) {
+        sync_ = false;
+      } else if (name == Slice("tenth")) {
+        num_ = num_ / 10;
+      } else if (name == Slice("normal")) {
+        num_ = FLAGS_num;
+      } else {
+        fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str());
+      }
+      Stop(name);
+    }
+  }
+
+  void Write(Order order, int num_entries, int value_size) {
+    WriteBatch batch;
+    Status s;
+    std::string val;
+    WriteOptions options;
+    options.sync = sync_;
+    for (int i = 0; i < num_entries; i++) {
+      const int k = (order == SEQUENTIAL) ? i : (rand_.Next() % FLAGS_num);
+      char key[100];
+      snprintf(key, sizeof(key), "%012d", k);
+      batch.Clear();
+      batch.Put(key, gen_.Generate(value_size));
+      s = db_->Write(options, &batch);
+      bytes_ += value_size + strlen(key);
+      if (!s.ok()) {
+        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
+        exit(1);
+      }
+      FinishedSingleOp();
+    }
+  }
+
+  void Read(Order order) {
+    ReadOptions options;
+    if (order == SEQUENTIAL) {
+      Iterator* iter = db_->NewIterator(options);
+      int i = 0;
+      for (iter->SeekToFirst(); i < num_ && iter->Valid(); iter->Next()) {
+        bytes_ += iter->key().size() + iter->value().size();
+        FinishedSingleOp();
+        ++i;
+      }
+      delete iter;
+    } else {
+      std::string value;
+      for (int i = 0; i < num_; i++) {
+        char key[100];
+        const int k = (order == SEQUENTIAL) ? i : (rand_.Next() % FLAGS_num);
+        snprintf(key, sizeof(key), "%012d", k);
+        db_->Get(options, key, &value);
+        FinishedSingleOp();
+      }
+    }
+  }
+
+  void Compact() {
+    DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+    dbi->TEST_CompactMemTable();
+    int max_level_with_files = 1;
+    for (int level = 1; level < config::kNumLevels; level++) {
+      uint64_t v;
+      char name[100];
+      snprintf(name, sizeof(name), "leveldb.num-files-at-level%d", level);
+      if (db_->GetProperty(name, &v) && v > 0) {
+        max_level_with_files = level;
+      }
+    }
+    for (int level = 0; level < max_level_with_files; level++) {
+      dbi->TEST_CompactRange(level, "", "~");
+    }
+  }
+
+  static void WriteToFile(void* arg, const char* buf, int n) {
+    reinterpret_cast<WritableFile*>(arg)->Append(Slice(buf, n));
+  }
+
+  void HeapProfile() {
+    char fname[100];
+    snprintf(fname, sizeof(fname), "/tmp/dbbench/heap-%04d", ++heap_counter_);
+    WritableFile* file;
+    Status s = Env::Default()->NewWritableFile(fname, &file);
+    if (!s.ok()) {
+      message_ = s.ToString();
+      return;
+    }
+    bool ok = port::GetHeapProfile(WriteToFile, file);
+    delete file;
+    if (!ok) {
+      message_ = "not supported";
+      Env::Default()->DeleteFile(fname);
+    }
+  }
+};
+
+}
+
+int main(int argc, char** argv) {
+  for (int i = 1; i < argc; i++) {
+    double d;
+    int n;
+    char junk;
+    if (leveldb::Slice(argv[i]).starts_with("--benchmarks=")) {
+      FLAGS_benchmarks = argv[i] + strlen("--benchmarks=");
+    } else if (sscanf(argv[i], "--compression_ratio=%lf%c", &d, &junk) == 1) {
+      FLAGS_compression_ratio = d;
+    } else if (sscanf(argv[i], "--histogram=%d%c", &n, &junk) == 1 &&
+               (n == 0 || n == 1)) {
+      FLAGS_histogram = n;
+    } else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) {
+      FLAGS_num = n;
+    } else if (sscanf(argv[i], "--value_size=%d%c", &n, &junk) == 1) {
+      FLAGS_value_size = n;
+    } else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) {
+      FLAGS_write_buffer_size = n;
+    }  else {
+      fprintf(stderr, "Invalid flag '%s'\n", argv[i]);
+      exit(1);
+    }
+  }
+
+  leveldb::Benchmark benchmark;
+  benchmark.Run();
+  return 0;
+}
--- a/db/db_impl.cc
+++ b/db/db_impl.cc
--- a/db/db_impl.h
+++ b/db/db_impl.h
@@ -0,0 +1,192 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_DB_IMPL_H_
+#define STORAGE_LEVELDB_DB_DB_IMPL_H_
+
+#include <set>
+#include "db/dbformat.h"
+#include "db/log_writer.h"
+#include "db/snapshot.h"
+#include "include/db.h"
+#include "include/env.h"
+#include "port/port.h"
+
+namespace leveldb {
+
+class MemTable;
+class TableCache;
+class Version;
+class VersionEdit;
+class VersionSet;
+
+class DBImpl : public DB {
+ public:
+  DBImpl(const Options& options, const std::string& dbname);
+  virtual ~DBImpl();
+
+  // Implementations of the DB interface
+  virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value);
+  virtual Status Delete(const WriteOptions&, const Slice& key);
+  virtual Status Write(const WriteOptions& options, WriteBatch* updates);
+  virtual Status Get(const ReadOptions& options,
+                     const Slice& key,
+                     std::string* value);
+  virtual Iterator* NewIterator(const ReadOptions&);
+  virtual const Snapshot* GetSnapshot();
+  virtual void ReleaseSnapshot(const Snapshot* snapshot);
+  virtual bool GetProperty(const Slice& property, uint64_t* value);
+  virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes);
+
+  // Extra methods (for testing) that are not in the public DB interface
+
+  // Compact any files in the named level that overlap [begin,end]
+  void TEST_CompactRange(
+      int level,
+      const std::string& begin,
+      const std::string& end);
+
+  // Force current memtable contents to be compacted.
+  Status TEST_CompactMemTable();
+
+  // Return an internal iterator over the current state of the database.
+  // The keys of this iterator are internal keys (see format.h).
+  // The returned iterator should be deleted when no longer needed.
+  Iterator* TEST_NewInternalIterator();
+
+ private:
+  friend class DB;
+
+  Iterator* NewInternalIterator(const ReadOptions&,
+                                SequenceNumber* latest_snapshot);
+
+  Status NewDB();
+
+  // Recover the descriptor from persistent storage.  May do a significant
+  // amount of work to recover recently logged updates.  Any changes to
+  // be made to the descriptor are added to *edit.
+  Status Recover(VersionEdit* edit);
+
+  // Apply the specified updates and save the resulting descriptor to
+  // persistent storage.  If cleanup_mem is non-NULL, arrange to
+  // delete it when all existing snapshots have gone away iff Install()
+  // returns OK.
+  Status Install(VersionEdit* edit,
+                 uint64_t new_log_number,
+                 MemTable* cleanup_mem);
+
+  void MaybeIgnoreError(Status* s) const;
+
+  // Delete any unneeded files and stale in-memory entries.
+  void DeleteObsoleteFiles();
+
+  // Called when an iterator over a particular version of the
+  // descriptor goes away.
+  static void Unref(void* arg1, void* arg2);
+
+  // Compact the in-memory write buffer to disk.  Switches to a new
+  // log-file/memtable and writes a new descriptor iff successful.
+  Status CompactMemTable();
+
+  Status RecoverLogFile(uint64_t log_number,
+                        VersionEdit* edit,
+                        SequenceNumber* max_sequence);
+
+  Status WriteLevel0Table(MemTable* mem, VersionEdit* edit);
+
+  bool HasLargeValues(const WriteBatch& batch) const;
+
+  // Process data in "*updates" and return a status.  "assigned_seq"
+  // is the sequence number assigned to the first mod in "*updates".
+  // If no large values are encountered, "*final" is set to "updates".
+  // If large values were encountered, registers the references of the
+  // large values with the VersionSet, writes the large values to
+  // files (if appropriate), and allocates a new WriteBatch with the
+  // large values replaced with indirect references and stores a
+  // pointer to the new WriteBatch in *final.  If *final != updates on
+  // return, then the client should delete *final when no longer
+  // needed.  Returns OK on success, and an appropriate error
+  // otherwise.
+  Status HandleLargeValues(SequenceNumber assigned_seq,
+                           WriteBatch* updates,
+                           WriteBatch** final);
+
+  // Helper routine for HandleLargeValues
+  void MaybeCompressLargeValue(
+      const Slice& raw_value,
+      Slice* file_bytes,
+      std::string* scratch,
+      LargeValueRef* ref);
+
+  struct CompactionState;
+
+  void MaybeScheduleCompaction();
+  static void BGWork(void* db);
+  void BackgroundCall();
+  void BackgroundCompaction();
+  void CleanupCompaction(CompactionState* compact);
+  Status DoCompactionWork(CompactionState* compact);
+
+  Status OpenCompactionOutputFile(CompactionState* compact);
+  Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input);
+  Status InstallCompactionResults(CompactionState* compact);
+
+  // Constant after construction
+  Env* const env_;
+  const InternalKeyComparator internal_comparator_;
+  const Options options_;  // options_.comparator == &internal_comparator_
+  bool owns_info_log_;
+  const std::string dbname_;
+
+  // table_cache_ provides its own synchronization
+  TableCache* table_cache_;
+
+  // Lock over the persistent DB state.  Non-NULL iff successfully acquired.
+  FileLock* db_lock_;
+
+  // State below is protected by mutex_
+  port::Mutex mutex_;
+  port::AtomicPointer shutting_down_;
+  port::CondVar bg_cv_;         // Signalled when !bg_compaction_scheduled_
+  port::CondVar compacting_cv_;  // Signalled when !compacting_
+  SequenceNumber last_sequence_;
+  MemTable* mem_;
+  WritableFile* logfile_;
+  log::Writer* log_;
+  uint64_t log_number_;
+  SnapshotList snapshots_;
+
+  // Set of table files to protect from deletion because they are
+  // part of ongoing compactions.
+  std::set<uint64_t> pending_outputs_;
+
+  // Has a background compaction been scheduled or is running?
+  bool bg_compaction_scheduled_;
+
+  // Is there a compaction running?
+  bool compacting_;
+
+  VersionSet* versions_;
+
+  // Have we encountered a background error in paranoid mode?
+  Status bg_error_;
+
+  // No copying allowed
+  DBImpl(const DBImpl&);
+  void operator=(const DBImpl&);
+
+  const Comparator* user_comparator() const {
+    return internal_comparator_.user_comparator();
+  }
+};
+
+// Sanitize db options.  The caller should delete result.info_log if
+// it is not equal to src.info_log.
+extern Options SanitizeOptions(const std::string& db,
+                               const InternalKeyComparator* icmp,
+                               const Options& src);
+
+}
+
+#endif  // STORAGE_LEVELDB_DB_DB_IMPL_H_
--- a/db/db_iter.cc
+++ b/db/db_iter.cc
@@ -0,0 +1,412 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_iter.h"
+
+#include "db/filename.h"
+#include "db/dbformat.h"
+#include "include/env.h"
+#include "include/iterator.h"
+#include "port/port.h"
+#include "util/logging.h"
+#include "util/mutexlock.h"
+
+namespace leveldb {
+
+#if 0
+static void DumpInternalIter(Iterator* iter) {
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ParsedInternalKey k;
+    if (!ParseInternalKey(iter->key(), &k)) {
+      fprintf(stderr, "Corrupt '%s'\n", EscapeString(iter->key()).c_str());
+    } else {
+      fprintf(stderr, "@ '%s'\n", k.DebugString().c_str());
+    }
+  }
+}
+#endif
+
+namespace {
+
+// Memtables and sstables that make the DB representation contain
+// (userkey,seq,type) => uservalue entries.  DBIter
+// combines multiple entries for the same userkey found in the DB
+// representation into a single entry while accounting for sequence
+// numbers, deletion markers, overwrites, etc.
+class DBIter: public Iterator {
+ public:
+  DBIter(const std::string* dbname, Env* env,
+         const Comparator* cmp, Iterator* iter, SequenceNumber s)
+      : dbname_(dbname),
+        env_(env),
+        user_comparator_(cmp),
+        iter_(iter),
+        sequence_(s),
+        large_(NULL),
+        valid_(false) {
+  }
+  virtual ~DBIter() {
+    delete iter_;
+    delete large_;
+  }
+  virtual bool Valid() const { return valid_; }
+  virtual Slice key() const {
+    assert(valid_);
+    return key_;
+  }
+  virtual Slice value() const {
+    assert(valid_);
+    if (large_ == NULL) {
+      return value_;
+    } else {
+      MutexLock l(&large_->mutex);
+      if (!large_->produced) {
+        ReadIndirectValue();
+      }
+      return large_->value;
+    }
+  }
+
+  virtual void Next() {
+    assert(valid_);
+    // iter_ is already positioned past DBIter::key()
+    FindNextUserEntry();
+  }
+
+  virtual void Prev() {
+    assert(valid_);
+    bool ignored;
+    ScanUntilBeforeCurrentKey(&ignored);
+    FindPrevUserEntry();
+  }
+
+  virtual void Seek(const Slice& target) {
+    ParsedInternalKey ikey(target, sequence_, kValueTypeForSeek);
+    std::string tmp;
+    AppendInternalKey(&tmp, ikey);
+    iter_->Seek(tmp);
+    FindNextUserEntry();
+  }
+  virtual void SeekToFirst() {
+    iter_->SeekToFirst();
+    FindNextUserEntry();
+  }
+
+  virtual void SeekToLast();
+
+  virtual Status status() const {
+    if (status_.ok()) {
+      if (large_ != NULL && !large_->status.ok()) return large_->status;
+      return iter_->status();
+    } else {
+      return status_;
+    }
+  }
+
+ private:
+  void FindNextUserEntry();
+  void FindPrevUserEntry();
+  void SaveKey(const Slice& k) { key_.assign(k.data(), k.size()); }
+  void SaveValue(const Slice& v) {
+    if (value_.capacity() > v.size() + 1048576) {
+      std::string empty;
+      swap(empty, value_);
+    }
+    value_.assign(v.data(), v.size());
+  }
+  bool ParseKey(ParsedInternalKey* key);
+  void SkipPast(const Slice& k);
+  void ScanUntilBeforeCurrentKey(bool* found_live);
+
+  void ReadIndirectValue() const;
+
+  struct Large {
+    port::Mutex mutex;
+    std::string value;
+    bool produced;
+    Status status;
+  };
+
+  const std::string* const dbname_;
+  Env* const env_;
+
+  const Comparator* const user_comparator_;
+
+  // iter_ is positioned just past current entry for DBIter if valid_
+  Iterator* const iter_;
+
+  SequenceNumber const sequence_;
+  Status status_;
+  std::string key_;                  // Always a user key
+  std::string value_;
+  Large* large_;      // Non-NULL if value is an indirect reference
+  bool valid_;
+
+  // No copying allowed
+  DBIter(const DBIter&);
+  void operator=(const DBIter&);
+};
+
+inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
+  if (!ParseInternalKey(iter_->key(), ikey)) {
+    status_ = Status::Corruption("corrupted internal key in DBIter");
+    return false;
+  } else {
+    return true;
+  }
+}
+
+void DBIter::FindNextUserEntry() {
+  if (large_ != NULL) {
+    if (status_.ok() && !large_->status.ok()) {
+      status_ = large_->status;
+    }
+    delete large_;
+    large_ = NULL;
+  }
+  while (iter_->Valid()) {
+    ParsedInternalKey ikey;
+    if (!ParseKey(&ikey)) {
+      // Skip past corrupted entry
+      iter_->Next();
+      continue;
+    }
+    if (ikey.sequence > sequence_) {
+      // Ignore entries newer than the snapshot
+      iter_->Next();
+      continue;
+    }
+
+    switch (ikey.type) {
+      case kTypeDeletion:
+        SaveKey(ikey.user_key);  // Make local copy for use by SkipPast()
+        iter_->Next();
+        SkipPast(key_);
+        // Do not return deleted entries.  Instead keep looping.
+        break;
+
+      case kTypeValue:
+        SaveKey(ikey.user_key);
+        SaveValue(iter_->value());
+        iter_->Next();
+        SkipPast(key_);
+        // Yield the value we just found.
+        valid_ = true;
+        return;
+
+      case kTypeLargeValueRef:
+        SaveKey(ikey.user_key);
+        // Save the large value ref as value_, and read it lazily on a call
+        // to value()
+        SaveValue(iter_->value());
+        large_ = new Large;
+        large_->produced = false;
+        iter_->Next();
+        SkipPast(key_);
+        // Yield the value we just found.
+        valid_ = true;
+        return;
+    }
+  }
+  valid_ = false;
+  key_.clear();
+  value_.clear();
+  assert(large_ == NULL);
+}
+
+void DBIter::SkipPast(const Slice& k) {
+  while (iter_->Valid()) {
+    ParsedInternalKey ikey;
+    // Note that if we cannot parse an internal key, we keep looping
+    // so that if we have a run like the following:
+    //     <x,100,v> => value100
+    //     <corrupted entry for user key x>
+    //     <x,50,v> => value50
+    // we will skip over the corrupted entry as well as value50.
+    if (ParseKey(&ikey) && user_comparator_->Compare(ikey.user_key, k) != 0) {
+      break;
+    }
+    iter_->Next();
+  }
+}
+
+void DBIter::SeekToLast() {
+  // Position iter_ at the last uncorrupted user key and then
+  // let FindPrevUserEntry() do the heavy lifting to find
+  // a user key that is live.
+  iter_->SeekToLast();
+  ParsedInternalKey current;
+  while (iter_->Valid() && !ParseKey(&current)) {
+    iter_->Prev();
+  }
+  if (iter_->Valid()) {
+    SaveKey(current.user_key);
+  }
+  FindPrevUserEntry();
+}
+
+// Let X be the user key at which iter_ is currently positioned.
+// Adjust DBIter to point at the last entry with a key <= X that
+// has a live value.
+void DBIter::FindPrevUserEntry() {
+  // Consider the following example:
+  //
+  //     A@540
+  //     A@400
+  //
+  //     B@300
+  //     B@200
+  //     B@100        <- iter_
+  //
+  //     C@301
+  //     C@201
+  //
+  // The comments marked "(first iteration)" below relate what happens
+  // for the preceding example in the first iteration of the while loop
+  // below.  There may be more than one iteration either if there are
+  // no live values for B, or if there is a corruption.
+  while (iter_->Valid()) {
+    std::string saved = key_;
+    bool found_live;
+    ScanUntilBeforeCurrentKey(&found_live);
+    // (first iteration) iter_ at A@400
+    if (found_live) {
+      // Step forward into range of entries with user key >= saved
+      if (!iter_->Valid()) {
+        iter_->SeekToFirst();
+      } else {
+        iter_->Next();
+      }
+      // (first iteration) iter_ at B@300
+
+      FindNextUserEntry();  // Sets key_ to the key of the next value it found
+      if (valid_ && user_comparator_->Compare(key_, saved) == 0) {
+        // (first iteration) iter_ at C@301
+        return;
+      }
+
+      // FindNextUserEntry() could not find any entries under the
+      // user key "saved".  This is probably a corruption since
+      // ScanUntilBefore(saved) found a live value.  So we skip
+      // backwards to an earlier key and ignore the corrupted
+      // entries for "saved".
+      //
+      // (first iteration) iter_ at C@301 and saved == "B"
+      key_ = saved;
+      bool ignored;
+      ScanUntilBeforeCurrentKey(&ignored);
+      // (first iteration) iter_ at A@400
+    }
+  }
+  valid_ = false;
+  key_.clear();
+  value_.clear();
+}
+
+void DBIter::ScanUntilBeforeCurrentKey(bool* found_live) {
+  *found_live = false;
+  if (!iter_->Valid()) {
+    iter_->SeekToLast();
+  }
+
+  while (iter_->Valid()) {
+    ParsedInternalKey current;
+    if (!ParseKey(&current)) {
+      iter_->Prev();
+      continue;
+    }
+
+    if (current.sequence > sequence_) {
+      // Ignore entries that are serialized after this read
+      iter_->Prev();
+      continue;
+    }
+
+    const int cmp = user_comparator_->Compare(current.user_key, key_);
+    if (cmp < 0) {
+      SaveKey(current.user_key);
+      return;
+    } else if (cmp == 0) {
+      switch (current.type) {
+        case kTypeDeletion:
+          *found_live = false;
+          break;
+
+        case kTypeValue:
+        case kTypeLargeValueRef:
+          *found_live = true;
+          break;
+      }
+    } else {  // cmp > 0
+      *found_live = false;
+    }
+
+    iter_->Prev();
+  }
+}
+
+void DBIter::ReadIndirectValue() const {
+  assert(!large_->produced);
+  large_->produced = true;
+  LargeValueRef large_ref;
+  if (value_.size() != LargeValueRef::ByteSize()) {
+    large_->status = Status::Corruption("malformed large value reference");
+    return;
+  }
+  memcpy(large_ref.data, value_.data(), LargeValueRef::ByteSize());
+  std::string fname = LargeValueFileName(*dbname_, large_ref);
+  RandomAccessFile* file;
+  Status s = env_->NewRandomAccessFile(fname, &file);
+  if (s.ok()) {
+    uint64_t file_size = file->Size();
+    uint64_t value_size = large_ref.ValueSize();
+    large_->value.resize(value_size);
+    Slice result;
+    s = file->Read(0, file_size, &result,
+                   const_cast<char*>(large_->value.data()));
+    if (s.ok()) {
+      if (result.size() == file_size) {
+        switch (large_ref.compression_type()) {
+          case kNoCompression: {
+            if (result.data() != large_->value.data()) {
+              large_->value.assign(result.data(), result.size());
+            }
+            break;
+          }
+          case kLightweightCompression: {
+            std::string uncompressed;
+            if (port::Lightweight_Uncompress(result.data(), result.size(),
+                                       &uncompressed) &&
+                uncompressed.size() == large_ref.ValueSize()) {
+              swap(uncompressed, large_->value);
+            } else {
+              s = Status::Corruption(
+                  "Unable to read entire compressed large value file");
+            }
+          }
+        }
+      } else {
+        s = Status::Corruption("Unable to read entire large value file");
+      }
+    }
+    delete file;        // Ignore errors on closing
+  }
+  if (!s.ok()) {
+    large_->value.clear();
+    large_->status = s;
+  }
+}
+
+}  // anonymous namespace
+
+Iterator* NewDBIterator(
+    const std::string* dbname,
+    Env* env,
+    const Comparator* user_key_comparator,
+    Iterator* internal_iter,
+    const SequenceNumber& sequence) {
+  return new DBIter(dbname, env, user_key_comparator, internal_iter, sequence);
+}
+
+}
--- a/db/db_iter.h
+++ b/db/db_iter.h
@@ -0,0 +1,26 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_DB_ITER_H_
+#define STORAGE_LEVELDB_DB_DB_ITER_H_
+
+#include <stdint.h>
+#include "include/db.h"
+#include "db/dbformat.h"
+
+namespace leveldb {
+
+// Return a new iterator that converts internal keys (yielded by
+// "*internal_iter") that were live at the specified "sequence" number
+// into appropriate user keys.
+extern Iterator* NewDBIterator(
+    const std::string* dbname,
+    Env* env,
+    const Comparator* user_key_comparator,
+    Iterator* internal_iter,
+    const SequenceNumber& sequence);
+
+}
+
+#endif  // STORAGE_LEVELDB_DB_DB_ITER_H_
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -0,0 +1,963 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "include/db.h"
+
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "include/env.h"
+#include "include/table.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace leveldb {
+
+static std::string RandomString(Random* rnd, int len) {
+  std::string r;
+  test::RandomString(rnd, len, &r);
+  return r;
+}
+
+class DBTest {
+ public:
+  std::string dbname_;
+  Env* env_;
+  DB* db_;
+
+  Options last_options_;
+
+  DBTest() : env_(Env::Default()) {
+    dbname_ = test::TmpDir() + "/db_test";
+    DestroyDB(dbname_, Options());
+    db_ = NULL;
+    Reopen();
+  }
+
+  ~DBTest() {
+    delete db_;
+    DestroyDB(dbname_, Options());
+  }
+
+  DBImpl* dbfull() {
+    return reinterpret_cast<DBImpl*>(db_);
+  }
+
+  void Reopen(Options* options = NULL) {
+    ASSERT_OK(TryReopen(options));
+  }
+
+  void DestroyAndReopen(Options* options = NULL) {
+    delete db_;
+    db_ = NULL;
+    DestroyDB(dbname_, Options());
+    ASSERT_OK(TryReopen(options));
+  }
+
+  Status TryReopen(Options* options) {
+    delete db_;
+    db_ = NULL;
+    Options opts;
+    if (options != NULL) {
+      opts = *options;
+    } else {
+      opts.create_if_missing = true;
+    }
+    last_options_ = opts;
+
+    return DB::Open(opts, dbname_, &db_);
+  }
+
+  Status Put(const std::string& k, const std::string& v) {
+    WriteBatch batch;
+    batch.Put(k, v);
+    return db_->Write(WriteOptions(), &batch);
+  }
+
+  Status Delete(const std::string& k) {
+    WriteBatch batch;
+    batch.Delete(k);
+    return db_->Write(WriteOptions(), &batch);
+  }
+
+  std::string Get(const std::string& k, const Snapshot* snapshot = NULL) {
+    ReadOptions options;
+    options.snapshot = snapshot;
+    std::string result;
+    Status s = db_->Get(options, k, &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+
+  std::string AllEntriesFor(const Slice& user_key) {
+    Iterator* iter = dbfull()->TEST_NewInternalIterator();
+    InternalKey target(user_key, kMaxSequenceNumber, kTypeValue);
+    iter->Seek(target.Encode());
+    std::string result;
+    if (!iter->status().ok()) {
+      result = iter->status().ToString();
+    } else {
+      result = "[ ";
+      bool first = true;
+      while (iter->Valid()) {
+        ParsedInternalKey ikey;
+        if (!ParseInternalKey(iter->key(), &ikey)) {
+          result += "CORRUPTED";
+        } else {
+          if (last_options_.comparator->Compare(
+                  ikey.user_key, user_key) != 0) {
+            break;
+          }
+          if (!first) {
+            result += ", ";
+          }
+          first = false;
+          switch (ikey.type) {
+            case kTypeValue:
+              result += iter->value().ToString();
+              break;
+            case kTypeLargeValueRef:
+              result += "LARGEVALUE(" + EscapeString(iter->value()) + ")";
+              break;
+            case kTypeDeletion:
+              result += "DEL";
+              break;
+          }
+        }
+        iter->Next();
+      }
+      if (!first) {
+        result += " ";
+      }
+      result += "]";
+    }
+    delete iter;
+    return result;
+  }
+
+  int NumTableFilesAtLevel(int level) {
+    uint64_t val;
+    ASSERT_TRUE(
+        db_->GetProperty("leveldb.num-files-at-level" + NumberToString(level),
+                         &val));
+    return val;
+  }
+
+  uint64_t Size(const Slice& start, const Slice& limit) {
+    Range r(start, limit);
+    uint64_t size;
+    db_->GetApproximateSizes(&r, 1, &size);
+    return size;
+  }
+
+  std::set<LargeValueRef> LargeValueFiles() const {
+    // Return the set of large value files that exist in the database
+    std::vector<std::string> filenames;
+    env_->GetChildren(dbname_, &filenames); // Ignoring errors on purpose
+    uint64_t number;
+    LargeValueRef large_ref;
+    FileType type;
+    std::set<LargeValueRef> live;
+    for (int i = 0; i < filenames.size(); i++) {
+      if (ParseFileName(filenames[i], &number, &large_ref, &type) &&
+          type == kLargeValueFile) {
+        fprintf(stderr, "  live: %s\n",
+                LargeValueRefToFilenameString(large_ref).c_str());
+        live.insert(large_ref);
+      }
+    }
+    fprintf(stderr, "Found %d live large value files\n", (int)live.size());
+    return live;
+  }
+};
+
+TEST(DBTest, Empty) {
+  ASSERT_TRUE(db_ != NULL);
+  ASSERT_EQ("NOT_FOUND", Get("foo"));
+}
+
+TEST(DBTest, ReadWrite) {
+  ASSERT_OK(Put("foo", "v1"));
+  ASSERT_EQ("v1", Get("foo"));
+  ASSERT_OK(Put("bar", "v2"));
+  ASSERT_OK(Put("foo", "v3"));
+  ASSERT_EQ("v3", Get("foo"));
+  ASSERT_EQ("v2", Get("bar"));
+}
+
+TEST(DBTest, PutDeleteGet) {
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
+  ASSERT_EQ("v1", Get("foo"));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
+  ASSERT_EQ("v2", Get("foo"));
+  ASSERT_OK(db_->Delete(WriteOptions(), "foo"));
+  ASSERT_EQ("NOT_FOUND", Get("foo"));
+}
+
+TEST(DBTest, Recover) {
+  ASSERT_OK(Put("foo", "v1"));
+  ASSERT_OK(Put("baz", "v5"));
+
+  Reopen();
+  ASSERT_EQ("v1", Get("foo"));
+
+  ASSERT_EQ("v1", Get("foo"));
+  ASSERT_EQ("v5", Get("baz"));
+  ASSERT_OK(Put("bar", "v2"));
+  ASSERT_OK(Put("foo", "v3"));
+
+  Reopen();
+  ASSERT_EQ("v3", Get("foo"));
+  ASSERT_OK(Put("foo", "v4"));
+  ASSERT_EQ("v4", Get("foo"));
+  ASSERT_EQ("v2", Get("bar"));
+  ASSERT_EQ("v5", Get("baz"));
+}
+
+TEST(DBTest, RecoveryWithEmptyLog) {
+  ASSERT_OK(Put("foo", "v1"));
+  ASSERT_OK(Put("foo", "v2"));
+  Reopen();
+  Reopen();
+  ASSERT_OK(Put("foo", "v3"));
+  Reopen();
+  ASSERT_EQ("v3", Get("foo"));
+}
+
+static std::string Key(int i) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "key%06d", i);
+  return std::string(buf);
+}
+
+TEST(DBTest, MinorCompactionsHappen) {
+  Options options;
+  options.write_buffer_size = 10000;
+  Reopen(&options);
+
+  const int N = 100;
+
+  int starting_num_tables = NumTableFilesAtLevel(0);
+  for (int i = 0; i < N; i++) {
+    ASSERT_OK(Put(Key(i), Key(i) + std::string(1000, 'v')));
+  }
+  int ending_num_tables = NumTableFilesAtLevel(0);
+  ASSERT_GT(ending_num_tables, starting_num_tables);
+
+  for (int i = 0; i < N; i++) {
+    ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i)));
+  }
+
+  Reopen();
+
+  for (int i = 0; i < N; i++) {
+    ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i)));
+  }
+}
+
+TEST(DBTest, RecoverWithLargeLog) {
+  {
+    Options options;
+    options.large_value_threshold = 1048576;
+    Reopen(&options);
+    ASSERT_OK(Put("big1", std::string(200000, '1')));
+    ASSERT_OK(Put("big2", std::string(200000, '2')));
+    ASSERT_OK(Put("small3", std::string(10, '3')));
+    ASSERT_OK(Put("small4", std::string(10, '4')));
+    ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  }
+
+  // Make sure that if we re-open with a small write buffer size that
+  // we flush table files in the middle of a large log file.
+  Options options;
+  options.write_buffer_size = 100000;
+  options.large_value_threshold = 1048576;
+  Reopen(&options);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 3);
+  ASSERT_EQ(std::string(200000, '1'), Get("big1"));
+  ASSERT_EQ(std::string(200000, '2'), Get("big2"));
+  ASSERT_EQ(std::string(10, '3'), Get("small3"));
+  ASSERT_EQ(std::string(10, '4'), Get("small4"));
+  ASSERT_GT(NumTableFilesAtLevel(0), 1);
+}
+
+TEST(DBTest, CompactionsGenerateMultipleFiles) {
+  Options options;
+  options.write_buffer_size = 100000000;        // Large write buffer
+  options.large_value_threshold = 1048576;
+  Reopen(&options);
+
+  Random rnd(301);
+
+  // Write 8MB (80 values, each 100K)
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  std::vector<std::string> values;
+  for (int i = 0; i < 80; i++) {
+    values.push_back(RandomString(&rnd, 100000));
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+
+  // Reopening moves updates to level-0
+  Reopen(&options);
+  dbfull()->TEST_CompactRange(0, "", Key(100000));
+
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  ASSERT_GT(NumTableFilesAtLevel(1), 1);
+  for (int i = 0; i < 80; i++) {
+    ASSERT_EQ(Get(Key(i)), values[i]);
+  }
+}
+
+static bool Between(uint64_t val, uint64_t low, uint64_t high) {
+  bool result = (val >= low) && (val <= high);
+  if (!result) {
+    fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n",
+            (unsigned long long)(val),
+            (unsigned long long)(low),
+            (unsigned long long)(high));
+  }
+  return result;
+}
+
+TEST(DBTest, ApproximateSizes) {
+  for (int test = 0; test < 2; test++) {
+    // test==0: default large_value_threshold
+    // test==1: 1 MB large_value_threshold
+    Options options;
+    options.large_value_threshold = (test == 0) ? 65536 : 1048576;
+    options.write_buffer_size = 100000000;        // Large write buffer
+    options.compression = kNoCompression;
+    DestroyAndReopen();
+
+    ASSERT_TRUE(Between(Size("", "xyz"), 0, 0));
+    Reopen(&options);
+    ASSERT_TRUE(Between(Size("", "xyz"), 0, 0));
+
+    // Write 8MB (80 values, each 100K)
+    ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+    const int N = 80;
+    Random rnd(301);
+    for (int i = 0; i < N; i++) {
+      ASSERT_OK(Put(Key(i), RandomString(&rnd, 100000)));
+    }
+    if (test == 1) {
+      // 0 because GetApproximateSizes() does not account for memtable space for
+      // non-large values
+      ASSERT_TRUE(Between(Size("", Key(50)), 0, 0));
+    } else {
+      ASSERT_TRUE(Between(Size("", Key(50)), 100000*50, 100000*50 + 10000));
+      ASSERT_TRUE(Between(Size(Key(20), Key(30)),
+                          100000*10, 100000*10 + 10000));
+    }
+
+    // Check sizes across recovery by reopening a few times
+    for (int run = 0; run < 3; run++) {
+      Reopen(&options);
+
+      for (int compact_start = 0; compact_start < N; compact_start += 10) {
+        for (int i = 0; i < N; i += 10) {
+          ASSERT_TRUE(Between(Size("", Key(i)), 100000*i, 100000*i + 10000));
+          ASSERT_TRUE(Between(Size("", Key(i)+".suffix"),
+                              100000 * (i+1), 100000 * (i+1) + 10000));
+          ASSERT_TRUE(Between(Size(Key(i), Key(i+10)),
+                              100000 * 10, 100000 * 10 + 10000));
+        }
+        ASSERT_TRUE(Between(Size("", Key(50)), 5000000, 5010000));
+        ASSERT_TRUE(Between(Size("", Key(50)+".suffix"), 5100000, 5110000));
+
+        dbfull()->TEST_CompactRange(0,
+                                    Key(compact_start),
+                                    Key(compact_start + 9));
+      }
+
+      ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+      ASSERT_GT(NumTableFilesAtLevel(1), 0);
+    }
+  }
+}
+
+TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
+  Options options;
+  options.large_value_threshold = 65536;
+  options.compression = kNoCompression;
+  Reopen();
+
+  Random rnd(301);
+  std::string big1 = RandomString(&rnd, 100000);
+  ASSERT_OK(Put(Key(0), RandomString(&rnd, 10000)));
+  ASSERT_OK(Put(Key(1), RandomString(&rnd, 10000)));
+  ASSERT_OK(Put(Key(2), big1));
+  ASSERT_OK(Put(Key(3), RandomString(&rnd, 10000)));
+  ASSERT_OK(Put(Key(4), big1));
+  ASSERT_OK(Put(Key(5), RandomString(&rnd, 10000)));
+  ASSERT_OK(Put(Key(6), RandomString(&rnd, 300000)));
+  ASSERT_OK(Put(Key(7), RandomString(&rnd, 10000)));
+
+  // Check sizes across recovery by reopening a few times
+  for (int run = 0; run < 3; run++) {
+    Reopen(&options);
+
+    ASSERT_TRUE(Between(Size("", Key(0)), 0, 0));
+    ASSERT_TRUE(Between(Size("", Key(1)), 10000, 11000));
+    ASSERT_TRUE(Between(Size("", Key(2)), 20000, 21000));
+    ASSERT_TRUE(Between(Size("", Key(3)), 120000, 121000));
+    ASSERT_TRUE(Between(Size("", Key(4)), 130000, 131000));
+    ASSERT_TRUE(Between(Size("", Key(5)), 230000, 231000));
+    ASSERT_TRUE(Between(Size("", Key(6)), 240000, 241000));
+    ASSERT_TRUE(Between(Size("", Key(7)), 540000, 541000));
+    ASSERT_TRUE(Between(Size("", Key(8)), 550000, 551000));
+
+    ASSERT_TRUE(Between(Size(Key(3), Key(5)), 110000, 111000));
+
+    dbfull()->TEST_CompactRange(0, Key(0), Key(100));
+  }
+}
+
+TEST(DBTest, IteratorPinsRef) {
+  Put("foo", "hello");
+
+  // Get iterator that will yield the current contents of the DB.
+  Iterator* iter = db_->NewIterator(ReadOptions());
+
+  // Write to force compactions
+  Put("foo", "newvalue1");
+  for (int i = 0; i < 100; i++) {
+    ASSERT_OK(Put(Key(i), Key(i) + std::string(100000, 'v'))); // 100K values
+  }
+  Put("foo", "newvalue2");
+
+  iter->SeekToFirst();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("foo", iter->key().ToString());
+  ASSERT_EQ("hello", iter->value().ToString());
+  iter->Next();
+  ASSERT_TRUE(!iter->Valid());
+  delete iter;
+}
+
+TEST(DBTest, Snapshot) {
+  Put("foo", "v1");
+  const Snapshot* s1 = db_->GetSnapshot();
+  Put("foo", "v2");
+  const Snapshot* s2 = db_->GetSnapshot();
+  Put("foo", "v3");
+  const Snapshot* s3 = db_->GetSnapshot();
+
+  Put("foo", "v4");
+  ASSERT_EQ("v1", Get("foo", s1));
+  ASSERT_EQ("v2", Get("foo", s2));
+  ASSERT_EQ("v3", Get("foo", s3));
+  ASSERT_EQ("v4", Get("foo"));
+
+  db_->ReleaseSnapshot(s3);
+  ASSERT_EQ("v1", Get("foo", s1));
+  ASSERT_EQ("v2", Get("foo", s2));
+  ASSERT_EQ("v4", Get("foo"));
+
+  db_->ReleaseSnapshot(s1);
+  ASSERT_EQ("v2", Get("foo", s2));
+  ASSERT_EQ("v4", Get("foo"));
+
+  db_->ReleaseSnapshot(s2);
+  ASSERT_EQ("v4", Get("foo"));
+}
+
+TEST(DBTest, HiddenValuesAreRemoved) {
+  Random rnd(301);
+  std::string big = RandomString(&rnd, 50000);
+  Put("foo", big);
+  Put("pastfoo", "v");
+  const Snapshot* snapshot = db_->GetSnapshot();
+  Put("foo", "tiny");
+  Put("pastfoo2", "v2");        // Advance sequence number one more
+
+  ASSERT_OK(dbfull()->TEST_CompactMemTable());
+  ASSERT_GT(NumTableFilesAtLevel(0), 0);
+
+  ASSERT_EQ(big, Get("foo", snapshot));
+  ASSERT_TRUE(Between(Size("", "pastfoo"), 50000, 60000));
+  db_->ReleaseSnapshot(snapshot);
+  ASSERT_EQ(AllEntriesFor("foo"), "[ tiny, " + big + " ]");
+  dbfull()->TEST_CompactRange(0, "", "x");
+  ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]");
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  ASSERT_GE(NumTableFilesAtLevel(1), 1);
+  dbfull()->TEST_CompactRange(1, "", "x");
+  ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]");
+
+  ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000));
+}
+
+TEST(DBTest, DeletionMarkers1) {
+  Put("foo", "v1");
+  ASSERT_OK(dbfull()->TEST_CompactMemTable());
+  dbfull()->TEST_CompactRange(0, "", "z");
+  dbfull()->TEST_CompactRange(1, "", "z");
+  ASSERT_EQ(NumTableFilesAtLevel(2), 1);   // foo => v1 is now in level 2 file
+  Delete("foo");
+  Put("foo", "v2");
+  ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]");
+  ASSERT_OK(dbfull()->TEST_CompactMemTable());
+  ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]");
+  dbfull()->TEST_CompactRange(0, "", "z");
+  // DEL eliminated, but v1 remains because we aren't compacting that level
+  // (DEL can be eliminated because v2 hides v1).
+  ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]");
+  dbfull()->TEST_CompactRange(1, "", "z");
+  // Merging L1 w/ L2, so we are the base level for "foo", so DEL is removed.
+  // (as is v1).
+  ASSERT_EQ(AllEntriesFor("foo"), "[ v2 ]");
+}
+
+TEST(DBTest, DeletionMarkers2) {
+  Put("foo", "v1");
+  ASSERT_OK(dbfull()->TEST_CompactMemTable());
+  dbfull()->TEST_CompactRange(0, "", "z");
+  dbfull()->TEST_CompactRange(1, "", "z");
+  ASSERT_EQ(NumTableFilesAtLevel(2), 1);   // foo => v1 is now in level 2 file
+  Delete("foo");
+  ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
+  ASSERT_OK(dbfull()->TEST_CompactMemTable());
+  ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
+  dbfull()->TEST_CompactRange(0, "", "z");
+  // DEL kept: L2 file overlaps
+  ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
+  dbfull()->TEST_CompactRange(1, "", "z");
+  // Merging L1 w/ L2, so we are the base level for "foo", so DEL is removed.
+  // (as is v1).
+  ASSERT_EQ(AllEntriesFor("foo"), "[ ]");
+}
+
+TEST(DBTest, ComparatorCheck) {
+  class NewComparator : public Comparator {
+   public:
+    virtual const char* Name() const { return "leveldb.NewComparator"; }
+    virtual int Compare(const Slice& a, const Slice& b) const {
+      return BytewiseComparator()->Compare(a, b);
+    }
+    virtual void FindShortestSeparator(std::string* s, const Slice& l) const {
+      BytewiseComparator()->FindShortestSeparator(s, l);
+    }
+    virtual void FindShortSuccessor(std::string* key) const {
+      BytewiseComparator()->FindShortSuccessor(key);
+    }
+  };
+  NewComparator cmp;
+  Options new_options;
+  new_options.comparator = &cmp;
+  Status s = TryReopen(&new_options);
+  ASSERT_TRUE(!s.ok());
+  ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos)
+      << s.ToString();
+}
+
+static bool LargeValuesOK(DBTest* db,
+                          const std::set<LargeValueRef>& expected) {
+  std::set<LargeValueRef> actual = db->LargeValueFiles();
+  if (actual.size() != expected.size()) {
+    fprintf(stderr, "Sets differ in size: %d vs %d\n",
+            (int)actual.size(), (int)expected.size());
+    return false;
+  }
+  for (std::set<LargeValueRef>::const_iterator it = expected.begin();
+       it != expected.end();
+       ++it) {
+    if (actual.count(*it) != 1) {
+      fprintf(stderr, "  key '%s' not found in actual set\n",
+              LargeValueRefToFilenameString(*it).c_str());
+      return false;
+    }
+  }
+  return true;
+}
+
+TEST(DBTest, LargeValues1) {
+  Options options;
+  options.large_value_threshold = 10000;
+  Reopen(&options);
+
+  Random rnd(301);
+
+  std::string big1;
+  test::CompressibleString(&rnd, 1.0, 100000, &big1); // Not compressible
+  std::set<LargeValueRef> expected;
+
+  ASSERT_OK(Put("big1", big1));
+  expected.insert(LargeValueRef::Make(big1, kNoCompression));
+  ASSERT_TRUE(LargeValuesOK(this, expected));
+
+  ASSERT_OK(Delete("big1"));
+  ASSERT_TRUE(LargeValuesOK(this, expected));
+  ASSERT_OK(dbfull()->TEST_CompactMemTable());
+  // No handling of deletion markers on memtable compactions, so big1 remains
+  ASSERT_TRUE(LargeValuesOK(this, expected));
+
+  dbfull()->TEST_CompactRange(0, "", "z");
+  expected.erase(LargeValueRef::Make(big1, kNoCompression));
+  ASSERT_TRUE(LargeValuesOK(this, expected));
+}
+
+TEST(DBTest, LargeValues2) {
+  Options options;
+  options.large_value_threshold = 10000;
+  Reopen(&options);
+
+  Random rnd(301);
+
+  std::string big1, big2;
+  test::CompressibleString(&rnd, 1.0, 20000, &big1);  // Not compressible
+  test::CompressibleString(&rnd, 0.6, 40000, &big2);  // Compressible
+  std::set<LargeValueRef> expected;
+  ASSERT_TRUE(LargeValuesOK(this, expected));
+
+  ASSERT_OK(Put("big1", big1));
+  expected.insert(LargeValueRef::Make(big1, kNoCompression));
+  ASSERT_EQ(big1, Get("big1"));
+  ASSERT_TRUE(LargeValuesOK(this, expected));
+
+  ASSERT_OK(Put("big2", big2));
+  ASSERT_EQ(big2, Get("big2"));
+#if defined(LEVELDB_PLATFORM_POSIX) || defined(LEVELDB_PLATFORM_CHROMIUM)
+  // TODO(sanjay) Reenable after compression support is added
+  expected.insert(LargeValueRef::Make(big2, kNoCompression));
+#else
+  expected.insert(LargeValueRef::Make(big2, kLightweightCompression));
+#endif
+  ASSERT_TRUE(LargeValuesOK(this, expected));
+
+  ASSERT_OK(dbfull()->TEST_CompactMemTable());
+  ASSERT_TRUE(LargeValuesOK(this, expected));
+
+  dbfull()->TEST_CompactRange(0, "", "z");
+  ASSERT_TRUE(LargeValuesOK(this, expected));
+
+  ASSERT_OK(Put("big2", big2));
+  ASSERT_OK(Put("big2_b", big2));
+  ASSERT_EQ(big1, Get("big1"));
+  ASSERT_EQ(big2, Get("big2"));
+  ASSERT_EQ(big2, Get("big2_b"));
+  ASSERT_TRUE(LargeValuesOK(this, expected));
+
+  ASSERT_OK(Delete("big1"));
+  ASSERT_EQ("NOT_FOUND", Get("big1"));
+  ASSERT_TRUE(LargeValuesOK(this, expected));
+
+  ASSERT_OK(dbfull()->TEST_CompactMemTable());
+  ASSERT_TRUE(LargeValuesOK(this, expected));
+  dbfull()->TEST_CompactRange(0, "", "z");
+  expected.erase(LargeValueRef::Make(big1, kNoCompression));
+  ASSERT_TRUE(LargeValuesOK(this, expected));
+  dbfull()->TEST_CompactRange(1, "", "z");
+
+  ASSERT_OK(Delete("big2"));
+  ASSERT_EQ("NOT_FOUND", Get("big2"));
+  ASSERT_EQ(big2, Get("big2_b"));
+  ASSERT_OK(dbfull()->TEST_CompactMemTable());
+  ASSERT_TRUE(LargeValuesOK(this, expected));
+  dbfull()->TEST_CompactRange(0, "", "z");
+  ASSERT_TRUE(LargeValuesOK(this, expected));
+
+  // Make sure the large value refs survive a reload and compactions after
+  // the reload.
+  Reopen();
+  ASSERT_TRUE(LargeValuesOK(this, expected));
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(dbfull()->TEST_CompactMemTable());
+  dbfull()->TEST_CompactRange(0, "", "z");
+  ASSERT_TRUE(LargeValuesOK(this, expected));
+}
+
+TEST(DBTest, LargeValues3) {
+  // Make sure we don't compress values if
+  Options options;
+  options.large_value_threshold = 10000;
+  options.compression = kNoCompression;
+  Reopen(&options);
+
+  Random rnd(301);
+
+  std::string big1 = std::string(100000, 'x');       // Very compressible
+  std::set<LargeValueRef> expected;
+
+  ASSERT_OK(Put("big1", big1));
+  ASSERT_EQ(big1, Get("big1"));
+  expected.insert(LargeValueRef::Make(big1, kNoCompression));
+  ASSERT_TRUE(LargeValuesOK(this, expected));
+}
+
+
+TEST(DBTest, DBOpen_Options) {
+  std::string dbname = test::TmpDir() + "/db_options_test";
+  DestroyDB(dbname, Options());
+
+  // Does not exist, and create_if_missing == false: error
+  DB* db = NULL;
+  Options opts;
+  opts.create_if_missing = false;
+  Status s = DB::Open(opts, dbname, &db);
+  ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != NULL);
+  ASSERT_TRUE(db == NULL);
+
+  // Does not exist, and create_if_missing == true: OK
+  opts.create_if_missing = true;
+  s = DB::Open(opts, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != NULL);
+
+  delete db;
+  db = NULL;
+
+  // Does exist, and error_if_exists == true: error
+  opts.create_if_missing = false;
+  opts.error_if_exists = true;
+  s = DB::Open(opts, dbname, &db);
+  ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != NULL);
+  ASSERT_TRUE(db == NULL);
+
+  // Does exist, and error_if_exists == false: OK
+  opts.create_if_missing = true;
+  opts.error_if_exists = false;
+  s = DB::Open(opts, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != NULL);
+
+  delete db;
+  db = NULL;
+}
+
+class ModelDB: public DB {
+ public:
+  explicit ModelDB(const Options& options): options_(options) { }
+  ~ModelDB() { }
+  virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v) {
+    return DB::Put(o, k, v);
+  }
+  virtual Status Delete(const WriteOptions& o, const Slice& key) {
+    return DB::Delete(o, key);
+  }
+  virtual Status Get(const ReadOptions& options,
+                     const Slice& key, std::string* value) {
+    assert(false);      // Not implemented
+    return Status::NotFound(key);
+  }
+  virtual Iterator* NewIterator(const ReadOptions& options) {
+    if (options.snapshot == NULL) {
+      KVMap* saved = new KVMap;
+      *saved = map_;
+      return new ModelIter(saved, true);
+    } else {
+      const KVMap* snapshot_state =
+          reinterpret_cast<const KVMap*>(options.snapshot->number_);
+      return new ModelIter(snapshot_state, false);
+    }
+  }
+  virtual const Snapshot* GetSnapshot() {
+    KVMap* saved = new KVMap;
+    *saved = map_;
+    return snapshots_.New(
+        reinterpret_cast<SequenceNumber>(saved));
+  }
+
+  virtual void ReleaseSnapshot(const Snapshot* snapshot) {
+    const KVMap* saved = reinterpret_cast<const KVMap*>(snapshot->number_);
+    delete saved;
+    snapshots_.Delete(snapshot);
+  }
+  virtual Status Write(const WriteOptions& options, WriteBatch* batch) {
+    assert(options.post_write_snapshot == NULL);   // Not supported
+    for (WriteBatchInternal::Iterator it(*batch); !it.Done(); it.Next()) {
+      switch (it.op()) {
+        case kTypeValue:
+          map_[it.key().ToString()] = it.value().ToString();
+          break;
+        case kTypeLargeValueRef:
+          assert(false);        // Should not occur
+          break;
+        case kTypeDeletion:
+          map_.erase(it.key().ToString());
+          break;
+      }
+    }
+    return Status::OK();
+  }
+
+  virtual bool GetProperty(const Slice& property, uint64_t* value) {
+    return false;
+  }
+  virtual void GetApproximateSizes(const Range* r, int n, uint64_t* sizes) {
+    for (int i = 0; i < n; i++) {
+      sizes[i] = 0;
+    }
+  }
+ private:
+  typedef std::map<std::string, std::string> KVMap;
+  class ModelIter: public Iterator {
+   public:
+    ModelIter(const KVMap* map, bool owned)
+        : map_(map), owned_(owned), iter_(map_->end()) {
+    }
+    ~ModelIter() {
+      if (owned_) delete map_;
+    }
+    virtual bool Valid() const { return iter_ != map_->end(); }
+    virtual void SeekToFirst() { iter_ = map_->begin(); }
+    virtual void SeekToLast() {
+      if (map_->empty()) {
+        iter_ = map_->end();
+      } else {
+        iter_ = map_->find(map_->rbegin()->first);
+      }
+    }
+    virtual void Seek(const Slice& k) {
+      iter_ = map_->lower_bound(k.ToString());
+    }
+    virtual void Next() { ++iter_; }
+    virtual void Prev() { --iter_; }
+    virtual Slice key() const { return iter_->first; }
+    virtual Slice value() const { return iter_->second; }
+    virtual Status status() const { return Status::OK(); }
+   private:
+    const KVMap* const map_;
+    const bool owned_;  // Do we own map_
+    KVMap::const_iterator iter_;
+  };
+  const Options options_;
+  KVMap map_;
+  SnapshotList snapshots_;
+};
+
+static std::string RandomKey(Random* rnd) {
+  int len = (rnd->OneIn(3)
+             ? 1                // Short sometimes to encourage collisions
+             : (rnd->OneIn(100) ? rnd->Skewed(10) : rnd->Uniform(10)));
+  return test::RandomKey(rnd, len);
+}
+
+static bool CompareIterators(int step,
+                             DB* model,
+                             DB* db,
+                             const Snapshot* model_snap,
+                             const Snapshot* db_snap) {
+  ReadOptions options;
+  options.snapshot = model_snap;
+  Iterator* miter = model->NewIterator(options);
+  options.snapshot = db_snap;
+  Iterator* dbiter = db->NewIterator(options);
+  bool ok = true;
+  int count = 0;
+  for (miter->SeekToFirst(), dbiter->SeekToFirst();
+       ok && miter->Valid() && dbiter->Valid();
+       miter->Next(), dbiter->Next()) {
+    count++;
+    if (miter->key().compare(dbiter->key()) != 0) {
+      fprintf(stderr, "step %d: Key mismatch: '%s' vs. '%s'\n",
+              step,
+              EscapeString(miter->key()).c_str(),
+              EscapeString(dbiter->key()).c_str());
+      ok = false;
+      break;
+    }
+
+    if (miter->value().compare(dbiter->value()) != 0) {
+      fprintf(stderr, "step %d: Value mismatch for key '%s': '%s' vs. '%s'\n",
+              step,
+              EscapeString(miter->key()).c_str(),
+              EscapeString(miter->value()).c_str(),
+              EscapeString(miter->value()).c_str());
+      ok = false;
+    }
+  }
+
+  if (ok) {
+    if (miter->Valid() != dbiter->Valid()) {
+      fprintf(stderr, "step %d: Mismatch at end of iterators: %d vs. %d\n",
+              step, miter->Valid(), dbiter->Valid());
+      ok = false;
+    }
+  }
+  fprintf(stderr, "%d entries compared: ok=%d\n", count, ok);
+  delete miter;
+  delete dbiter;
+  return ok;
+}
+
+TEST(DBTest, Randomized) {
+  Random rnd(test::RandomSeed());
+  ModelDB model(last_options_);
+  const int N = 10000;
+  const Snapshot* model_snap = NULL;
+  const Snapshot* db_snap = NULL;
+  std::string k, v;
+  for (int step = 0; step < N; step++) {
+    if (step % 100 == 0) {
+      fprintf(stderr, "Step %d of %d\n", step, N);
+    }
+    int p = rnd.Uniform(100);
+    if (p < 45) {                               // Put
+      k = RandomKey(&rnd);
+      v = RandomString(&rnd,
+                       rnd.OneIn(20)
+                       ? 100 + rnd.Uniform(100)
+                       : rnd.Uniform(8));
+      ASSERT_OK(model.Put(WriteOptions(), k, v));
+      ASSERT_OK(db_->Put(WriteOptions(), k, v));
+
+    } else if (p < 90) {                        // Delete
+      k = RandomKey(&rnd);
+      ASSERT_OK(model.Delete(WriteOptions(), k));
+      ASSERT_OK(db_->Delete(WriteOptions(), k));
+
+
+    } else {                                    // Multi-element batch
+      WriteBatch b;
+      const int num = rnd.Uniform(8);
+      for (int i = 0; i < num; i++) {
+        if (i == 0 || !rnd.OneIn(10)) {
+          k = RandomKey(&rnd);
+        } else {
+          // Periodically re-use the same key from the previous iter, so
+          // we have multiple entries in the write batch for the same key
+        }
+        if (rnd.OneIn(2)) {
+          v = RandomString(&rnd, rnd.Uniform(10));
+          b.Put(k, v);
+        } else {
+          b.Delete(k);
+        }
+      }
+      ASSERT_OK(model.Write(WriteOptions(), &b));
+      ASSERT_OK(db_->Write(WriteOptions(), &b));
+    }
+
+    if ((step % 100) == 0) {
+      ASSERT_TRUE(CompareIterators(step, &model, db_, NULL, NULL));
+      ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap));
+      // Save a snapshot from each DB this time that we'll use next
+      // time we compare things, to make sure the current state is
+      // preserved with the snapshot
+      if (model_snap != NULL) model.ReleaseSnapshot(model_snap);
+      if (db_snap != NULL) db_->ReleaseSnapshot(db_snap);
+
+      Reopen();
+      ASSERT_TRUE(CompareIterators(step, &model, db_, NULL, NULL));
+
+      model_snap = model.GetSnapshot();
+      db_snap = db_->GetSnapshot();
+    }
+  }
+  if (model_snap != NULL) model.ReleaseSnapshot(model_snap);
+  if (db_snap != NULL) db_->ReleaseSnapshot(db_snap);
+}
+
+}
+
+int main(int argc, char** argv) {
+  return leveldb::test::RunAllTests();
+}
--- a/db/dbformat.cc
+++ b/db/dbformat.cc
@@ -0,0 +1,152 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <stdio.h>
+#include "db/dbformat.h"
+#include "port/port.h"
+#include "util/coding.h"
+
+namespace leveldb {
+
+static uint64_t PackSequenceAndType(uint64_t seq, ValueType t) {
+  assert(seq <= kMaxSequenceNumber);
+  assert(t <= kValueTypeForSeek);
+  return (seq << 8) | t;
+}
+
+void AppendInternalKey(std::string* result, const ParsedInternalKey& key) {
+  result->append(key.user_key.data(), key.user_key.size());
+  PutFixed64(result, PackSequenceAndType(key.sequence, key.type));
+}
+
+std::string ParsedInternalKey::DebugString() const {
+  char buf[50];
+  snprintf(buf, sizeof(buf), "' @ %llu : %d",
+           (unsigned long long) sequence,
+           int(type));
+  std::string result = "'";
+  result += user_key.ToString();
+  result += buf;
+  return result;
+}
+
+const char* InternalKeyComparator::Name() const {
+  return "leveldb.InternalKeyComparator";
+}
+
+int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const {
+  // Order by:
+  //    increasing user key (according to user-supplied comparator)
+  //    decreasing sequence number
+  //    decreasing type (though sequence# should be enough to disambiguate)
+  int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
+  if (r == 0) {
+    const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8);
+    const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8);
+    if (anum > bnum) {
+      r = -1;
+    } else if (anum < bnum) {
+      r = +1;
+    }
+  }
+  return r;
+}
+
+void InternalKeyComparator::FindShortestSeparator(
+      std::string* start,
+      const Slice& limit) const {
+  // Attempt to shorten the user portion of the key
+  Slice user_start = ExtractUserKey(*start);
+  Slice user_limit = ExtractUserKey(limit);
+  std::string tmp(user_start.data(), user_start.size());
+  user_comparator_->FindShortestSeparator(&tmp, user_limit);
+  if (user_comparator_->Compare(*start, tmp) < 0) {
+    // User key has become larger.  Tack on the earliest possible
+    // number to the shortened user key.
+    PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek));
+    assert(this->Compare(*start, tmp) < 0);
+    assert(this->Compare(tmp, limit) < 0);
+    start->swap(tmp);
+  }
+}
+
+void InternalKeyComparator::FindShortSuccessor(std::string* key) const {
+  Slice user_key = ExtractUserKey(*key);
+  std::string tmp(user_key.data(), user_key.size());
+  user_comparator_->FindShortSuccessor(&tmp);
+  if (user_comparator_->Compare(user_key, tmp) < 0) {
+    // User key has become larger.  Tack on the earliest possible
+    // number to the shortened user key.
+    PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek));
+    assert(this->Compare(*key, tmp) < 0);
+    key->swap(tmp);
+  }
+}
+
+LargeValueRef LargeValueRef::Make(const Slice& value, CompressionType ctype) {
+  LargeValueRef result;
+  port::SHA1_Hash(value.data(), value.size(), &result.data[0]);
+  EncodeFixed64(&result.data[20], value.size());
+  result.data[28] = static_cast<unsigned char>(ctype);
+  return result;
+}
+
+std::string LargeValueRefToFilenameString(const LargeValueRef& h) {
+  assert(sizeof(h.data) == LargeValueRef::ByteSize());
+  assert(sizeof(h.data) == 29); // So we can hardcode the array size of buf
+  static const char tohex[] = "0123456789abcdef";
+  char buf[20*2];
+  for (int i = 0; i < 20; i++) {
+    buf[2*i] = tohex[(h.data[i] >> 4) & 0xf];
+    buf[2*i+1] = tohex[h.data[i] & 0xf];
+  }
+  std::string result = std::string(buf, sizeof(buf));
+  result += "-";
+  result += NumberToString(h.ValueSize());
+  result += "-";
+  result += NumberToString(static_cast<uint64_t>(h.compression_type()));
+  return result;
+}
+
+static uint32_t hexvalue(char c) {
+  if (c >= '0' && c <= '9') {
+    return c - '0';
+  } else if (c >= 'A' && c <= 'F') {
+    return 10 + c - 'A';
+  } else {
+    assert(c >= 'a' && c <= 'f');
+    return 10 + c - 'a';
+  }
+}
+
+bool FilenameStringToLargeValueRef(const Slice& s, LargeValueRef* h) {
+  Slice in = s;
+  if (in.size() < 40) {
+    return false;
+  }
+  for (int i = 0; i < 20; i++) {
+    if (!isxdigit(in[i*2]) || !isxdigit(in[i*2+1])) {
+      return false;
+    }
+    unsigned char c = (hexvalue(in[i*2])<<4) | hexvalue(in[i*2+1]);
+    h->data[i] = c;
+  }
+  in.remove_prefix(40);
+  uint64_t value_size, ctype;
+
+  if (ConsumeChar(&in, '-') &&
+      ConsumeDecimalNumber(&in, &value_size) &&
+      ConsumeChar(&in, '-') &&
+      ConsumeDecimalNumber(&in, &ctype) &&
+      in.empty() &&
+      (ctype <= kLightweightCompression)) {
+    EncodeFixed64(&h->data[20], value_size);
+    h->data[28] = static_cast<unsigned char>(ctype);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+}
--- a/db/dbformat.h
+++ b/db/dbformat.h
@@ -0,0 +1,198 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_FORMAT_H_
+#define STORAGE_LEVELDB_DB_FORMAT_H_
+
+#include <stdio.h>
+#include "include/comparator.h"
+#include "include/db.h"
+#include "include/slice.h"
+#include "include/table_builder.h"
+#include "util/coding.h"
+#include "util/logging.h"
+
+namespace leveldb {
+
+class InternalKey;
+
+// Value types encoded as the last component of internal keys.
+// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
+// data structures.
+enum ValueType {
+  kTypeDeletion = 0x0,
+  kTypeValue = 0x1,
+  kTypeLargeValueRef = 0x2,
+};
+// kValueTypeForSeek defines the ValueType that should be passed when
+// constructing a ParsedInternalKey object for seeking to a particular
+// sequence number (since we sort sequence numbers in decreasing order
+// and the value type is embedded as the low 8 bits in the sequence
+// number in internal keys, we need to use the highest-numbered
+// ValueType, not the lowest).
+static const ValueType kValueTypeForSeek = kTypeLargeValueRef;
+
+typedef uint64_t SequenceNumber;
+
+// We leave eight bits empty at the bottom so a type and sequence#
+// can be packed together into 64-bits.
+static const SequenceNumber kMaxSequenceNumber =
+    ((0x1ull << 56) - 1);
+
+struct ParsedInternalKey {
+  Slice user_key;
+  SequenceNumber sequence;
+  ValueType type;
+
+  ParsedInternalKey() { }  // Intentionally left uninitialized (for speed)
+  ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t)
+      : user_key(u), sequence(seq), type(t) { }
+  std::string DebugString() const;
+};
+
+// Return the length of the encoding of "key".
+inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
+  return key.user_key.size() + 8;
+}
+
+// Append the serialization of "key" to *result.
+extern void AppendInternalKey(std::string* result,
+                              const ParsedInternalKey& key);
+
+// Attempt to parse an internal key from "internal_key".  On success,
+// stores the parsed data in "*result", and returns true.
+//
+// On error, returns false, leaves "*result" in an undefined state.
+extern bool ParseInternalKey(const Slice& internal_key,
+                             ParsedInternalKey* result);
+
+// Returns the user key portion of an internal key.
+inline Slice ExtractUserKey(const Slice& internal_key) {
+  assert(internal_key.size() >= 8);
+  return Slice(internal_key.data(), internal_key.size() - 8);
+}
+
+inline ValueType ExtractValueType(const Slice& internal_key) {
+  assert(internal_key.size() >= 8);
+  const size_t n = internal_key.size();
+  uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
+  unsigned char c = num & 0xff;
+  return static_cast<ValueType>(c);
+}
+
+// A comparator for internal keys that uses a specified comparator for
+// the user key portion and breaks ties by decreasing sequence number.
+class InternalKeyComparator : public Comparator {
+ private:
+  const Comparator* user_comparator_;
+ public:
+  explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c) { }
+  virtual const char* Name() const;
+  virtual int Compare(const Slice& a, const Slice& b) const;
+  virtual void FindShortestSeparator(
+      std::string* start,
+      const Slice& limit) const;
+  virtual void FindShortSuccessor(std::string* key) const;
+
+  const Comparator* user_comparator() const { return user_comparator_; }
+
+  int Compare(const InternalKey& a, const InternalKey& b) const;
+};
+
+// Modules in this directory should keep internal keys wrapped inside
+// the following class instead of plain strings so that we do not
+// incorrectly use string comparisons instead of an InternalKeyComparator.
+class InternalKey {
+ private:
+  std::string rep_;
+ public:
+  InternalKey() { }   // Leave rep_ as empty to indicate it is invalid
+  InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) {
+    AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t));
+  }
+
+  void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); }
+  Slice Encode() const {
+    assert(!rep_.empty());
+    return rep_;
+  }
+
+  Slice user_key() const { return ExtractUserKey(rep_); }
+
+  void SetFrom(const ParsedInternalKey& p) {
+    rep_.clear();
+    AppendInternalKey(&rep_, p);
+  }
+
+  void Clear() { rep_.clear(); }
+};
+
+inline int InternalKeyComparator::Compare(
+    const InternalKey& a, const InternalKey& b) const {
+  return Compare(a.Encode(), b.Encode());
+}
+
+// LargeValueRef is a 160-bit hash value (20 bytes), plus an 8 byte
+// uncompressed size, and a 1 byte CompressionType code.  An
+// encoded form of it is embedded in the filenames of large value
+// files stored in the database, and the raw binary form is stored as
+// the iter->value() result for values of type kTypeLargeValueRef in
+// the table and log files that make up the database.
+struct LargeValueRef {
+  char data[29];
+
+  // Initialize a large value ref for the given data
+  static LargeValueRef Make(const Slice& data,
+                            CompressionType compression_type);
+
+  // Initialize a large value ref from a serialized, 29-byte reference value
+  static LargeValueRef FromRef(const Slice& ref) {
+    LargeValueRef result;
+    assert(ref.size() == sizeof(result.data));
+    memcpy(result.data, ref.data(), sizeof(result.data));
+    return result;
+  }
+
+  // Return the number of bytes in a LargeValueRef (not the
+  // number of bytes in the value referenced).
+  static size_t ByteSize() { return sizeof(LargeValueRef().data); }
+
+  // Return the number of bytes in the value referenced by "*this".
+  uint64_t ValueSize() const { return DecodeFixed64(&data[20]); }
+
+  CompressionType compression_type() const {
+    return static_cast<CompressionType>(data[28]);
+  }
+
+  bool operator==(const LargeValueRef& b) const {
+    return memcmp(data, b.data, sizeof(data)) == 0;
+  }
+  bool operator<(const LargeValueRef& b) const {
+    return memcmp(data, b.data, sizeof(data)) < 0;
+  }
+};
+
+// Convert the large value ref to a human-readable string suitable
+// for embedding in a large value filename.
+extern std::string LargeValueRefToFilenameString(const LargeValueRef& h);
+
+// Parse the large value filename string in "input" and store it in
+// "*h".  If successful, returns true.  Otherwise returns false.
+extern bool FilenameStringToLargeValueRef(const Slice& in, LargeValueRef* ref);
+
+inline bool ParseInternalKey(const Slice& internal_key,
+                             ParsedInternalKey* result) {
+  const size_t n = internal_key.size();
+  if (n < 8) return false;
+  uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
+  unsigned char c = num & 0xff;
+  result->sequence = num >> 8;
+  result->type = static_cast<ValueType>(c);
+  result->user_key = Slice(internal_key.data(), n - 8);
+  return (c <= static_cast<unsigned char>(kTypeLargeValueRef));
+}
+
+}
+
+#endif  // STORAGE_LEVELDB_DB_FORMAT_H_
--- a/db/dbformat_test.cc
+++ b/db/dbformat_test.cc
@@ -0,0 +1,127 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/dbformat.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+static std::string IKey(const std::string& user_key,
+                        uint64_t seq,
+                        ValueType vt) {
+  std::string encoded;
+  AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt));
+  return encoded;
+}
+
+static std::string Shorten(const std::string& s, const std::string& l) {
+  std::string result = s;
+  InternalKeyComparator(BytewiseComparator()).FindShortestSeparator(&result, l);
+  return result;
+}
+
+static std::string ShortSuccessor(const std::string& s) {
+  std::string result = s;
+  InternalKeyComparator(BytewiseComparator()).FindShortSuccessor(&result);
+  return result;
+}
+
+static void TestKey(const std::string& key,
+                    uint64_t seq,
+                    ValueType vt) {
+  std::string encoded = IKey(key, seq, vt);
+
+  Slice in(encoded);
+  ParsedInternalKey decoded("", 0, kTypeValue);
+
+  ASSERT_TRUE(ParseInternalKey(in, &decoded));
+  ASSERT_EQ(key, decoded.user_key.ToString());
+  ASSERT_EQ(seq, decoded.sequence);
+  ASSERT_EQ(vt, decoded.type);
+
+  ASSERT_TRUE(!ParseInternalKey(Slice("bar"), &decoded));
+}
+
+class FormatTest { };
+
+TEST(FormatTest, InternalKey_EncodeDecode) {
+  const char* keys[] = { "", "k", "hello", "longggggggggggggggggggggg" };
+  const uint64_t seq[] = {
+    1, 2, 3,
+    (1ull << 8) - 1, 1ull << 8, (1ull << 8) + 1,
+    (1ull << 16) - 1, 1ull << 16, (1ull << 16) + 1,
+    (1ull << 32) - 1, 1ull << 32, (1ull << 32) + 1
+  };
+  for (int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) {
+    for (int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) {
+      TestKey(keys[k], seq[s], kTypeValue);
+      TestKey("hello", 1, kTypeDeletion);
+    }
+  }
+}
+
+TEST(FormatTest, InternalKeyShortSeparator) {
+  // When user keys are same
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("foo", 99, kTypeValue)));
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("foo", 101, kTypeValue)));
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("foo", 100, kTypeValue)));
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("foo", 100, kTypeDeletion)));
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("foo", 100, kTypeLargeValueRef)));
+
+  // When user keys are misordered
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("bar", 99, kTypeValue)));
+
+  // When user keys are different, but correctly ordered
+  ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("hello", 200, kTypeValue)));
+
+  // When start user key is prefix of limit user key
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("foobar", 200, kTypeValue)));
+
+  // When limit user key is prefix of start user key
+  ASSERT_EQ(IKey("foobar", 100, kTypeValue),
+            Shorten(IKey("foobar", 100, kTypeValue),
+                    IKey("foo", 200, kTypeValue)));
+}
+
+TEST(FormatTest, InternalKeyShortestSuccessor) {
+  ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
+            ShortSuccessor(IKey("foo", 100, kTypeValue)));
+  ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue),
+            ShortSuccessor(IKey("\xff\xff", 100, kTypeValue)));
+}
+
+TEST(FormatTest, SHA1) {
+  // Check that we are computing the same value as sha1.
+  // Note that the last two numbers are the length of the input and the
+  // compression type.
+  ASSERT_EQ("aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d-5-0",  // SHA1, uncompr
+            LargeValueRefToFilenameString(
+                LargeValueRef::Make("hello", kNoCompression)));
+  ASSERT_EQ("aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d-5-1",  // SHA1, lwcompr
+            LargeValueRefToFilenameString(
+                LargeValueRef::Make("hello", kLightweightCompression)));
+}
+
+}
+
+int main(int argc, char** argv) {
+  return leveldb::test::RunAllTests();
+}
--- a/db/filename.cc
+++ b/db/filename.cc
@@ -0,0 +1,154 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <ctype.h>
+#include <stdio.h>
+#include "db/filename.h"
+#include "db/dbformat.h"
+#include "include/env.h"
+#include "util/logging.h"
+
+namespace leveldb {
+
+static std::string MakeFileName(const std::string& name, uint64_t number,
+                                const char* suffix) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "/%06llu.%s",
+           static_cast<unsigned long long>(number),
+           suffix);
+  return name + buf;
+}
+
+std::string LogFileName(const std::string& name, uint64_t number) {
+  assert(number > 0);
+  return MakeFileName(name, number, "log");
+}
+
+std::string TableFileName(const std::string& name, uint64_t number) {
+  assert(number > 0);
+  return MakeFileName(name, number, "sst");
+}
+
+std::string LargeValueFileName(const std::string& name,
+                               const LargeValueRef& large_ref) {
+  std::string result = name + "/";
+  result += LargeValueRefToFilenameString(large_ref);
+  result += ".val";
+  return result;
+}
+
+std::string DescriptorFileName(const std::string& dbname, uint64_t number) {
+  assert(number > 0);
+  char buf[100];
+  snprintf(buf, sizeof(buf), "/MANIFEST-%06llu",
+           static_cast<unsigned long long>(number));
+  return dbname + buf;
+}
+
+std::string CurrentFileName(const std::string& dbname) {
+  return dbname + "/CURRENT";
+}
+
+std::string LockFileName(const std::string& dbname) {
+  return dbname + "/LOCK";
+}
+
+std::string TempFileName(const std::string& dbname, uint64_t number) {
+  assert(number > 0);
+  return MakeFileName(dbname, number, "dbtmp");
+}
+
+std::string InfoLogFileName(const std::string& dbname) {
+  return dbname + "/LOG";
+}
+
+// Return the name of the old info log file for "dbname".
+std::string OldInfoLogFileName(const std::string& dbname) {
+  return dbname + "/LOG.old";
+}
+
+
+// Owned filenames have the form:
+//    dbname/CURRENT
+//    dbname/LOCK
+//    dbname/LOG
+//    dbname/LOG.old
+//    dbname/MANIFEST-[0-9]+
+//    dbname/[0-9a-f]{20}-[0-9]+-[0-9]+.val
+//    dbname/[0-9]+.(log|sst)
+bool ParseFileName(const std::string& fname,
+                   uint64_t* number,
+                   LargeValueRef* large_ref,
+                   FileType* type) {
+  Slice rest(fname);
+  if (rest == "CURRENT") {
+    *number = 0;
+    *type = kCurrentFile;
+  } else if (rest == "LOCK") {
+    *number = 0;
+    *type = kDBLockFile;
+  } else if (rest == "LOG" || rest == "LOG.old") {
+    *number = 0;
+    *type = kInfoLogFile;
+  } else if (rest.size() >= 4 &&
+      Slice(rest.data() + rest.size() - 4, 4) == ".val") {
+    LargeValueRef h;
+    if (!FilenameStringToLargeValueRef(Slice(rest.data(), rest.size() - 4),
+                                       &h)) {
+      return false;
+    }
+    *large_ref = h;
+    *type = kLargeValueFile;
+  } else if (rest.starts_with("MANIFEST-")) {
+    rest.remove_prefix(strlen("MANIFEST-"));
+    uint64_t num;
+    if (!ConsumeDecimalNumber(&rest, &num)) {
+      return false;
+    }
+    if (!rest.empty()) {
+      return false;
+    }
+    *type = kDescriptorFile;
+    *number = num;
+  } else {
+    // Avoid strtoull() to keep filename format independent of the
+    // current locale
+    uint64_t num;
+    if (!ConsumeDecimalNumber(&rest, &num)) {
+      return false;
+    }
+    Slice suffix = rest;
+    if (suffix == Slice(".log")) {
+      *type = kLogFile;
+    } else if (suffix == Slice(".sst")) {
+      *type = kTableFile;
+    } else if (suffix == Slice(".dbtmp")) {
+      *type = kTempFile;
+    } else {
+      return false;
+    }
+    *number = num;
+  }
+  return true;
+}
+
+Status SetCurrentFile(Env* env, const std::string& dbname,
+                      uint64_t descriptor_number) {
+  // Remove leading "dbname/" and add newline to manifest file name
+  std::string manifest = DescriptorFileName(dbname, descriptor_number);
+  Slice contents = manifest;
+  assert(contents.starts_with(dbname + "/"));
+  contents.remove_prefix(dbname.size() + 1);
+  std::string tmp = TempFileName(dbname, descriptor_number);
+  Status s = WriteStringToFile(env, contents.ToString() + "\n", tmp);
+  if (s.ok()) {
+    s = env->RenameFile(tmp, CurrentFileName(dbname));
+  }
+  if (!s.ok()) {
+    env->DeleteFile(tmp);
+  }
+  return s;
+}
+
+}
--- a/db/filename.h
+++ b/db/filename.h
@@ -0,0 +1,92 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// File names used by DB code
+
+#ifndef STORAGE_LEVELDB_DB_FILENAME_H_
+#define STORAGE_LEVELDB_DB_FILENAME_H_
+
+#include <stdint.h>
+#include <string>
+#include "include/slice.h"
+#include "include/status.h"
+#include "port/port.h"
+
+namespace leveldb {
+
+class Env;
+struct LargeValueRef;
+
+enum FileType {
+  kLogFile,
+  kDBLockFile,
+  kTableFile,
+  kLargeValueFile,
+  kDescriptorFile,
+  kCurrentFile,
+  kTempFile,
+  kInfoLogFile,  // Either the current one, or an old one
+};
+
+// Return the name of the log file with the specified number
+// in the db named by "dbname".  The result will be prefixed with
+// "dbname".
+extern std::string LogFileName(const std::string& dbname, uint64_t number);
+
+// Return the name of the sstable with the specified number
+// in the db named by "dbname".  The result will be prefixed with
+// "dbname".
+extern std::string TableFileName(const std::string& dbname, uint64_t number);
+
+// Return the name of the large value file with the specified large
+// value reference in the db named by "dbname".  The result will be
+// prefixed with "dbname".
+extern std::string LargeValueFileName(const std::string& dbname,
+                                      const LargeValueRef& large_ref);
+
+// Return the name of the descriptor file for the db named by
+// "dbname" and the specified incarnation number.  The result will be
+// prefixed with "dbname".
+extern std::string DescriptorFileName(const std::string& dbname,
+                                      uint64_t number);
+
+// Return the name of the current file.  This file contains the name
+// of the current manifest file.  The result will be prefixed with
+// "dbname".
+extern std::string CurrentFileName(const std::string& dbname);
+
+// Return the name of the lock file for the db named by
+// "dbname".  The result will be prefixed with "dbname".
+extern std::string LockFileName(const std::string& dbname);
+
+// Return the name of a temporary file owned by the db named "dbname".
+// The result will be prefixed with "dbname".
+extern std::string TempFileName(const std::string& dbname, uint64_t number);
+
+// Return the name of the info log file for "dbname".
+extern std::string InfoLogFileName(const std::string& dbname);
+
+// Return the name of the old info log file for "dbname".
+extern std::string OldInfoLogFileName(const std::string& dbname);
+
+// If filename is a leveldb file, store the type of the file in *type.
+// If *type is kLargeValueFile, then the large value reference data
+// from the filename is stored in "*large_ref.  For all other types of
+// files, the number encoded in the filename is stored in *number.  If
+// the filename was successfully parsed, returns true.  Else return
+// false.
+extern bool ParseFileName(const std::string& filename,
+                          uint64_t* number,
+                          LargeValueRef* large_ref,
+                          FileType* type);
+
+// Make the CURRENT file point to the descriptor file with the
+// specified number.
+extern Status SetCurrentFile(Env* env, const std::string& dbname,
+                             uint64_t descriptor_number);
+
+
+}
+
+#endif  // STORAGE_LEVELDB_DB_FILENAME_H_
--- a/db/filename_test.cc
+++ b/db/filename_test.cc
@@ -0,0 +1,156 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/filename.h"
+
+#include "db/dbformat.h"
+#include "port/port.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+class FileNameTest { };
+
+TEST(FileNameTest, Parse) {
+  Slice db;
+  FileType type;
+  uint64_t number;
+  LargeValueRef large_ref;
+
+  // Successful parses
+  static struct {
+    const char* fname;
+    uint64_t number;
+    const char* large_ref;
+    FileType type;
+  } cases[] = {
+    { "100.log",            100,   "",         kLogFile },
+    { "0.log",              0,     "",         kLogFile },
+    { "0.sst",              0,     "",         kTableFile },
+    { "CURRENT",            0,     "",         kCurrentFile },
+    { "LOCK",               0,     "",         kDBLockFile },
+    { "MANIFEST-2",         2,     "",         kDescriptorFile },
+    { "MANIFEST-7",         7,     "",         kDescriptorFile },
+    { "LOG",                0,     "",         kInfoLogFile },
+    { "LOG.old",            0,     "",         kInfoLogFile },
+    { "18446744073709551615.log", 18446744073709551615ull, "",
+      kLogFile },
+    { "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-1234-0.val", 0,
+      "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-1234-0", kLargeValueFile },
+    { "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-10000000000-0.val", 0,
+      "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-10000000000-0",
+      kLargeValueFile },
+  };
+  for (int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
+    std::string f = cases[i].fname;
+    ASSERT_TRUE(ParseFileName(f, &number, &large_ref, &type)) << f;
+    ASSERT_EQ(cases[i].type, type) << f;
+    if (type == kLargeValueFile) {
+      ASSERT_EQ(cases[i].large_ref, LargeValueRefToFilenameString(large_ref))
+          << f;
+    } else {
+      ASSERT_EQ(cases[i].number, number) << f;
+    }
+  }
+
+  // Errors
+  static const char* errors[] = {
+    "",
+    "foo",
+    "foo-dx-100.log",
+    ".log",
+    "",
+    "manifest",
+    "CURREN",
+    "CURRENTX",
+    "MANIFES",
+    "MANIFEST",
+    "MANIFEST-",
+    "XMANIFEST-3",
+    "MANIFEST-3x",
+    "LOC",
+    "LOCKx",
+    "LO",
+    "LOGx",
+    "18446744073709551616.log",
+    "184467440737095516150.log",
+    "100",
+    "100.",
+    "100.lop",
+    "100.val",
+    ".val",
+    "123456789012345678901234567890123456789-12340.val",
+    "1234567890123456789012345678901234567-123-0.val",
+    "12345678901234567890123456789012345678902-100-1-.val",
+    // Overflow on value size
+    "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-100000000000000000000-1.val",
+    // '03.val' is a bad compression type
+    "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-100000-3.val" };
+  for (int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) {
+    std::string f = errors[i];
+    ASSERT_TRUE(!ParseFileName(f, &number, &large_ref, &type)) << f;
+  };
+}
+
+TEST(FileNameTest, Construction) {
+  uint64_t number;
+  FileType type;
+  LargeValueRef large_ref;
+  std::string fname;
+
+  fname = CurrentFileName("foo");
+  ASSERT_EQ("foo/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
+  ASSERT_EQ(0, number);
+  ASSERT_EQ(kCurrentFile, type);
+
+  fname = LockFileName("foo");
+  ASSERT_EQ("foo/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
+  ASSERT_EQ(0, number);
+  ASSERT_EQ(kDBLockFile, type);
+
+  fname = LogFileName("foo", 192);
+  ASSERT_EQ("foo/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
+  ASSERT_EQ(192, number);
+  ASSERT_EQ(kLogFile, type);
+
+  fname = TableFileName("bar", 200);
+  ASSERT_EQ("bar/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
+  ASSERT_EQ(200, number);
+  ASSERT_EQ(kTableFile, type);
+
+  fname = DescriptorFileName("bar", 100);
+  ASSERT_EQ("bar/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
+  ASSERT_EQ(100, number);
+  ASSERT_EQ(kDescriptorFile, type);
+
+  fname = TempFileName("tmp", 999);
+  ASSERT_EQ("tmp/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
+  ASSERT_EQ(999, number);
+  ASSERT_EQ(kTempFile, type);
+
+  for (int i = 0; i <= kLightweightCompression; i++) {
+    CompressionType ctype = static_cast<CompressionType>(i);
+    std::string value = "abcdef";
+    LargeValueRef real_large_ref = LargeValueRef::Make(Slice(value), ctype);
+    fname = LargeValueFileName("tmp", real_large_ref);
+    ASSERT_EQ("tmp/", std::string(fname.data(), 4));
+    ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
+    ASSERT_TRUE(real_large_ref == large_ref);
+    ASSERT_EQ(kLargeValueFile, type);
+    ASSERT_EQ(large_ref.compression_type(), ctype);
+  }
+}
+
+}
+
+int main(int argc, char** argv) {
+  return leveldb::test::RunAllTests();
+}
--- a/db/log_format.h
+++ b/db/log_format.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Log format information shared by reader and writer.
+// See ../doc/log_format.txt for more detail.
+
+#ifndef STORAGE_LEVELDB_DB_LOG_FORMAT_H_
+#define STORAGE_LEVELDB_DB_LOG_FORMAT_H_
+
+namespace leveldb {
+namespace log {
+
+enum RecordType {
+  // Zero is reserved for preallocated files
+  kZeroType = 0,
+
+  kFullType = 1,
+
+  // For fragments
+  kFirstType = 2,
+  kMiddleType = 3,
+  kLastType = 4,
+};
+static const int kMaxRecordType = kLastType;
+
+static const int kBlockSize = 32768;
+
+// Header is checksum (4 bytes), type (1 byte), length (2 bytes).
+static const int kHeaderSize = 4 + 1 + 2;
+
+}
+}
+
+#endif  // STORAGE_LEVELDB_DB_LOG_FORMAT_H_
--- a/db/log_reader.cc
+++ b/db/log_reader.cc
@@ -0,0 +1,172 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_reader.h"
+
+#include <stdint.h>
+#include "include/env.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+
+namespace leveldb {
+namespace log {
+
+Reader::Reporter::~Reporter() {
+}
+
+Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum)
+    : file_(file),
+      reporter_(reporter),
+      checksum_(checksum),
+      backing_store_(new char[kBlockSize]),
+      buffer_(),
+      eof_(false) {
+}
+
+Reader::~Reader() {
+  delete[] backing_store_;
+}
+
+bool Reader::ReadRecord(Slice* record, std::string* scratch) {
+  scratch->clear();
+  record->clear();
+  bool in_fragmented_record = false;
+
+  Slice fragment;
+  while (true) {
+    switch (ReadPhysicalRecord(&fragment)) {
+      case kFullType:
+        if (in_fragmented_record) {
+          ReportDrop(scratch->size(), "partial record without end");
+        }
+        scratch->clear();
+        *record = fragment;
+        return true;
+
+      case kFirstType:
+        if (in_fragmented_record) {
+          ReportDrop(scratch->size(), "partial record without end");
+        }
+        scratch->assign(fragment.data(), fragment.size());
+        in_fragmented_record = true;
+        break;
+
+      case kMiddleType:
+        if (!in_fragmented_record) {
+          ReportDrop(fragment.size(), "missing start of fragmented record");
+        } else {
+          scratch->append(fragment.data(), fragment.size());
+        }
+        break;
+
+      case kLastType:
+        if (!in_fragmented_record) {
+          ReportDrop(fragment.size(), "missing start of fragmented record");
+        } else {
+          scratch->append(fragment.data(), fragment.size());
+          *record = Slice(*scratch);
+          return true;
+        }
+        break;
+
+      case kEof:
+        if (in_fragmented_record) {
+          ReportDrop(scratch->size(), "partial record without end");
+          scratch->clear();
+        }
+        return false;
+
+      case kBadRecord:
+        if (in_fragmented_record) {
+          ReportDrop(scratch->size(), "error in middle of record");
+          in_fragmented_record = false;
+          scratch->clear();
+        }
+        break;
+
+      default:
+        ReportDrop(
+            (fragment.size() + (in_fragmented_record ? scratch->size() : 0)),
+            "unknown record type");
+        in_fragmented_record = false;
+        scratch->clear();
+        break;
+    }
+  }
+  return false;
+}
+
+void Reader::ReportDrop(size_t bytes, const char* reason) {
+  if (reporter_ != NULL) {
+    reporter_->Corruption(bytes, Status::Corruption(reason));
+  }
+}
+
+unsigned int Reader::ReadPhysicalRecord(Slice* result) {
+  while (true) {
+    if (buffer_.size() <= kHeaderSize) {
+      if (!eof_) {
+        // Last read was a full read, so this is a trailer to skip
+        buffer_.clear();
+        Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
+        if (!status.ok()) {
+          if (reporter_ != NULL) {
+            reporter_->Corruption(kBlockSize, status);
+          }
+          buffer_.clear();
+          eof_ = true;
+          return kEof;
+        } else if (buffer_.size() < kBlockSize) {
+          eof_ = true;
+        }
+        continue;
+      } else if (buffer_.size() == 0) {
+        // End of file
+        return kEof;
+      } else if (buffer_.size() < kHeaderSize) {
+        ReportDrop(buffer_.size(), "truncated record at end of file");
+        buffer_.clear();
+        return kEof;
+      } else {
+        // We have a trailing zero-length record.  Fall through and check it.
+      }
+    }
+
+    // Parse the header
+    const char* header = buffer_.data();
+    const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
+    const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
+    const unsigned int type = header[6];
+    const uint32_t length = a | (b << 8);
+    if (kHeaderSize + length > buffer_.size()) {
+      ReportDrop(buffer_.size(), "bad record length");
+      buffer_.clear();
+      return kBadRecord;
+    }
+
+    // Check crc
+    if (checksum_) {
+      if (type == kZeroType && length == 0) {
+        // Skip zero length record
+        buffer_.remove_prefix(kHeaderSize + length);
+        return kBadRecord;
+      }
+
+      uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
+      uint32_t actual_crc = crc32c::Value(header + 6, 1 + length);
+      if (actual_crc != expected_crc) {
+        ReportDrop(length, "checksum mismatch");
+        buffer_.remove_prefix(kHeaderSize + length);
+        return kBadRecord;
+      }
+    }
+
+    buffer_.remove_prefix(kHeaderSize + length);
+    *result = Slice(header + kHeaderSize, length);
+    return type;
+  }
+}
+
+}
+}
--- a/db/log_reader.h
+++ b/db/log_reader.h
@@ -0,0 +1,75 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_LOG_READER_H_
+#define STORAGE_LEVELDB_DB_LOG_READER_H_
+
+#include "db/log_format.h"
+#include "include/slice.h"
+#include "include/status.h"
+
+namespace leveldb {
+
+class SequentialFile;
+
+namespace log {
+
+class Reader {
+ public:
+  // Interface for reporting errors.
+  class Reporter {
+   public:
+    virtual ~Reporter();
+
+    // Some corruption was detected.  "size" is the approximate number
+    // of bytes dropped due to the corruption.
+    virtual void Corruption(size_t bytes, const Status& status) = 0;
+  };
+
+  // Create a reader that will return log records from "*file".
+  // "*file" must remain live while this Reader is in use.
+  //
+  // If "reporter" is non-NULL, it is notified whenever some data is
+  // dropped due to a detected corruption.  "*reporter" must remain
+  // live while this Reader is in use.
+  //
+  // If "checksum" is true, verify checksums if available.
+  Reader(SequentialFile* file, Reporter* reporter, bool checksum);
+
+  ~Reader();
+
+  // Read the next record into *record.  Returns true if read
+  // successfully, false if we hit end of the input.  May use
+  // "*scratch" as temporary storage.  The contents filled in *record
+  // will only be valid until the next mutating operation on this
+  // reader or the next mutation to *scratch.
+  bool ReadRecord(Slice* record, std::string* scratch);
+
+ private:
+  SequentialFile* const file_;
+  Reporter* const reporter_;
+  bool const checksum_;
+  char* const backing_store_;
+  Slice buffer_;
+  bool eof_;   // Last Read() indicated EOF by returning < kBlockSize
+
+  // Extend record types with the following special values
+  enum {
+    kEof = kMaxRecordType + 1,
+    kBadRecord = kMaxRecordType + 2
+  };
+
+  // Return type, or one of the preceding special values
+  unsigned int ReadPhysicalRecord(Slice* result);
+  void ReportDrop(size_t bytes, const char* reason);
+
+  // No copying allowed
+  Reader(const Reader&);
+  void operator=(const Reader&);
+};
+
+}
+}
+
+#endif  // STORAGE_LEVELDB_DB_LOG_READER_H_
--- a/db/log_test.cc
+++ b/db/log_test.cc
@@ -0,0 +1,361 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "include/env.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/random.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+namespace log {
+
+// Construct a string of the specified length made out of the supplied
+// partial string.
+static std::string BigString(const std::string& partial_string, size_t n) {
+  std::string result;
+  while (result.size() < n) {
+    result.append(partial_string);
+  }
+  result.resize(n);
+  return result;
+}
+
+// Construct a string from a number
+static std::string NumberString(int n) {
+  char buf[50];
+  snprintf(buf, sizeof(buf), "%d.", n);
+  return std::string(buf);
+}
+
+// Return a skewed potentially long string
+static std::string RandomSkewedString(int i, Random* rnd) {
+  return BigString(NumberString(i), rnd->Skewed(17));
+}
+
+class LogTest {
+ private:
+  class StringDest : public WritableFile {
+   public:
+    std::string contents_;
+
+    virtual Status Close() { return Status::OK(); }
+    virtual Status Flush() { return Status::OK(); }
+    virtual Status Sync() { return Status::OK(); }
+    virtual Status Append(const Slice& slice) {
+      contents_.append(slice.data(), slice.size());
+      return Status::OK();
+    }
+  };
+
+  class StringSource : public SequentialFile {
+   public:
+    Slice contents_;
+    bool force_error_;
+    bool returned_partial_;
+    StringSource() : force_error_(false), returned_partial_(false) { }
+
+    virtual Status Read(size_t n, Slice* result, char* scratch) {
+      ASSERT_TRUE(!returned_partial_) << "must not Read() after eof/error";
+      ASSERT_EQ(kBlockSize, n);
+
+      if (force_error_) {
+        force_error_ = false;
+        returned_partial_ = true;
+        return Status::Corruption("read error");
+      }
+
+      if (contents_.size() < n) {
+        n = contents_.size();
+        returned_partial_ = true;
+      }
+      *result = Slice(contents_.data(), n);
+      contents_.remove_prefix(n);
+      return Status::OK();
+    }
+  };
+
+  class ReportCollector : public Reader::Reporter {
+   public:
+    size_t dropped_bytes_;
+    std::string message_;
+
+    ReportCollector() : dropped_bytes_(0) { }
+    virtual void Corruption(size_t bytes, const Status& status) {
+      dropped_bytes_ += bytes;
+      message_.append(status.ToString());
+    }
+  };
+
+  StringDest dest_;
+  StringSource source_;
+  ReportCollector report_;
+  bool reading_;
+  Writer writer_;
+  Reader reader_;
+
+ public:
+  LogTest() : reading_(false),
+              writer_(&dest_),
+              reader_(&source_, &report_, true/*checksum*/) {
+  }
+
+  void Write(const std::string& msg) {
+    ASSERT_TRUE(!reading_) << "Write() after starting to read";
+    writer_.AddRecord(Slice(msg));
+  }
+
+  size_t WrittenBytes() const {
+    return dest_.contents_.size();
+  }
+
+  std::string Read() {
+    if (!reading_) {
+      reading_ = true;
+      source_.contents_ = Slice(dest_.contents_);
+    }
+    std::string scratch;
+    Slice record;
+    if (reader_.ReadRecord(&record, &scratch)) {
+      return record.ToString();
+    } else {
+      return "EOF";
+    }
+  }
+
+  void IncrementByte(int offset, int delta) {
+    dest_.contents_[offset] += delta;
+  }
+
+  void SetByte(int offset, char new_byte) {
+    dest_.contents_[offset] = new_byte;
+  }
+
+  void ShrinkSize(int bytes) {
+    dest_.contents_.resize(dest_.contents_.size() - bytes);
+  }
+
+  void FixChecksum(int header_offset, int len) {
+    // Compute crc of type/len/data
+    uint32_t crc = crc32c::Value(&dest_.contents_[header_offset+6], 1 + len);
+    crc = crc32c::Mask(crc);
+    EncodeFixed32(&dest_.contents_[header_offset], crc);
+  }
+
+  void ForceError() {
+    source_.force_error_ = true;
+  }
+
+  size_t DroppedBytes() const {
+    return report_.dropped_bytes_;
+  }
+
+  // Returns OK iff recorded error message contains "msg"
+  std::string MatchError(const std::string& msg) const {
+    if (report_.message_.find(msg) == std::string::npos) {
+      return report_.message_;
+    } else {
+      return "OK";
+    }
+  }
+};
+
+TEST(LogTest, Empty) {
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, ReadWrite) {
+  Write("foo");
+  Write("bar");
+  Write("");
+  Write("xxxx");
+  ASSERT_EQ("foo", Read());
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("", Read());
+  ASSERT_EQ("xxxx", Read());
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ("EOF", Read());  // Make sure reads at eof work
+}
+
+TEST(LogTest, ManyBlocks) {
+  for (int i = 0; i < 100000; i++) {
+    Write(NumberString(i));
+  }
+  for (int i = 0; i < 100000; i++) {
+    ASSERT_EQ(NumberString(i), Read());
+  }
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, Fragmentation) {
+  Write("small");
+  Write(BigString("medium", 50000));
+  Write(BigString("large", 100000));
+  ASSERT_EQ("small", Read());
+  ASSERT_EQ(BigString("medium", 50000), Read());
+  ASSERT_EQ(BigString("large", 100000), Read());
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, MarginalTrailer) {
+  // Make a trailer that is exactly the same length as an empty record.
+  const int n = kBlockSize - 2*kHeaderSize;
+  Write(BigString("foo", n));
+  ASSERT_EQ(kBlockSize - kHeaderSize, WrittenBytes());
+  Write("");
+  Write("bar");
+  ASSERT_EQ(BigString("foo", n), Read());
+  ASSERT_EQ("", Read());
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, ShortTrailer) {
+  const int n = kBlockSize - 2*kHeaderSize + 4;
+  Write(BigString("foo", n));
+  ASSERT_EQ(kBlockSize - kHeaderSize + 4, WrittenBytes());
+  Write("");
+  Write("bar");
+  ASSERT_EQ(BigString("foo", n), Read());
+  ASSERT_EQ("", Read());
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, AlignedEof) {
+  const int n = kBlockSize - 2*kHeaderSize + 4;
+  Write(BigString("foo", n));
+  ASSERT_EQ(kBlockSize - kHeaderSize + 4, WrittenBytes());
+  ASSERT_EQ(BigString("foo", n), Read());
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, RandomRead) {
+  const int N = 500;
+  Random write_rnd(301);
+  for (int i = 0; i < N; i++) {
+    Write(RandomSkewedString(i, &write_rnd));
+  }
+  Random read_rnd(301);
+  for (int i = 0; i < N; i++) {
+    ASSERT_EQ(RandomSkewedString(i, &read_rnd), Read());
+  }
+  ASSERT_EQ("EOF", Read());
+}
+
+// Tests of all the error paths in log_reader.cc follow:
+
+TEST(LogTest, ReadError) {
+  Write("foo");
+  ForceError();
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(kBlockSize, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("read error"));
+}
+
+TEST(LogTest, BadRecordType) {
+  Write("foo");
+  // Type is stored in header[6]
+  IncrementByte(6, 100);
+  FixChecksum(0, 3);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("unknown record type"));
+}
+
+TEST(LogTest, TruncatedTrailingRecord) {
+  Write("foo");
+  ShrinkSize(4);   // Drop all payload as well as a header byte
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(kHeaderSize - 1, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("truncated record at end of file"));
+}
+
+TEST(LogTest, BadLength) {
+  Write("foo");
+  ShrinkSize(1);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(kHeaderSize + 2, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("bad record length"));
+}
+
+TEST(LogTest, ChecksumMismatch) {
+  Write("foo");
+  IncrementByte(0, 10);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("checksum mismatch"));
+}
+
+TEST(LogTest, UnexpectedMiddleType) {
+  Write("foo");
+  SetByte(6, kMiddleType);
+  FixChecksum(0, 3);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("missing start"));
+}
+
+TEST(LogTest, UnexpectedLastType) {
+  Write("foo");
+  SetByte(6, kLastType);
+  FixChecksum(0, 3);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("missing start"));
+}
+
+TEST(LogTest, UnexpectedFullType) {
+  Write("foo");
+  Write("bar");
+  SetByte(6, kFirstType);
+  FixChecksum(0, 3);
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("partial record without end"));
+}
+
+TEST(LogTest, UnexpectedFirstType) {
+  Write("foo");
+  Write(BigString("bar", 100000));
+  SetByte(6, kFirstType);
+  FixChecksum(0, 3);
+  ASSERT_EQ(BigString("bar", 100000), Read());
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("partial record without end"));
+}
+
+TEST(LogTest, ErrorJoinsRecords) {
+  // Consider two fragmented records:
+  //    first(R1) last(R1) first(R2) last(R2)
+  // where the middle two fragments disappear.  We do not want
+  // first(R1),last(R2) to get joined and returned as a valid record.
+
+  // Write records that span two blocks
+  Write(BigString("foo", kBlockSize));
+  Write(BigString("bar", kBlockSize));
+  Write("correct");
+
+  // Wipe the middle block
+  for (int offset = kBlockSize; offset < 2*kBlockSize; offset++) {
+    SetByte(offset, 'x');
+  }
+
+  ASSERT_EQ("correct", Read());
+  ASSERT_EQ("EOF", Read());
+  const int dropped = DroppedBytes();
+  ASSERT_LE(dropped, 2*kBlockSize + 100);
+  ASSERT_GE(dropped, 2*kBlockSize);
+}
+
+}
+}
+
+int main(int argc, char** argv) {
+  return leveldb::test::RunAllTests();
+}
--- a/db/log_writer.cc
+++ b/db/log_writer.cc
@@ -0,0 +1,101 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_writer.h"
+
+#include <stdint.h>
+#include "include/env.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+
+namespace leveldb {
+namespace log {
+
+Writer::Writer(WritableFile* dest)
+    : dest_(dest),
+      block_offset_(0) {
+  for (int i = 0; i <= kMaxRecordType; i++) {
+    char t = static_cast<char>(i);
+    type_crc_[i] = crc32c::Value(&t, 1);
+  }
+}
+
+Writer::~Writer() {
+}
+
+Status Writer::AddRecord(const Slice& slice) {
+  const char* ptr = slice.data();
+  size_t left = slice.size();
+
+  // Fragment the record if necessary and emit it.  Note that if slice
+  // is empty, we still want to iterate once to emit a single
+  // zero-length record
+  Status s;
+  do {
+    const int leftover = kBlockSize - block_offset_;
+    assert(leftover >= 0);
+    if (leftover <= kHeaderSize) {
+      // Switch to a new block
+      if (leftover > 0) {
+        // Fill the trailer
+        dest_->Append(Slice("\x00\x00\x00\x00\x00\x00\x00", leftover));
+      }
+      block_offset_ = 0;
+    }
+
+    // Invariant: we never leave <= kHeaderSize bytes in a block.
+    const int avail = kBlockSize - block_offset_ - kHeaderSize;
+    assert(avail > 0);
+
+    const size_t fragment_length = (left < avail) ? left : avail;
+
+    RecordType type;
+    const bool begin = (ptr == slice.data());
+    const bool end = (left == fragment_length);
+    if (begin && end) {
+      type = kFullType;
+    } else if (begin) {
+      type = kFirstType;
+    } else if (end) {
+      type = kLastType;
+    } else {
+      type = kMiddleType;
+    }
+
+    s = EmitPhysicalRecord(type, ptr, fragment_length);
+    ptr += fragment_length;
+    left -= fragment_length;
+  } while (s.ok() && left > 0);
+  return s;
+}
+
+Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) {
+  assert(n <= 0xffff);  // Must fit in two bytes
+  assert(block_offset_ + kHeaderSize + n <= kBlockSize);
+
+  // Format the header
+  char buf[kHeaderSize];
+  buf[4] = static_cast<char>(n & 0xff);
+  buf[5] = static_cast<char>(n >> 8);
+  buf[6] = static_cast<char>(t);
+
+  // Compute the crc of the record type and the payload.
+  uint32_t crc = crc32c::Extend(type_crc_[t], ptr, n);
+  crc = crc32c::Mask(crc);                 // Adjust for storage
+  EncodeFixed32(buf, crc);
+
+  // Write the header and the payload
+  Status s = dest_->Append(Slice(buf, kHeaderSize));
+  if (s.ok()) {
+    s = dest_->Append(Slice(ptr, n));
+    if (s.ok()) {
+      s = dest_->Flush();
+    }
+  }
+  block_offset_ += kHeaderSize + n;
+  return s;
+}
+
+}
+}
--- a/db/log_writer.h
+++ b/db/log_writer.h
@@ -0,0 +1,48 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_LOG_WRITER_H_
+#define STORAGE_LEVELDB_DB_LOG_WRITER_H_
+
+#include <stdint.h>
+#include "db/log_format.h"
+#include "include/slice.h"
+#include "include/status.h"
+
+namespace leveldb {
+
+class WritableFile;
+
+namespace log {
+
+class Writer {
+ public:
+  // Create a writer that will append data to "*dest".
+  // "*dest" must be initially empty.
+  // "*dest" must remain live while this Writer is in use.
+  explicit Writer(WritableFile* dest);
+  ~Writer();
+
+  Status AddRecord(const Slice& slice);
+
+ private:
+  WritableFile* dest_;
+  int block_offset_;       // Current offset in block
+
+  // crc32c values for all supported record types.  These are
+  // pre-computed to reduce the overhead of computing the crc of the
+  // record type stored in the header.
+  uint32_t type_crc_[kMaxRecordType + 1];
+
+  Status EmitPhysicalRecord(RecordType type, const char* ptr, size_t length);
+
+  // No copying allowed
+  Writer(const Writer&);
+  void operator=(const Writer&);
+};
+
+}
+}
+
+#endif  // STORAGE_LEVELDB_DB_LOG_WRITER_H_
--- a/db/memtable.cc
+++ b/db/memtable.cc
@@ -0,0 +1,109 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/memtable.h"
+#include "db/dbformat.h"
+#include "include/comparator.h"
+#include "include/env.h"
+#include "include/iterator.h"
+#include "util/coding.h"
+
+namespace leveldb {
+
+static Slice GetLengthPrefixedSlice(const char* data) {
+  uint32_t len;
+  const char* p = data;
+  p = GetVarint32Ptr(p, p + 5, &len);  // +5: we assume "p" is not corrupted
+  return Slice(p, len);
+}
+
+MemTable::MemTable(const InternalKeyComparator& cmp)
+    : comparator_(cmp),
+      table_(comparator_, &arena_) {
+}
+
+MemTable::~MemTable() {
+}
+
+size_t MemTable::ApproximateMemoryUsage() { return arena_.MemoryUsage(); }
+
+int MemTable::KeyComparator::operator()(const char* aptr, const char* bptr)
+    const {
+  // Internal keys are encoded as length-prefixed strings.
+  Slice a = GetLengthPrefixedSlice(aptr);
+  Slice b = GetLengthPrefixedSlice(bptr);
+  return comparator.Compare(a, b);
+}
+
+// Encode a suitable internal key target for "target" and return it.
+// Uses *scratch as scratch space, and the returned pointer will point
+// into this scratch space.
+static const char* EncodeKey(std::string* scratch, const Slice& target) {
+  scratch->clear();
+  PutVarint32(scratch, target.size());
+  scratch->append(target.data(), target.size());
+  return scratch->data();
+}
+
+class MemTableIterator: public Iterator {
+ public:
+  explicit MemTableIterator(MemTable::Table* table) {
+    iter_ = new MemTable::Table::Iterator(table);
+  }
+  virtual ~MemTableIterator() { delete iter_; }
+
+  virtual bool Valid() const { return iter_->Valid(); }
+  virtual void Seek(const Slice& k) { iter_->Seek(EncodeKey(&tmp_, k)); }
+  virtual void SeekToFirst() { iter_->SeekToFirst(); }
+  virtual void SeekToLast() { iter_->SeekToLast(); }
+  virtual void Next() { iter_->Next(); }
+  virtual void Prev() { iter_->Prev(); }
+  virtual Slice key() const { return GetLengthPrefixedSlice(iter_->key()); }
+  virtual Slice value() const {
+    Slice key_slice = GetLengthPrefixedSlice(iter_->key());
+    return GetLengthPrefixedSlice(key_slice.data() + key_slice.size());
+  }
+
+  virtual Status status() const { return Status::OK(); }
+
+ private:
+  MemTable::Table::Iterator* iter_;
+  std::string tmp_;       // For passing to EncodeKey
+
+  // No copying allowed
+  MemTableIterator(const MemTableIterator&);
+  void operator=(const MemTableIterator&);
+};
+
+Iterator* MemTable::NewIterator() {
+  return new MemTableIterator(&table_);
+}
+
+void MemTable::Add(SequenceNumber s, ValueType type,
+                   const Slice& key,
+                   const Slice& value) {
+  // Format of an entry is concatenation of:
+  //  key_size     : varint32 of internal_key.size()
+  //  key bytes    : char[internal_key.size()]
+  //  value_size   : varint32 of value.size()
+  //  value bytes  : char[value.size()]
+  size_t key_size = key.size();
+  size_t val_size = value.size();
+  size_t internal_key_size = key_size + 8;
+  const size_t encoded_len =
+      VarintLength(internal_key_size) + internal_key_size +
+      VarintLength(val_size) + val_size;
+  char* buf = arena_.Allocate(encoded_len);
+  char* p = EncodeVarint32(buf, internal_key_size);
+  memcpy(p, key.data(), key_size);
+  p += key_size;
+  EncodeFixed64(p, (s << 8) | type);
+  p += 8;
+  p = EncodeVarint32(p, val_size);
+  memcpy(p, value.data(), val_size);
+  assert((p + val_size) - buf == encoded_len);
+  table_.Insert(buf);
+}
+
+}
--- a/db/memtable.h
+++ b/db/memtable.h
@@ -0,0 +1,69 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_MEMTABLE_H_
+#define STORAGE_LEVELDB_DB_MEMTABLE_H_
+
+#include <string>
+#include "include/db.h"
+#include "db/dbformat.h"
+#include "db/skiplist.h"
+#include "util/arena.h"
+
+namespace leveldb {
+
+class InternalKeyComparator;
+class Mutex;
+class MemTableIterator;
+
+class MemTable {
+ public:
+  explicit MemTable(const InternalKeyComparator& comparator);
+  ~MemTable();
+
+  // Returns an estimate of the number of bytes of data in use by this
+  // data structure.
+  //
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
+  size_t ApproximateMemoryUsage();
+
+  // Return an iterator that yields the contents of the memtable.
+  //
+  // The caller must ensure that the underlying MemTable remains live
+  // while the returned iterator is live.  The keys returned by this
+  // iterator are internal keys encoded by AppendInternalKey in the
+  // db/format.{h,cc} module.
+  Iterator* NewIterator();
+
+  // Add an entry into memtable that maps key to value at the
+  // specified sequence number and with the specified type.
+  // Typically value will be empty if type==kTypeDeletion.
+  void Add(SequenceNumber seq, ValueType type,
+           const Slice& key,
+           const Slice& value);
+
+ private:
+  struct KeyComparator {
+    const InternalKeyComparator comparator;
+    explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { }
+    int operator()(const char* a, const char* b) const;
+  };
+  friend class MemTableIterator;
+  friend class MemTableBackwardIterator;
+
+  typedef SkipList<const char*, KeyComparator> Table;
+
+  KeyComparator comparator_;
+  Arena arena_;
+  Table table_;
+
+  // No copying allowed
+  MemTable(const MemTable&);
+  void operator=(const MemTable&);
+};
+
+}
+
+#endif  // STORAGE_LEVELDB_DB_MEMTABLE_H_
--- a/db/repair.cc
+++ b/db/repair.cc
@@ -0,0 +1,396 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// We recover the contents of the descriptor from the other files we find.
+// (1) Any log files are first converted to tables
+// (2) We scan every table to compute
+//     (a) smallest/largest for the table
+//     (b) large value refs from the table
+//     (c) largest sequence number in the table
+// (3) We generate descriptor contents:
+//      - log number is set to zero
+//      - next-file-number is set to 1 + largest file number we found
+//      - last-sequence-number is set to largest sequence# found across
+//        all tables (see 2c)
+//      - compaction pointers are cleared
+//      - every table file is added at level 0
+//
+// Possible optimization 1:
+//   (a) Compute total size and use to pick appropriate max-level M
+//   (b) Sort tables by largest sequence# in the table
+//   (c) For each table: if it overlaps earlier table, place in level-0,
+//       else place in level-M.
+// Possible optimization 2:
+//   Store per-table metadata (smallest, largest, largest-seq#,
+//   large-value-refs, ...) in the table's meta section to speed up
+//   ScanTable.
+
+#include "db/builder.h"
+#include "db/db_impl.h"
+#include "db/dbformat.h"
+#include "db/filename.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/table_cache.h"
+#include "db/version_edit.h"
+#include "db/write_batch_internal.h"
+#include "include/comparator.h"
+#include "include/db.h"
+#include "include/env.h"
+
+namespace leveldb {
+
+namespace {
+
+class Repairer {
+ public:
+  Repairer(const std::string& dbname, const Options& options)
+      : dbname_(dbname),
+        env_(options.env),
+        icmp_(options.comparator),
+        options_(SanitizeOptions(dbname, &icmp_, options)),
+        owns_info_log_(options_.info_log != options.info_log),
+        next_file_number_(1) {
+    // TableCache can be small since we expect each table to be opened once.
+    table_cache_ = new TableCache(dbname_, &options_, 10);
+  }
+
+  ~Repairer() {
+    delete table_cache_;
+    if (owns_info_log_) {
+      delete options_.info_log;
+    }
+  }
+
+  Status Run() {
+    Status status = FindFiles();
+    if (status.ok()) {
+      ConvertLogFilesToTables();
+      ExtractMetaData();
+      status = WriteDescriptor();
+    }
+    if (status.ok()) {
+      unsigned long long bytes = 0;
+      for (int i = 0; i < tables_.size(); i++) {
+        bytes += tables_[i].meta.file_size;
+      }
+      Log(env_, options_.info_log,
+          "**** Repaired leveldb %s; "
+          "recovered %d files; %llu bytes. "
+          "Some data may have been lost. "
+          "****",
+          dbname_.c_str(),
+          static_cast<int>(tables_.size()),
+          bytes);
+    }
+    return status;
+  }
+
+ private:
+  struct TableInfo {
+    FileMetaData meta;
+    SequenceNumber max_sequence;
+  };
+
+  std::string const dbname_;
+  Env* const env_;
+  InternalKeyComparator const icmp_;
+  Options const options_;
+  bool owns_info_log_;
+  TableCache* table_cache_;
+  VersionEdit edit_;
+
+  std::vector<std::string> manifests_;
+  std::vector<uint64_t> table_numbers_;
+  std::vector<uint64_t> logs_;
+  std::vector<TableInfo> tables_;
+  uint64_t next_file_number_;
+
+  Status FindFiles() {
+    std::vector<std::string> filenames;
+    Status status = env_->GetChildren(dbname_, &filenames);
+    if (!status.ok()) {
+      return status;
+    }
+    if (filenames.empty()) {
+      return Status::IOError(dbname_, "repair found no files");
+    }
+
+    uint64_t number;
+    LargeValueRef large_ref;
+    FileType type;
+    for (int i = 0; i < filenames.size(); i++) {
+      if (ParseFileName(filenames[i], &number, &large_ref, &type)) {
+        if (type == kLargeValueFile) {
+          // Will be picked up when we process a Table that points to it
+        } else if (type == kDescriptorFile) {
+          manifests_.push_back(filenames[i]);
+        } else {
+          if (number + 1 > next_file_number_) {
+            next_file_number_ = number + 1;
+          }
+          if (type == kLogFile) {
+            logs_.push_back(number);
+          } else if (type == kTableFile) {
+            table_numbers_.push_back(number);
+          } else {
+            // Ignore other files
+          }
+        }
+      }
+    }
+    return status;
+  }
+
+  void ConvertLogFilesToTables() {
+    for (int i = 0; i < logs_.size(); i++) {
+      std::string logname = LogFileName(dbname_, logs_[i]);
+      Status status = ConvertLogToTable(logs_[i]);
+      if (!status.ok()) {
+        Log(env_, options_.info_log, "Log #%llu: ignoring conversion error: %s",
+            (unsigned long long) logs_[i],
+            status.ToString().c_str());
+      }
+      ArchiveFile(logname);
+    }
+  }
+
+  Status ConvertLogToTable(uint64_t log) {
+    struct LogReporter : public log::Reader::Reporter {
+      Env* env;
+      WritableFile* info_log;
+      uint64_t lognum;
+      virtual void Corruption(size_t bytes, const Status& s) {
+        // We print error messages for corruption, but continue repairing.
+        Log(env, info_log, "Log #%llu: dropping %d bytes; %s",
+            (unsigned long long) lognum,
+            static_cast<int>(bytes),
+            s.ToString().c_str());
+      }
+    };
+
+    // Open the log file
+    std::string logname = LogFileName(dbname_, log);
+    SequentialFile* lfile;
+    Status status = env_->NewSequentialFile(logname, &lfile);
+    if (!status.ok()) {
+      return status;
+    }
+
+    // Create the log reader.
+    LogReporter reporter;
+    reporter.env = env_;
+    reporter.info_log = options_.info_log;
+    reporter.lognum = log;
+    // We intentially make log::Reader do checksumming so that
+    // corruptions cause entire commits to be skipped instead of
+    // propagating bad information (like overly large sequence
+    // numbers).
+    log::Reader reader(lfile, &reporter, false/*do not checksum*/);
+
+    // Read all the records and add to a memtable
+    std::string scratch;
+    Slice record;
+    WriteBatch batch;
+    MemTable mem(icmp_);
+    int counter = 0;
+    while (reader.ReadRecord(&record, &scratch)) {
+      if (record.size() < 12) {
+        reporter.Corruption(
+            record.size(), Status::Corruption("log record too small"));
+        continue;
+      }
+      WriteBatchInternal::SetContents(&batch, record);
+      status = WriteBatchInternal::InsertInto(&batch, &mem);
+      if (status.ok()) {
+        counter += WriteBatchInternal::Count(&batch);
+      } else {
+        Log(env_, options_.info_log, "Log #%llu: ignoring %s",
+            (unsigned long long) log,
+            status.ToString().c_str());
+        status = Status::OK();  // Keep going with rest of file
+      }
+    }
+    delete lfile;
+
+    // We ignore any version edits generated by the conversion to a Table
+    // since ExtractMetaData() will also generate edits.
+    VersionEdit skipped;
+    FileMetaData meta;
+    meta.number = next_file_number_++;
+    Iterator* iter = mem.NewIterator();
+    status = BuildTable(dbname_, env_, options_, table_cache_, iter,
+                        &meta, &skipped);
+    delete iter;
+    if (status.ok()) {
+      if (meta.file_size > 0) {
+        table_numbers_.push_back(meta.number);
+      }
+    }
+    Log(env_, options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s",
+        (unsigned long long) log,
+        counter,
+        (unsigned long long) meta.number,
+        status.ToString().c_str());
+    return status;
+  }
+
+  void ExtractMetaData() {
+    std::vector<TableInfo> kept;
+    for (int i = 0; i < table_numbers_.size(); i++) {
+      TableInfo t;
+      t.meta.number = table_numbers_[i];
+      Status status = ScanTable(&t);
+      if (!status.ok()) {
+        std::string fname = TableFileName(dbname_, table_numbers_[i]);
+        Log(env_, options_.info_log, "Table #%llu: ignoring %s",
+            (unsigned long long) table_numbers_[i],
+            status.ToString().c_str());
+        ArchiveFile(fname);
+      } else {
+        tables_.push_back(t);
+      }
+    }
+  }
+
+  Status ScanTable(TableInfo* t) {
+    std::string fname = TableFileName(dbname_, t->meta.number);
+    int counter = 0;
+    Status status = env_->GetFileSize(fname, &t->meta.file_size);
+    if (status.ok()) {
+      Iterator* iter = table_cache_->NewIterator(
+          ReadOptions(), t->meta.number);
+      bool empty = true;
+      ParsedInternalKey parsed;
+      t->max_sequence = 0;
+      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+        Slice key = iter->key();
+        if (!ParseInternalKey(key, &parsed)) {
+          Log(env_, options_.info_log, "Table #%llu: unparsable key %s",
+              (unsigned long long) t->meta.number,
+              EscapeString(key).c_str());
+          continue;
+        }
+
+        counter++;
+        if (empty) {
+          empty = false;
+          t->meta.smallest.DecodeFrom(key);
+        }
+        t->meta.largest.DecodeFrom(key);
+        if (parsed.sequence > t->max_sequence) {
+          t->max_sequence = parsed.sequence;
+        }
+
+        if (ExtractValueType(key) == kTypeLargeValueRef) {
+          if (iter->value().size() != LargeValueRef::ByteSize()) {
+            Log(env_, options_.info_log, "Table #%llu: bad large value ref",
+                (unsigned long long) t->meta.number);
+          } else {
+            edit_.AddLargeValueRef(LargeValueRef::FromRef(iter->value()),
+                                   t->meta.number,
+                                   key);
+          }
+        }
+      }
+      if (!iter->status().ok()) {
+        status = iter->status();
+      }
+      delete iter;
+    }
+    Log(env_, options_.info_log, "Table #%llu: %d entries %s",
+        (unsigned long long) t->meta.number,
+        counter,
+        status.ToString().c_str());
+    return status;
+  }
+
+  Status WriteDescriptor() {
+    std::string tmp = TempFileName(dbname_, 1);
+    WritableFile* file;
+    Status status = env_->NewWritableFile(tmp, &file);
+    if (!status.ok()) {
+      return status;
+    }
+
+    SequenceNumber max_sequence = 0;
+    for (int i = 0; i < tables_.size(); i++) {
+      if (max_sequence < tables_[i].max_sequence) {
+        max_sequence = tables_[i].max_sequence;
+      }
+    }
+
+    edit_.SetComparatorName(icmp_.user_comparator()->Name());
+    edit_.SetLogNumber(0);
+    edit_.SetNextFile(next_file_number_);
+    edit_.SetLastSequence(max_sequence);
+
+    for (int i = 0; i < tables_.size(); i++) {
+      // TODO(opt): separate out into multiple levels
+      const TableInfo& t = tables_[i];
+      edit_.AddFile(0, t.meta.number, t.meta.file_size,
+                    t.meta.smallest, t.meta.largest);
+    }
+
+    //fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str());
+    {
+      log::Writer log(file);
+      std::string record;
+      edit_.EncodeTo(&record);
+      status = log.AddRecord(record);
+    }
+    if (status.ok()) {
+      status = file->Close();
+    }
+    delete file;
+    file = NULL;
+
+    if (!status.ok()) {
+      env_->DeleteFile(tmp);
+    } else {
+      // Discard older manifests
+      for (int i = 0; i < manifests_.size(); i++) {
+        ArchiveFile(dbname_ + "/" + manifests_[i]);
+      }
+
+      // Install new manifest
+      status = env_->RenameFile(tmp, DescriptorFileName(dbname_, 1));
+      if (status.ok()) {
+        status = SetCurrentFile(env_, dbname_, 1);
+      } else {
+        env_->DeleteFile(tmp);
+      }
+    }
+    return status;
+  }
+
+  void ArchiveFile(const std::string& fname) {
+    // Move into another directory.  E.g., for
+    //    dir/foo
+    // rename to
+    //    dir/lost/foo
+    const char* slash = strrchr(fname.c_str(), '/');
+    std::string new_dir;
+    if (slash != NULL) {
+      new_dir.assign(fname.data(), slash - fname.data());
+    }
+    new_dir.append("/lost");
+    env_->CreateDir(new_dir);  // Ignore error
+    std::string new_file = new_dir;
+    new_file.append("/");
+    new_file.append((slash == NULL) ? fname.c_str() : slash + 1);
+    Status s = env_->RenameFile(fname, new_file);
+    Log(env_, options_.info_log, "Archiving %s: %s\n",
+        fname.c_str(), s.ToString().c_str());
+  }
+};
+}
+
+Status RepairDB(const std::string& dbname, const Options& options) {
+  Repairer repairer(dbname, options);
+  return repairer.Run();
+}
+
+}
--- a/db/skiplist.h
+++ b/db/skiplist.h
@@ -0,0 +1,378 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Thread safety
+// -------------
+//
+// Writes require external synchronization, most likely a mutex.
+// Reads require a guarantee that the SkipList will not be destroyed
+// while the read is in progress.  Apart from that, reads progress
+// without any internal locking or synchronization.
+//
+// Invariants:
+//
+// (1) Allocated nodes are never deleted until the SkipList is
+// destroyed.  This is trivially guaranteed by the code since we
+// never delete any skip list nodes.
+//
+// (2) The contents of a Node except for the next/prev pointers are
+// immutable after the Node has been linked into the SkipList.
+// Only Insert() modifies the list, and it is careful to initialize
+// a node and use release-stores to publish the nodes in one or
+// more lists.
+//
+// ... prev vs. next pointer ordering ...
+
+#include <assert.h>
+#include <stdlib.h>
+#include "port/port.h"
+#include "util/arena.h"
+#include "util/random.h"
+
+namespace leveldb {
+
+class Arena;
+
+template<typename Key, class Comparator>
+class SkipList {
+ private:
+  struct Node;
+
+ public:
+  // Create a new SkipList object that will use "cmp" for comparing keys,
+  // and will allocate memory using "*arena".  Objects allocated in the arena
+  // must remain allocated for the lifetime of the skiplist object.
+  explicit SkipList(Comparator cmp, Arena* arena);
+
+  // Insert key into the list.
+  // REQUIRES: nothing that compares equal to key is currently in the list.
+  void Insert(const Key& key);
+
+  // Returns true iff an entry that compares equal to key is in the list.
+  bool Contains(const Key& key) const;
+
+  // Iteration over the contents of a skip list
+  class Iterator {
+   public:
+    // Initialize an iterator over the specified list.
+    // The returned iterator is not valid.
+    explicit Iterator(const SkipList* list);
+
+    // Returns true iff the iterator is positioned at a valid node.
+    bool Valid() const;
+
+    // Returns the key at the current position.
+    // REQUIRES: Valid()
+    const Key& key() const;
+
+    // Advances to the next position.
+    // REQUIRES: Valid()
+    void Next();
+
+    // Advances to the previous position.
+    // REQUIRES: Valid()
+    void Prev();
+
+    // Advance to the first entry with a key >= target
+    void Seek(const Key& target);
+
+    // Position at the first entry in list.
+    // Final state of iterator is Valid() iff list is not empty.
+    void SeekToFirst();
+
+    // Position at the last entry in list.
+    // Final state of iterator is Valid() iff list is not empty.
+    void SeekToLast();
+
+   private:
+    const SkipList* list_;
+    Node* node_;
+    // Intentionally copyable
+  };
+
+ private:
+  enum { kMaxHeight = 12 };
+
+  // Immutable after construction
+  Comparator const compare_;
+  Arena* const arena_;    // Arena used for allocations of nodes
+
+  Node* const head_;
+
+  // Modified only by Insert().  Read racily by readers, but stale
+  // values are ok.
+  port::AtomicPointer max_height_;   // Height of the entire list
+
+  inline int GetMaxHeight() const {
+    return reinterpret_cast<intptr_t>(max_height_.NoBarrier_Load());
+  }
+
+  // Read/written only by Insert().
+  Random rnd_;
+
+  Node* NewNode(const Key& key, int height);
+  int RandomHeight();
+  bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }
+
+  // Return true if key is greater than the data stored in "n"
+  bool KeyIsAfterNode(const Key& key, Node* n) const;
+
+  // Return the earliest node that comes at or after key.
+  // Return NULL if there is no such node.
+  //
+  // If prev is non-NULL, fills prev[level] with pointer to previous
+  // node at "level" for every level in [0..max_height_-1].
+  Node* FindGreaterOrEqual(const Key& key, Node** prev) const;
+
+  // Return the latest node with a key < key.
+  // Return head_ if there is no such node.
+  Node* FindLessThan(const Key& key) const;
+
+  // Return the last node in the list.
+  // Return head_ if list is empty.
+  Node* FindLast() const;
+
+  // No copying allowed
+  SkipList(const SkipList&);
+  void operator=(const SkipList&);
+};
+
+// Implementation details follow
+template<typename Key, class Comparator>
+struct SkipList<Key,Comparator>::Node {
+  explicit Node(const Key& k) : key(k) { }
+
+  Key const key;
+
+  // Accessors/mutators for links.  Wrapped in methods so we can
+  // add the appropriate barriers as necessary.
+  Node* Next(int n) {
+    assert(n >= 0);
+    // Use an 'acquire load' so that we observe a fully initialized
+    // version of the returned Node.
+    return reinterpret_cast<Node*>(next_[n].Acquire_Load());
+  }
+  void SetNext(int n, Node* x) {
+    assert(n >= 0);
+    // Use a 'release store' so that anybody who reads through this
+    // pointer observes a fully initialized version of the inserted node.
+    next_[n].Release_Store(x);
+  }
+
+  // No-barrier variants that can be safely used in a few locations.
+  Node* NoBarrier_Next(int n) {
+    assert(n >= 0);
+    return reinterpret_cast<Node*>(next_[n].NoBarrier_Load());
+  }
+  void NoBarrier_SetNext(int n, Node* x) {
+    assert(n >= 0);
+    next_[n].NoBarrier_Store(x);
+  }
+
+ private:
+  // Array of length equal to the node height.  next_[0] is lowest level link.
+  port::AtomicPointer next_[1];
+};
+
+template<typename Key, class Comparator>
+typename SkipList<Key,Comparator>::Node*
+SkipList<Key,Comparator>::NewNode(const Key& key, int height) {
+  char* mem = arena_->AllocateAligned(
+      sizeof(Node) + sizeof(port::AtomicPointer) * (height - 1));
+  return new (mem) Node(key);
+}
+
+template<typename Key, class Comparator>
+inline SkipList<Key,Comparator>::Iterator::Iterator(const SkipList* list) {
+  list_ = list;
+  node_ = NULL;
+}
+
+template<typename Key, class Comparator>
+inline bool SkipList<Key,Comparator>::Iterator::Valid() const {
+  return node_ != NULL;
+}
+
+template<typename Key, class Comparator>
+inline const Key& SkipList<Key,Comparator>::Iterator::key() const {
+  assert(Valid());
+  return node_->key;
+}
+
+template<typename Key, class Comparator>
+inline void SkipList<Key,Comparator>::Iterator::Next() {
+  assert(Valid());
+  node_ = node_->Next(0);
+}
+
+template<typename Key, class Comparator>
+inline void SkipList<Key,Comparator>::Iterator::Prev() {
+  // Instead of using explicit "prev" links, we just search for the
+  // last node that falls before key.
+  assert(Valid());
+  node_ = list_->FindLessThan(node_->key);
+  if (node_ == list_->head_) {
+    node_ = NULL;
+  }
+}
+
+template<typename Key, class Comparator>
+inline void SkipList<Key,Comparator>::Iterator::Seek(const Key& target) {
+  node_ = list_->FindGreaterOrEqual(target, NULL);
+}
+
+template<typename Key, class Comparator>
+inline void SkipList<Key,Comparator>::Iterator::SeekToFirst() {
+  node_ = list_->head_->Next(0);
+}
+
+template<typename Key, class Comparator>
+inline void SkipList<Key,Comparator>::Iterator::SeekToLast() {
+  node_ = list_->FindLast();
+  if (node_ == list_->head_) {
+    node_ = NULL;
+  }
+}
+
+template<typename Key, class Comparator>
+int SkipList<Key,Comparator>::RandomHeight() {
+  // Increase height with probability 1 in kBranching
+  static const unsigned int kBranching = 4;
+  int height = 1;
+  while (height < kMaxHeight && ((rnd_.Next() % kBranching) == 0)) {
+    height++;
+  }
+  assert(height > 0);
+  assert(height <= kMaxHeight);
+  return height;
+}
+
+template<typename Key, class Comparator>
+bool SkipList<Key,Comparator>::KeyIsAfterNode(const Key& key, Node* n) const {
+  // NULL n is considered infinite
+  return (n != NULL) && (compare_(n->key, key) < 0);
+}
+
+template<typename Key, class Comparator>
+typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindGreaterOrEqual(const Key& key, Node** prev)
+    const {
+  Node* x = head_;
+  int level = GetMaxHeight() - 1;
+  while (true) {
+    Node* next = x->Next(level);
+    if (KeyIsAfterNode(key, next)) {
+      // Keep searching in this list
+      x = next;
+    } else {
+      if (prev != NULL) prev[level] = x;
+      if (level == 0) {
+        return next;
+      } else {
+        // Switch to next list
+        level--;
+      }
+    }
+  }
+}
+
+template<typename Key, class Comparator>
+typename SkipList<Key,Comparator>::Node*
+SkipList<Key,Comparator>::FindLessThan(const Key& key) const {
+  Node* x = head_;
+  int level = GetMaxHeight() - 1;
+  while (true) {
+    assert(x == head_ || compare_(x->key, key) < 0);
+    Node* next = x->Next(level);
+    if (next == NULL || compare_(next->key, key) >= 0) {
+      if (level == 0) {
+        return x;
+      } else {
+        // Switch to next list
+        level--;
+      }
+    } else {
+      x = next;
+    }
+  }
+}
+
+template<typename Key, class Comparator>
+typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindLast()
+    const {
+  Node* x = head_;
+  int level = GetMaxHeight() - 1;
+  while (true) {
+    Node* next = x->Next(level);
+    if (next == NULL) {
+      if (level == 0) {
+        return x;
+      } else {
+        // Switch to next list
+        level--;
+      }
+    } else {
+      x = next;
+    }
+  }
+}
+
+template<typename Key, class Comparator>
+SkipList<Key,Comparator>::SkipList(Comparator cmp, Arena* arena)
+    : compare_(cmp),
+      arena_(arena),
+      head_(NewNode(0 /* any key will do */, kMaxHeight)),
+      max_height_(reinterpret_cast<void*>(1)),
+      rnd_(0xdeadbeef) {
+  for (int i = 0; i < kMaxHeight; i++) {
+    head_->SetNext(i, NULL);
+  }
+}
+
+template<typename Key, class Comparator>
+void SkipList<Key,Comparator>::Insert(const Key& key) {
+  // TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual()
+  // here since Insert() is externally synchronized.
+  Node* prev[kMaxHeight];
+  Node* x = FindGreaterOrEqual(key, prev);
+
+  // Our data structure does not allow duplicate insertion
+  assert(x == NULL || !Equal(key, x->key));
+
+  int height = RandomHeight();
+  if (height > GetMaxHeight()) {
+    for (int i = GetMaxHeight(); i < height; i++) {
+      prev[i] = head_;
+    }
+    //fprintf(stderr, "Change height from %d to %d\n", max_height_, height);
+
+    // It is ok to mutate max_height_ without any synchronization
+    // with concurrent readers.  A concurrent reader that observes
+    // the new value of max_height_ will see either the old value of
+    // new level pointers from head_ (NULL), or a new value set in
+    // the loop below.  In the former case the reader will
+    // immediately drop to the next level since NULL sorts after all
+    // keys.  In the latter case the reader will use the new node.
+    max_height_.NoBarrier_Store(reinterpret_cast<void*>(height));
+  }
+
+  x = NewNode(key, height);
+  for (int i = 0; i < height; i++) {
+    // NoBarrier_SetNext() suffices since we will add a barrier when
+    // we publish a pointer to "x" in prev[i].
+    x->NoBarrier_SetNext(i, prev[i]->NoBarrier_Next(i));
+    prev[i]->SetNext(i, x);
+  }
+}
+
+template<typename Key, class Comparator>
+bool SkipList<Key,Comparator>::Contains(const Key& key) const {
+  Node* x = FindGreaterOrEqual(key, NULL);
+  if (x != NULL && Equal(key, x->key)) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+}
--- a/db/skiplist_test.cc
+++ b/db/skiplist_test.cc
@@ -0,0 +1,378 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/skiplist.h"
+#include <set>
+#include "include/env.h"
+#include "util/arena.h"
+#include "util/hash.h"
+#include "util/random.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+typedef uint64_t Key;
+
+struct Comparator {
+  int operator()(const Key& a, const Key& b) const {
+    if (a < b) {
+      return -1;
+    } else if (a > b) {
+      return +1;
+    } else {
+      return 0;
+    }
+  }
+};
+
+class SkipTest { };
+
+TEST(SkipTest, Empty) {
+  Arena arena;
+  Comparator cmp;
+  SkipList<Key, Comparator> list(cmp, &arena);
+  ASSERT_TRUE(!list.Contains(10));
+
+  SkipList<Key, Comparator>::Iterator iter(&list);
+  ASSERT_TRUE(!iter.Valid());
+  iter.SeekToFirst();
+  ASSERT_TRUE(!iter.Valid());
+  iter.Seek(100);
+  ASSERT_TRUE(!iter.Valid());
+  iter.SeekToLast();
+  ASSERT_TRUE(!iter.Valid());
+}
+
+TEST(SkipTest, InsertAndLookup) {
+  const int N = 2000;
+  const int R = 5000;
+  Random rnd(1000);
+  std::set<Key> keys;
+  Arena arena;
+  Comparator cmp;
+  SkipList<Key, Comparator> list(cmp, &arena);
+  for (int i = 0; i < N; i++) {
+    Key key = rnd.Next() % R;
+    if (keys.insert(key).second) {
+      list.Insert(key);
+    }
+  }
+
+  for (int i = 0; i < R; i++) {
+    if (list.Contains(i)) {
+      ASSERT_EQ(keys.count(i), 1);
+    } else {
+      ASSERT_EQ(keys.count(i), 0);
+    }
+  }
+
+  // Simple iterator tests
+  {
+    SkipList<Key, Comparator>::Iterator iter(&list);
+    ASSERT_TRUE(!iter.Valid());
+
+    iter.Seek(0);
+    ASSERT_TRUE(iter.Valid());
+    ASSERT_EQ(*(keys.begin()), iter.key());
+
+    iter.SeekToFirst();
+    ASSERT_TRUE(iter.Valid());
+    ASSERT_EQ(*(keys.begin()), iter.key());
+
+    iter.SeekToLast();
+    ASSERT_TRUE(iter.Valid());
+    ASSERT_EQ(*(keys.rbegin()), iter.key());
+  }
+
+  // Forward iteration test
+  for (int i = 0; i < R; i++) {
+    SkipList<Key, Comparator>::Iterator iter(&list);
+    iter.Seek(i);
+
+    // Compare against model iterator
+    std::set<Key>::iterator model_iter = keys.lower_bound(i);
+    for (int j = 0; j < 3; j++) {
+      if (model_iter == keys.end()) {
+        ASSERT_TRUE(!iter.Valid());
+        break;
+      } else {
+        ASSERT_TRUE(iter.Valid());
+        ASSERT_EQ(*model_iter, iter.key());
+        ++model_iter;
+        iter.Next();
+      }
+    }
+  }
+
+  // Backward iteration test
+  {
+    SkipList<Key, Comparator>::Iterator iter(&list);
+    iter.SeekToLast();
+
+    // Compare against model iterator
+    for (std::set<Key>::reverse_iterator model_iter = keys.rbegin();
+         model_iter != keys.rend();
+         ++model_iter) {
+      ASSERT_TRUE(iter.Valid());
+      ASSERT_EQ(*model_iter, iter.key());
+      iter.Prev();
+    }
+    ASSERT_TRUE(!iter.Valid());
+  }
+}
+
+// We want to make sure that with a single writer and multiple
+// concurrent readers (with no synchronization other than when a
+// reader's iterator is created), the reader always observes all the
+// data that was present in the skip list when the iterator was
+// constructor.  Because insertions are happening concurrently, we may
+// also observe new values that were inserted since the iterator was
+// constructed, but we should never miss any values that were present
+// at iterator construction time.
+//
+// We generate multi-part keys:
+//     <key,gen,hash>
+// where:
+//     key is in range [0..K-1]
+//     gen is a generation number for key
+//     hash is hash(key,gen)
+//
+// The insertion code picks a random key, sets gen to be 1 + the last
+// generation number inserted for that key, and sets hash to Hash(key,gen).
+//
+// At the beginning of a read, we snapshot the last inserted
+// generation number for each key.  We then iterate, including random
+// calls to Next() and Seek().  For every key we encounter, we
+// check that it is either expected given the initial snapshot or has
+// been concurrently added since the iterator started.
+class ConcurrentTest {
+ private:
+  static const uint32_t K = 4;
+
+  static uint64_t key(Key key) { return (key >> 40); }
+  static uint64_t gen(Key key) { return (key >> 8) & 0xffffffffu; }
+  static uint64_t hash(Key key) { return key & 0xff; }
+
+  static uint64_t HashNumbers(uint64_t k, uint64_t g) {
+    uint64_t data[2] = { k, g };
+    return Hash(reinterpret_cast<char*>(data), sizeof(data), 0);
+  }
+
+  static Key MakeKey(uint64_t k, uint64_t g) {
+    assert(sizeof(Key) == sizeof(uint64_t));
+    assert(k <= K);  // We sometimes pass K to seek to the end of the skiplist
+    assert(g <= 0xffffffffu);
+    return ((k << 40) | (g << 8) | (HashNumbers(k, g) & 0xff));
+  }
+
+  static bool IsValidKey(Key k) {
+    return hash(k) == (HashNumbers(key(k), gen(k)) & 0xff);
+  }
+
+  static Key RandomTarget(Random* rnd) {
+    switch (rnd->Next() % 10) {
+      case 0:
+        // Seek to beginning
+        return MakeKey(0, 0);
+      case 1:
+        // Seek to end
+        return MakeKey(K, 0);
+      default:
+        // Seek to middle
+        return MakeKey(rnd->Next() % K, 0);
+    }
+  }
+
+  // Per-key generation
+  struct State {
+    port::AtomicPointer generation[K];
+    void Set(int k, intptr_t v) {
+      generation[k].Release_Store(reinterpret_cast<void*>(v));
+    }
+    intptr_t Get(int k) {
+      return reinterpret_cast<intptr_t>(generation[k].Acquire_Load());
+    }
+
+    State() {
+      for (int k = 0; k < K; k++) {
+        Set(k, 0);
+      }
+    }
+  };
+
+  // Current state of the test
+  State current_;
+
+  Arena arena_;
+
+  // SkipList is not protected by mu_.  We just use a single writer
+  // thread to modify it.
+  SkipList<Key, Comparator> list_;
+
+ public:
+  ConcurrentTest() : list_(Comparator(), &arena_) { }
+
+  // REQUIRES: External synchronization
+  void WriteStep(Random* rnd) {
+    const uint32_t k = rnd->Next() % K;
+    const intptr_t g = current_.Get(k) + 1;
+    const Key key = MakeKey(k, g);
+    list_.Insert(key);
+    current_.Set(k, g);
+  }
+
+  void ReadStep(Random* rnd) {
+    // Remember the initial committed state of the skiplist.
+    State initial_state;
+    for (int k = 0; k < K; k++) {
+      initial_state.Set(k, current_.Get(k));
+    }
+
+    Key pos = RandomTarget(rnd);
+    SkipList<Key, Comparator>::Iterator iter(&list_);
+    iter.Seek(pos);
+    while (true) {
+      Key current;
+      if (!iter.Valid()) {
+        current = MakeKey(K, 0);
+      } else {
+        current = iter.key();
+        ASSERT_TRUE(IsValidKey(current)) << std::hex << current;
+      }
+      ASSERT_LE(pos, current) << "should not go backwards";
+
+      // Verify that everything in [pos,current) was not present in
+      // initial_state.
+      while (pos < current) {
+        ASSERT_LT(key(pos), K) << std::hex << pos;
+
+        // Note that generation 0 is never inserted, so it is ok if
+        // <*,0,*> is missing.
+        ASSERT_TRUE((gen(pos) == 0) ||
+                    (gen(pos) > initial_state.Get(key(pos)))
+                    ) << "key: " << key(pos)
+                      << "; gen: " << gen(pos)
+                      << "; initgen: "
+                      << initial_state.Get(key(pos));
+
+        // Advance to next key in the valid key space
+        if (key(pos) < key(current)) {
+          pos = MakeKey(key(pos) + 1, 0);
+        } else {
+          pos = MakeKey(key(pos), gen(pos) + 1);
+        }
+      }
+
+      if (!iter.Valid()) {
+        break;
+      }
+
+      if (rnd->Next() % 2) {
+        iter.Next();
+        pos = MakeKey(key(pos), gen(pos) + 1);
+      } else {
+        Key new_target = RandomTarget(rnd);
+        if (new_target > pos) {
+          pos = new_target;
+          iter.Seek(new_target);
+        }
+      }
+    }
+  }
+};
+const uint32_t ConcurrentTest::K;
+
+// Simple test that does single-threaded testing of the ConcurrentTest
+// scaffolding.
+TEST(SkipTest, ConcurrentWithoutThreads) {
+  ConcurrentTest test;
+  Random rnd(test::RandomSeed());
+  for (int i = 0; i < 10000; i++) {
+    test.ReadStep(&rnd);
+    test.WriteStep(&rnd);
+  }
+}
+
+class TestState {
+ public:
+  ConcurrentTest t_;
+  int seed_;
+  port::AtomicPointer quit_flag_;
+
+  enum ReaderState {
+    STARTING,
+    RUNNING,
+    DONE
+  };
+
+  explicit TestState(int s)
+      : seed_(s),
+        quit_flag_(NULL),
+        state_(STARTING),
+        state_cv_(&mu_) {}
+
+  void Wait(ReaderState s) {
+    mu_.Lock();
+    while (state_ != s) {
+      state_cv_.Wait();
+    }
+    mu_.Unlock();
+  }
+
+  void Change(ReaderState s) {
+    mu_.Lock();
+    state_ = s;
+    state_cv_.Signal();
+    mu_.Unlock();
+  }
+
+ private:
+  port::Mutex mu_;
+  ReaderState state_;
+  port::CondVar state_cv_;
+};
+
+static void ConcurrentReader(void* arg) {
+  TestState* state = reinterpret_cast<TestState*>(arg);
+  Random rnd(state->seed_);
+  int64_t reads = 0;
+  state->Change(TestState::RUNNING);
+  while (!state->quit_flag_.Acquire_Load()) {
+    state->t_.ReadStep(&rnd);
+    ++reads;
+  }
+  state->Change(TestState::DONE);
+}
+
+static void RunConcurrent(int run) {
+  const int seed = test::RandomSeed() + (run * 100);
+  Random rnd(seed);
+  const int N = 1000;
+  const int kSize = 1000;
+  for (int i = 0; i < N; i++) {
+    if ((i % 100) == 0) {
+      fprintf(stderr, "Run %d of %d\n", i, N);
+    }
+    TestState state(seed + 1);
+    Env::Default()->Schedule(ConcurrentReader, &state);
+    state.Wait(TestState::RUNNING);
+    for (int i = 0; i < kSize; i++) {
+      state.t_.WriteStep(&rnd);
+    }
+    state.quit_flag_.Release_Store(&state);  // Any non-NULL arg will do
+    state.Wait(TestState::DONE);
+  }
+}
+
+TEST(SkipTest, Concurrent1) { RunConcurrent(1); }
+TEST(SkipTest, Concurrent2) { RunConcurrent(2); }
+TEST(SkipTest, Concurrent3) { RunConcurrent(3); }
+TEST(SkipTest, Concurrent4) { RunConcurrent(4); }
+TEST(SkipTest, Concurrent5) { RunConcurrent(5); }
+
+}
+
+int main(int argc, char** argv) {
+  return leveldb::test::RunAllTests();
+}
--- a/db/snapshot.h
+++ b/db/snapshot.h
@@ -0,0 +1,66 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_SNAPSHOT_H_
+#define STORAGE_LEVELDB_DB_SNAPSHOT_H_
+
+#include "include/db.h"
+
+namespace leveldb {
+
+class SnapshotList;
+
+// Snapshots are kept in a doubly-linked list in the DB.
+// Each Snapshot corresponds to a particular sequence number.
+class Snapshot {
+ public:
+  SequenceNumber number_;  // const after creation
+
+ private:
+  friend class SnapshotList;
+
+  // Snapshot is kept in a doubly-linked circular list
+  Snapshot* prev_;
+  Snapshot* next_;
+
+  SnapshotList* list_;                 // just for sanity checks
+};
+
+class SnapshotList {
+ public:
+  SnapshotList() {
+    list_.prev_ = &list_;
+    list_.next_ = &list_;
+  }
+
+  bool empty() const { return list_.next_ == &list_; }
+  Snapshot* oldest() const { assert(!empty()); return list_.next_; }
+  Snapshot* newest() const { assert(!empty()); return list_.prev_; }
+
+  const Snapshot* New(SequenceNumber seq) {
+    Snapshot* s = new Snapshot;
+    s->number_ = seq;
+    s->list_ = this;
+    s->next_ = &list_;
+    s->prev_ = list_.prev_;
+    s->prev_->next_ = s;
+    s->next_->prev_ = s;
+    return s;
+  }
+
+  void Delete(const Snapshot* s) {
+    assert(s->list_ == this);
+    s->prev_->next_ = s->next_;
+    s->next_->prev_ = s->prev_;
+    delete s;
+  }
+
+ private:
+  // Dummy head of doubly-linked list of snapshots
+  Snapshot list_;
+};
+
+}
+
+#endif  // STORAGE_LEVELDB_DB_SNAPSHOT_H_
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@@ -0,0 +1,94 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/table_cache.h"
+
+#include "db/filename.h"
+#include "include/env.h"
+#include "include/table.h"
+#include "util/coding.h"
+
+namespace leveldb {
+
+struct TableAndFile {
+  RandomAccessFile* file;
+  Table* table;
+};
+
+static void DeleteEntry(const Slice& key, void* value) {
+  TableAndFile* tf = reinterpret_cast<TableAndFile*>(value);
+  delete tf->table;
+  delete tf->file;
+  delete tf;
+}
+
+static void UnrefEntry(void* arg1, void* arg2) {
+  Cache* cache = reinterpret_cast<Cache*>(arg1);
+  Cache::Handle* h = reinterpret_cast<Cache::Handle*>(arg2);
+  cache->Release(h);
+}
+
+TableCache::TableCache(const std::string& dbname,
+                       const Options* options,
+                       int entries)
+    : env_(options->env),
+      dbname_(dbname),
+      options_(options),
+      cache_(NewLRUCache(entries)) {
+}
+
+TableCache::~TableCache() {
+  delete cache_;
+}
+
+Iterator* TableCache::NewIterator(const ReadOptions& options,
+                                  uint64_t file_number,
+                                  Table** tableptr) {
+  if (tableptr != NULL) {
+    *tableptr = NULL;
+  }
+
+  char buf[sizeof(file_number)];
+  EncodeFixed64(buf, file_number);
+  Slice key(buf, sizeof(buf));
+  Cache::Handle* handle = cache_->Lookup(key);
+  if (handle == NULL) {
+    std::string fname = TableFileName(dbname_, file_number);
+    RandomAccessFile* file = NULL;
+    Table* table = NULL;
+    Status s = env_->NewRandomAccessFile(fname, &file);
+    if (s.ok()) {
+      s = Table::Open(*options_, file, &table);
+    }
+
+    if (!s.ok()) {
+      assert(table == NULL);
+      delete file;
+      // We do not cache error results so that if the error is transient,
+      // or somebody repairs the file, we recover automatically.
+      return NewErrorIterator(s);
+    }
+
+    TableAndFile* tf = new TableAndFile;
+    tf->file = file;
+    tf->table = table;
+    handle = cache_->Insert(key, tf, 1, &DeleteEntry);
+  }
+
+  Table* table = reinterpret_cast<TableAndFile*>(cache_->Value(handle))->table;
+  Iterator* result = table->NewIterator(options);
+  result->RegisterCleanup(&UnrefEntry, cache_, handle);
+  if (tableptr != NULL) {
+    *tableptr = table;
+  }
+  return result;
+}
+
+void TableCache::Evict(uint64_t file_number) {
+  char buf[sizeof(file_number)];
+  EncodeFixed64(buf, file_number);
+  cache_->Erase(Slice(buf, sizeof(buf)));
+}
+
+}
--- a/db/table_cache.h
+++ b/db/table_cache.h
@@ -0,0 +1,49 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Thread-safe (provides internal synchronization)
+
+#ifndef STORAGE_LEVELDB_DB_TABLE_CACHE_H_
+#define STORAGE_LEVELDB_DB_TABLE_CACHE_H_
+
+#include <string>
+#include <stdint.h>
+#include "db/dbformat.h"
+#include "include/cache.h"
+#include "include/table.h"
+#include "port/port.h"
+
+namespace leveldb {
+
+class Env;
+
+class TableCache {
+ public:
+  TableCache(const std::string& dbname, const Options* options, int entries);
+  ~TableCache();
+
+  // Get an iterator for the specified file number and return it.  If
+  // "tableptr" is non-NULL, also sets "*tableptr" to point to the
+  // Table object underlying the returned iterator, or NULL if no
+  // Table object underlies the returned iterator.  The returned
+  // "*tableptr" object is owned by the cache and should not be
+  // deleted, and is valid for as long as the returned iterator is
+  // live.
+  Iterator* NewIterator(const ReadOptions& options,
+                        uint64_t file_number,
+                        Table** tableptr = NULL);
+
+  // Evict any entry for the specified file number
+  void Evict(uint64_t file_number);
+
+ private:
+  Env* const env_;
+  const std::string dbname_;
+  const Options* options_;
+  Cache* cache_;
+};
+
+}
+
+#endif  // STORAGE_LEVELDB_DB_TABLE_CACHE_H_
--- a/db/version_edit.cc
+++ b/db/version_edit.cc
@@ -0,0 +1,282 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_edit.h"
+
+#include "db/version_set.h"
+#include "util/coding.h"
+
+namespace leveldb {
+
+// Tag numbers for serialized VersionEdit.  These numbers are written to
+// disk and should not be changed.
+enum Tag {
+  kComparator           = 1,
+  kLogNumber            = 2,
+  kNextFileNumber       = 3,
+  kLastSequence         = 4,
+  kCompactPointer       = 5,
+  kDeletedFile          = 6,
+  kNewFile              = 7,
+  kLargeValueRef        = 8,
+};
+
+void VersionEdit::Clear() {
+  comparator_.clear();
+  log_number_ = 0;
+  last_sequence_ = 0;
+  next_file_number_ = 0;
+  has_comparator_ = false;
+  has_log_number_ = false;
+  has_next_file_number_ = false;
+  has_last_sequence_ = false;
+  deleted_files_.clear();
+  new_files_.clear();
+  large_refs_added_.clear();
+}
+
+void VersionEdit::EncodeTo(std::string* dst) const {
+  if (has_comparator_) {
+    PutVarint32(dst, kComparator);
+    PutLengthPrefixedSlice(dst, comparator_);
+  }
+  if (has_log_number_) {
+    PutVarint32(dst, kLogNumber);
+    PutVarint64(dst, log_number_);
+  }
+  if (has_next_file_number_) {
+    PutVarint32(dst, kNextFileNumber);
+    PutVarint64(dst, next_file_number_);
+  }
+  if (has_last_sequence_) {
+    PutVarint32(dst, kLastSequence);
+    PutVarint64(dst, last_sequence_);
+  }
+
+  for (int i = 0; i < compact_pointers_.size(); i++) {
+    PutVarint32(dst, kCompactPointer);
+    PutVarint32(dst, compact_pointers_[i].first);  // level
+    PutLengthPrefixedSlice(dst, compact_pointers_[i].second.Encode());
+  }
+
+  for (DeletedFileSet::const_iterator iter = deleted_files_.begin();
+       iter != deleted_files_.end();
+       ++iter) {
+    PutVarint32(dst, kDeletedFile);
+    PutVarint32(dst, iter->first);   // level
+    PutVarint64(dst, iter->second);  // file number
+  }
+
+  for (int i = 0; i < new_files_.size(); i++) {
+    const FileMetaData& f = new_files_[i].second;
+    PutVarint32(dst, kNewFile);
+    PutVarint32(dst, new_files_[i].first);  // level
+    PutVarint64(dst, f.number);
+    PutVarint64(dst, f.file_size);
+    PutLengthPrefixedSlice(dst, f.smallest.Encode());
+    PutLengthPrefixedSlice(dst, f.largest.Encode());
+  }
+
+  for (int i = 0; i < large_refs_added_.size(); i++) {
+    const VersionEdit::Large& l = large_refs_added_[i];
+    PutVarint32(dst, kLargeValueRef);
+    PutLengthPrefixedSlice(dst,
+                           Slice(l.large_ref.data, LargeValueRef::ByteSize()));
+    PutVarint64(dst, l.fnum);
+    PutLengthPrefixedSlice(dst, l.internal_key.Encode());
+  }
+}
+
+static bool GetInternalKey(Slice* input, InternalKey* dst) {
+  Slice str;
+  if (GetLengthPrefixedSlice(input, &str)) {
+    dst->DecodeFrom(str);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+static bool GetLevel(Slice* input, int* level) {
+  uint32_t v;
+  if (GetVarint32(input, &v) &&
+      v < config::kNumLevels) {
+    *level = v;
+    return true;
+  } else {
+    return false;
+  }
+}
+
+Status VersionEdit::DecodeFrom(const Slice& src) {
+  Clear();
+  Slice input = src;
+  const char* msg = NULL;
+  uint32_t tag;
+
+  // Temporary storage for parsing
+  int level;
+  uint64_t number;
+  FileMetaData f;
+  Slice str;
+  Large large;
+  InternalKey key;
+
+  while (msg == NULL && GetVarint32(&input, &tag)) {
+    switch (tag) {
+      case kComparator:
+        if (GetLengthPrefixedSlice(&input, &str)) {
+          comparator_ = str.ToString();
+          has_comparator_ = true;
+        } else {
+          msg = "comparator name";
+        }
+        break;
+
+      case kLogNumber:
+        if (GetVarint64(&input, &log_number_)) {
+          has_log_number_ = true;
+        } else {
+          msg = "log number";
+        }
+        break;
+
+      case kNextFileNumber:
+        if (GetVarint64(&input, &next_file_number_)) {
+          has_next_file_number_ = true;
+        } else {
+          msg = "next file number";
+        }
+        break;
+
+      case kLastSequence:
+        if (GetVarint64(&input, &last_sequence_)) {
+          has_last_sequence_ = true;
+        } else {
+          msg = "last sequence number";
+        }
+        break;
+
+      case kCompactPointer:
+        if (GetLevel(&input, &level) &&
+            GetInternalKey(&input, &key)) {
+          compact_pointers_.push_back(std::make_pair(level, key));
+        } else {
+          msg = "compaction pointer";
+        }
+        break;
+
+      case kDeletedFile:
+        if (GetLevel(&input, &level) &&
+            GetVarint64(&input, &number)) {
+          deleted_files_.insert(std::make_pair(level, number));
+        } else {
+          msg = "deleted file";
+        }
+        break;
+
+      case kNewFile:
+        if (GetLevel(&input, &level) &&
+            GetVarint64(&input, &f.number) &&
+            GetVarint64(&input, &f.file_size) &&
+            GetInternalKey(&input, &f.smallest) &&
+            GetInternalKey(&input, &f.largest)) {
+          new_files_.push_back(std::make_pair(level, f));
+        } else {
+          msg = "new-file entry";
+        }
+        break;
+
+      case kLargeValueRef:
+        if (GetLengthPrefixedSlice(&input, &str) &&
+            (str.size() == LargeValueRef::ByteSize()) &&
+            GetVarint64(&input, &large.fnum) &&
+            GetInternalKey(&input, &large.internal_key)) {
+          large.large_ref = LargeValueRef::FromRef(str);
+          large_refs_added_.push_back(large);
+        } else {
+          msg = "large ref";
+        }
+        break;
+
+      default:
+        msg = "unknown tag";
+        break;
+    }
+  }
+
+  if (msg == NULL && !input.empty()) {
+    msg = "invalid tag";
+  }
+
+  Status result;
+  if (msg != NULL) {
+    result = Status::Corruption("VersionEdit", msg);
+  }
+  return result;
+}
+
+std::string VersionEdit::DebugString() const {
+  std::string r;
+  r.append("VersionEdit {");
+  if (has_comparator_) {
+    r.append("\n  Comparator: ");
+    r.append(comparator_);
+  }
+  if (has_log_number_) {
+    r.append("\n  LogNumber: ");
+    AppendNumberTo(&r, log_number_);
+  }
+  if (has_next_file_number_) {
+    r.append("\n  NextFile: ");
+    AppendNumberTo(&r, next_file_number_);
+  }
+  if (has_last_sequence_) {
+    r.append("\n  LastSeq: ");
+    AppendNumberTo(&r, last_sequence_);
+  }
+  for (int i = 0; i < compact_pointers_.size(); i++) {
+    r.append("\n  CompactPointer: ");
+    AppendNumberTo(&r, compact_pointers_[i].first);
+    r.append(" '");
+    AppendEscapedStringTo(&r, compact_pointers_[i].second.Encode());
+    r.append("'");
+  }
+  for (DeletedFileSet::const_iterator iter = deleted_files_.begin();
+       iter != deleted_files_.end();
+       ++iter) {
+    r.append("\n  DeleteFile: ");
+    AppendNumberTo(&r, iter->first);
+    r.append(" ");
+    AppendNumberTo(&r, iter->second);
+  }
+  for (int i = 0; i < new_files_.size(); i++) {
+    const FileMetaData& f = new_files_[i].second;
+    r.append("\n  AddFile: ");
+    AppendNumberTo(&r, new_files_[i].first);
+    r.append(" ");
+    AppendNumberTo(&r, f.number);
+    r.append(" ");
+    AppendNumberTo(&r, f.file_size);
+    r.append(" '");
+    AppendEscapedStringTo(&r, f.smallest.Encode());
+    r.append("' .. '");
+    AppendEscapedStringTo(&r, f.largest.Encode());
+    r.append("'");
+  }
+  for (int i = 0; i < large_refs_added_.size(); i++) {
+    const VersionEdit::Large& l = large_refs_added_[i];
+    r.append("\n  LargeRef: ");
+    AppendNumberTo(&r, l.fnum);
+    r.append(" ");
+    r.append(LargeValueRefToFilenameString(l.large_ref));
+    r.append(" '");
+    AppendEscapedStringTo(&r, l.internal_key.Encode());
+    r.append("'");
+  }
+  r.append("\n}\n");
+  return r;
+}
+
+}
--- a/db/version_edit.h
+++ b/db/version_edit.h
@@ -0,0 +1,118 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_VERSION_EDIT_H_
+#define STORAGE_LEVELDB_DB_VERSION_EDIT_H_
+
+#include <set>
+#include <utility>
+#include <vector>
+#include "db/dbformat.h"
+
+namespace leveldb {
+
+class VersionSet;
+
+struct FileMetaData {
+  int refs;
+  uint64_t number;
+  uint64_t file_size;         // File size in bytes
+  InternalKey smallest;       // Smallest internal key served by table
+  InternalKey largest;        // Largest internal key served by table
+
+  FileMetaData() : refs(0), file_size(0) { }
+};
+
+class VersionEdit {
+ public:
+  VersionEdit() { Clear(); }
+  ~VersionEdit() { }
+
+  void Clear();
+
+  void SetComparatorName(const Slice& name) {
+    has_comparator_ = true;
+    comparator_ = name.ToString();
+  }
+  void SetLogNumber(uint64_t num) {
+    has_log_number_ = true;
+    log_number_ = num;
+  }
+  void SetNextFile(uint64_t num) {
+    has_next_file_number_ = true;
+    next_file_number_ = num;
+  }
+  void SetLastSequence(SequenceNumber seq) {
+    has_last_sequence_ = true;
+    last_sequence_ = seq;
+  }
+  void SetCompactPointer(int level, const InternalKey& key) {
+    compact_pointers_.push_back(std::make_pair(level, key));
+  }
+
+  // Add the specified file at the specified number.
+  // REQUIRES: This version has not been saved (see VersionSet::SaveTo)
+  // REQUIRES: "smallest" and "largest" are smallest and largest keys in file
+  void AddFile(int level, uint64_t file,
+               uint64_t file_size,
+               const InternalKey& smallest,
+               const InternalKey& largest) {
+    FileMetaData f;
+    f.number = file;
+    f.file_size = file_size;
+    f.smallest = smallest;
+    f.largest = largest;
+    new_files_.push_back(std::make_pair(level, f));
+  }
+
+  // Delete the specified "file" from the specified "level".
+  void DeleteFile(int level, uint64_t file) {
+    deleted_files_.insert(std::make_pair(level, file));
+  }
+
+  // Record that a large value with the specified large_ref was
+  // written to the output file numbered "fnum"
+  void AddLargeValueRef(const LargeValueRef& large_ref,
+                        uint64_t fnum,
+                        const Slice& internal_key) {
+    large_refs_added_.resize(large_refs_added_.size() + 1);
+    Large* large = &(large_refs_added_.back());
+    large->large_ref = large_ref;
+    large->fnum = fnum;
+    large->internal_key.DecodeFrom(internal_key);
+  }
+
+  void EncodeTo(std::string* dst) const;
+  Status DecodeFrom(const Slice& src);
+
+  std::string DebugString() const;
+
+ private:
+  friend class VersionSet;
+
+  typedef std::set< std::pair<int, uint64_t> > DeletedFileSet;
+
+  std::string comparator_;
+  uint64_t log_number_;
+  uint64_t next_file_number_;
+  SequenceNumber last_sequence_;
+  bool has_comparator_;
+  bool has_log_number_;
+  bool has_next_file_number_;
+  bool has_last_sequence_;
+
+  std::vector< std::pair<int, InternalKey> > compact_pointers_;
+  DeletedFileSet deleted_files_;
+  std::vector< std::pair<int, FileMetaData> > new_files_;
+  struct Large {
+    LargeValueRef large_ref;
+    uint64_t fnum;
+    InternalKey internal_key;
+  };
+  std::vector<Large> large_refs_added_;
+};
+
+}
+
+#endif  // STORAGE_LEVELDB_DB_VERSION_EDIT_H_
--- a/db/version_edit_test.cc
+++ b/db/version_edit_test.cc
@@ -0,0 +1,50 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_edit.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+static void TestEncodeDecode(const VersionEdit& edit) {
+  std::string encoded, encoded2;
+  edit.EncodeTo(&encoded);
+  VersionEdit parsed;
+  Status s = parsed.DecodeFrom(encoded);
+  ASSERT_TRUE(s.ok()) << s.ToString();
+  parsed.EncodeTo(&encoded2);
+  ASSERT_EQ(encoded, encoded2);
+}
+
+class VersionEditTest { };
+
+TEST(VersionEditTest, EncodeDecode) {
+  static const uint64_t kBig = 1ull << 50;
+
+  VersionEdit edit;
+  for (int i = 0; i < 4; i++) {
+    TestEncodeDecode(edit);
+    edit.AddFile(3, kBig + 300 + i, kBig + 400 + i,
+                 InternalKey("foo", kBig + 500 + i, kTypeLargeValueRef),
+                 InternalKey("zoo", kBig + 600 + i, kTypeDeletion));
+    edit.DeleteFile(4, kBig + 700 + i);
+    edit.AddLargeValueRef(LargeValueRef::Make("big", kNoCompression),
+                          kBig + 800 + i, "foobar");
+    edit.AddLargeValueRef(LargeValueRef::Make("big2", kLightweightCompression),
+                          kBig + 801 + i, "baz");
+    edit.SetCompactPointer(i, InternalKey("x", kBig + 900 + i, kTypeValue));
+  }
+
+  edit.SetComparatorName("foo");
+  edit.SetLogNumber(kBig + 100);
+  edit.SetNextFile(kBig + 200);
+  edit.SetLastSequence(kBig + 1000);
+  TestEncodeDecode(edit);
+}
+
+}
+
+int main(int argc, char** argv) {
+  return leveldb::test::RunAllTests();
+}
--- a/db/version_set.cc
+++ b/db/version_set.cc
--- a/db/version_set.h
+++ b/db/version_set.h
@@ -0,0 +1,290 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// The representation of a DBImpl consists of a set of Versions.  The
+// newest version is called "current".  Older versions may be kept
+// around to provide a consistent view to live iterators.
+//
+// Each Version keeps track of a set of Table files per level.  The
+// entire set of versions is maintained in a VersionSet.
+//
+// Version,VersionSet are thread-compatible, but require external
+// synchronization on all accesses.
+
+#ifndef STORAGE_LEVELDB_DB_VERSION_SET_H_
+#define STORAGE_LEVELDB_DB_VERSION_SET_H_
+
+#include <map>
+#include <set>
+#include <vector>
+#include "db/dbformat.h"
+#include "db/version_edit.h"
+#include "port/port.h"
+
+namespace leveldb {
+
+// Grouping of constants.  We may want to make some of these
+// parameters set via options.
+namespace config {
+static const int kNumLevels = 7;
+}
+
+namespace log { class Writer; }
+
+class Compaction;
+class Iterator;
+class MemTable;
+class TableBuilder;
+class TableCache;
+class Version;
+class VersionSet;
+class WritableFile;
+
+class Version {
+ public:
+  // Append to *iters a sequence of iterators that will
+  // yield the contents of this Version when merged together.
+  // REQUIRES: This version has been saved (see VersionSet::SaveTo)
+  void AddIterators(const ReadOptions&, std::vector<Iterator*>* iters);
+
+  // Reference count management (so Versions do not disappear out from
+  // under live iterators)
+  void Ref();
+  void Unref();
+
+  // Return a human readable string that describes this version's contents.
+  std::string DebugString() const;
+
+ private:
+  friend class Compaction;
+  friend class VersionSet;
+
+  class LevelFileNumIterator;
+  Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const;
+
+  VersionSet* vset_;            // VersionSet to which this Version belongs
+  Version* next_;               // Next version in linked list
+  int refs_;                    // Number of live refs to this version
+  MemTable* cleanup_mem_;       // NULL, or table to delete when version dropped
+
+  // List of files per level
+  std::vector<FileMetaData*> files_[config::kNumLevels];
+
+  // Level that should be compacted next and its compaction score.
+  // Score < 1 means compaction is not strictly needed.  These fields
+  // are initialized by Finalize().
+  double compaction_score_;
+  int compaction_level_;
+
+  explicit Version(VersionSet* vset)
+      : vset_(vset), next_(NULL), refs_(0),
+        cleanup_mem_(NULL),
+        compaction_score_(-1),
+        compaction_level_(-1) {
+  }
+
+  ~Version();
+
+  // No copying allowed
+  Version(const Version&);
+  void operator=(const Version&);
+};
+
+class VersionSet {
+ public:
+  VersionSet(const std::string& dbname,
+             const Options* options,
+             TableCache* table_cache,
+             const InternalKeyComparator*);
+  ~VersionSet();
+
+  // Apply *edit to the current version to form a new descriptor that
+  // is both saved to persistent state and installed as the new
+  // current version.  Iff Apply() returns OK, arrange to delete
+  // cleanup_mem (if cleanup_mem != NULL) when it is no longer needed
+  // by older versions.
+  Status LogAndApply(VersionEdit* edit, MemTable* cleanup_mem);
+
+  // Recover the last saved descriptor from persistent storage.
+  Status Recover(uint64_t* log_number, SequenceNumber* last_sequence);
+
+  // Save current contents to *log
+  Status WriteSnapshot(log::Writer* log);
+
+  // Return the current version.
+  Version* current() const { return current_; }
+
+  // Return the current manifest file number
+  uint64_t ManifestFileNumber() const { return manifest_file_number_; }
+
+  // Allocate and return a new file number
+  uint64_t NewFileNumber() { return next_file_number_++; }
+
+  // Return the number of Table files at the specified level.
+  int NumLevelFiles(int level) const;
+
+  // Pick level and inputs for a new compaction.
+  // Returns NULL if there is no compaction to be done.
+  // Otherwise returns a pointer to a heap-allocated object that
+  // describes the compaction.  Caller should delete the result.
+  Compaction* PickCompaction();
+
+  // Return a compaction object for compacting the range [begin,end] in
+  // the specified level.  Returns NULL if there is nothing in that
+  // level that overlaps the specified range.  Caller should delete
+  // the result.
+  Compaction* CompactRange(
+      int level,
+      const InternalKey& begin,
+      const InternalKey& end);
+
+  // Create an iterator that reads over the compaction inputs for "*c".
+  // The caller should delete the iterator when no longer needed.
+  Iterator* MakeInputIterator(Compaction* c);
+
+  // Returns true iff some level needs a compaction.
+  bool NeedsCompaction() const { return current_->compaction_score_ >= 1; }
+
+  // Add all files listed in any live version to *live.
+  // May also mutate some internal state.
+  void AddLiveFiles(std::set<uint64_t>* live);
+
+  // Return the approximate offset in the database of the data for
+  // "key" as of version "v".
+  uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key);
+
+  // Register a reference to a large value with the specified
+  // large_ref from the specified file number.  Returns "true" if this
+  // is the first recorded reference to the "large_ref" value in the
+  // database, and false otherwise.
+  bool RegisterLargeValueRef(const LargeValueRef& large_ref,
+                             uint64_t filenum,
+                             const InternalKey& internal_key);
+
+  // Cleanup the large value reference state by eliminating any
+  // references from files that are not includes in either "live_tables"
+  // or "log_file".
+  void CleanupLargeValueRefs(const std::set<uint64_t>& live_tables,
+                             uint64_t log_file_num);
+
+  // Returns true if a large value with the given reference is live.
+  bool LargeValueIsLive(const LargeValueRef& large_ref);
+
+ private:
+  class Builder;
+
+  friend class Compaction;
+  friend class Version;
+
+  Status Finalize(Version* v);
+
+  // Delete any old versions that are no longer needed.
+  void MaybeDeleteOldVersions();
+
+  struct BySmallestKey;
+  Status SortLevel(Version* v, uint64_t level);
+
+  void GetOverlappingInputs(
+      int level,
+      const InternalKey& begin,
+      const InternalKey& end,
+      std::vector<FileMetaData*>* inputs);
+
+  void GetRange(const std::vector<FileMetaData*>& inputs,
+                InternalKey* smallest,
+                InternalKey* largest);
+
+  Env* const env_;
+  const std::string dbname_;
+  const Options* const options_;
+  TableCache* const table_cache_;
+  const InternalKeyComparator icmp_;
+  uint64_t next_file_number_;
+  uint64_t manifest_file_number_;
+
+  // Opened lazily
+  WritableFile* descriptor_file_;
+  log::Writer* descriptor_log_;
+
+  // Versions are kept in a singly linked list that is never empty
+  Version* current_;    // Pointer to the last (newest) list entry
+  Version* oldest_;     // Pointer to the first (oldest) list entry
+
+  // Map from large value reference to the set of <file numbers,internal_key>
+  // values containing references to the value.  We keep the
+  // internal key as a std::string rather than as an InternalKey because
+  // we want to be able to easily use a set.
+  typedef std::set<std::pair<uint64_t, std::string> > LargeReferencesSet;
+  typedef std::map<LargeValueRef, LargeReferencesSet> LargeValueMap;
+  LargeValueMap large_value_refs_;
+
+  // Per-level key at which the next compaction at that level should start.
+  // Either an empty string, or a valid InternalKey.
+  std::string compact_pointer_[config::kNumLevels];
+
+  // No copying allowed
+  VersionSet(const VersionSet&);
+  void operator=(const VersionSet&);
+};
+
+// A Compaction encapsulates information about a compaction.
+class Compaction {
+ public:
+  ~Compaction();
+
+  // Return the level that is being compacted.  Inputs from "level"
+  // and "level+1" will be merged to produce a set of "level+1" files.
+  int level() const { return level_; }
+
+  // Return the object that holds the edits to the descriptor done
+  // by this compaction.
+  VersionEdit* edit() { return &edit_; }
+
+  // "which" must be either 0 or 1
+  int num_input_files(int which) const { return inputs_[which].size(); }
+
+  // Return the ith input file at "level()+which" ("which" must be 0 or 1).
+  FileMetaData* input(int which, int i) const { return inputs_[which][i]; }
+
+  // Maximum size of files to build during this compaction.
+  uint64_t MaxOutputFileSize() const { return max_output_file_size_; }
+
+  // Add all inputs to this compaction as delete operations to *edit.
+  void AddInputDeletions(VersionEdit* edit);
+
+  // Returns true if the information we have available guarantees that
+  // the compaction is producing data in "level+1" for which no data exists
+  // in levels greater than "level+1".
+  bool IsBaseLevelForKey(const Slice& user_key);
+
+  // Release the input version for the compaction, once the compaction
+  // is successful.
+  void ReleaseInputs();
+
+ private:
+  friend class Version;
+  friend class VersionSet;
+
+  explicit Compaction(int level);
+
+  int level_;
+  uint64_t max_output_file_size_;
+  Version* input_version_;
+  VersionEdit edit_;
+
+  // Each compaction reads inputs from "level_" and "level_+1"
+  std::vector<FileMetaData*> inputs_[2];      // The two sets of inputs
+
+  // State for implementing IsBaseLevelForKey
+
+  // level_ptrs_ holds indices into input_version_->levels_: our state
+  // is that we are positioned at one of the file ranges for each
+  // higher level than the ones involved in this compaction (i.e. for
+  // all L >= level_ + 2).
+  int level_ptrs_[config::kNumLevels];
+};
+
+}
+
+#endif  // STORAGE_LEVELDB_DB_VERSION_SET_H_
--- a/db/write_batch.cc
+++ b/db/write_batch.cc
@@ -0,0 +1,164 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// WriteBatch::rep_ :=
+//    sequence: fixed64
+//    count: fixed32
+//    data: record[count]
+// record :=
+//    kTypeValue varstring varstring         |
+//    kTypeLargeValueRef varstring varstring |
+//    kTypeDeletion varstring
+// varstring :=
+//    len: varint32
+//    data: uint8[len]
+
+#include "include/write_batch.h"
+
+#include "include/db.h"
+#include "db/dbformat.h"
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "util/coding.h"
+
+namespace leveldb {
+
+WriteBatch::WriteBatch() {
+  Clear();
+}
+
+WriteBatch::~WriteBatch() { }
+
+void WriteBatch::Clear() {
+  rep_.clear();
+  rep_.resize(12);
+}
+
+int WriteBatchInternal::Count(const WriteBatch* b) {
+  return DecodeFixed32(b->rep_.data() + 8);
+}
+
+void WriteBatchInternal::SetCount(WriteBatch* b, int n) {
+  EncodeFixed32(&b->rep_[8], n);
+}
+
+SequenceNumber WriteBatchInternal::Sequence(const WriteBatch* b) {
+  return SequenceNumber(DecodeFixed64(b->rep_.data()));
+}
+
+void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) {
+  EncodeFixed64(&b->rep_[0], seq);
+}
+
+void WriteBatch::Put(const Slice& key, const Slice& value) {
+  WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
+  rep_.push_back(static_cast<char>(kTypeValue));
+  PutLengthPrefixedSlice(&rep_, key);
+  PutLengthPrefixedSlice(&rep_, value);
+}
+
+void WriteBatchInternal::PutLargeValueRef(WriteBatch* b,
+                                          const Slice& key,
+                                          const LargeValueRef& large_ref) {
+  WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+  b->rep_.push_back(static_cast<char>(kTypeLargeValueRef));
+  PutLengthPrefixedSlice(&b->rep_, key);
+  PutLengthPrefixedSlice(&b->rep_,
+                         Slice(large_ref.data, sizeof(large_ref.data)));
+}
+
+void WriteBatch::Delete(const Slice& key) {
+  WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
+  rep_.push_back(static_cast<char>(kTypeDeletion));
+  PutLengthPrefixedSlice(&rep_, key);
+}
+
+Status WriteBatchInternal::InsertInto(const WriteBatch* b,
+                                      MemTable* memtable) {
+  const int count = WriteBatchInternal::Count(b);
+  int found = 0;
+  Iterator it(*b);
+  for (; !it.Done(); it.Next()) {
+    switch (it.op()) {
+      case kTypeDeletion:
+        memtable->Add(it.sequence_number(), kTypeDeletion, it.key(), Slice());
+        break;
+      case kTypeValue:
+        memtable->Add(it.sequence_number(), kTypeValue, it.key(), it.value());
+        break;
+      case kTypeLargeValueRef:
+        memtable->Add(it.sequence_number(), kTypeLargeValueRef,
+                      it.key(), it.value());
+        break;
+    }
+    found++;
+  }
+  if (!it.status().ok()) {
+    return it.status();
+  } else if (found != count) {
+    return Status::Corruption("wrong count in WriteBatch");
+  }
+  return Status::OK();
+}
+
+void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) {
+  assert(contents.size() >= 12);
+  b->rep_.assign(contents.data(), contents.size());
+}
+
+WriteBatchInternal::Iterator::Iterator(const WriteBatch& batch)
+    : input_(WriteBatchInternal::Contents(&batch)),
+      done_(false) {
+  if (input_.size() < 12) {
+    done_ = true;
+  } else {
+    seq_ = WriteBatchInternal::Sequence(&batch),
+    input_.remove_prefix(12);
+    GetNextEntry();
+  }
+}
+
+void WriteBatchInternal::Iterator::Next() {
+  assert(!done_);
+  seq_++;
+  GetNextEntry();
+}
+
+void WriteBatchInternal::Iterator::GetNextEntry() {
+  if (input_.empty()) {
+    done_ = true;
+    return;
+  }
+  char tag = input_[0];
+  input_.remove_prefix(1);
+  switch (tag) {
+    case kTypeValue:
+    case kTypeLargeValueRef:
+      if (GetLengthPrefixedSlice(&input_, &key_) &&
+          GetLengthPrefixedSlice(&input_, &value_)) {
+        op_ = static_cast<ValueType>(tag);
+      } else {
+        status_ = Status::Corruption("bad WriteBatch Put");
+        done_ = true;
+        input_.clear();
+      }
+      break;
+    case kTypeDeletion:
+      if (GetLengthPrefixedSlice(&input_, &key_)) {
+        op_ = kTypeDeletion;
+      } else {
+        status_ = Status::Corruption("bad WriteBatch Delete");
+        done_ = true;
+        input_.clear();
+      }
+      break;
+    default:
+      status_ = Status::Corruption("unknown WriteBatch tag");
+      done_ = true;
+      input_.clear();
+      break;
+  }
+}
+
+}
--- a/db/write_batch_internal.h
+++ b/db/write_batch_internal.h
@@ -0,0 +1,73 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
+#define STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
+
+#include "include/write_batch.h"
+
+namespace leveldb {
+
+// WriteBatchInternal provides static methods for manipulating a
+// WriteBatch that we don't want in the public WriteBatch interface.
+class WriteBatchInternal {
+ public:
+  static void PutLargeValueRef(WriteBatch* batch,
+                               const Slice& key,
+                               const LargeValueRef& large_ref);
+
+  // Return the number of entries in the batch.
+  static int Count(const WriteBatch* batch);
+
+  // Set the count for the number of entries in the batch.
+  static void SetCount(WriteBatch* batch, int n);
+
+  // Return the seqeunce number for the start of this batch.
+  static SequenceNumber Sequence(const WriteBatch* batch);
+
+  // Store the specified number as the seqeunce number for the start of
+  // this batch.
+  static void SetSequence(WriteBatch* batch, SequenceNumber seq);
+
+  static Slice Contents(const WriteBatch* batch) {
+    return Slice(batch->rep_);
+  }
+
+  static size_t ByteSize(const WriteBatch* batch) {
+    return batch->rep_.size();
+  }
+
+  static void SetContents(WriteBatch* batch, const Slice& contents);
+
+  static Status InsertInto(const WriteBatch* batch, MemTable* memtable);
+
+  // Iterate over the contents of a write batch.
+  class Iterator {
+   public:
+    explicit Iterator(const WriteBatch& batch);
+    bool Done() const { return done_; }
+    void Next();
+    ValueType op() const { return op_; }
+    const Slice& key() const { return key_; }
+    const Slice& value() const { return value_; }
+    SequenceNumber sequence_number() const { return seq_; }
+    Status status() const { return status_; }
+
+   private:
+    void GetNextEntry();
+
+    Slice input_;
+    bool done_;
+    ValueType op_;
+    Slice key_;
+    Slice value_;
+    SequenceNumber seq_;
+    Status status_;
+  };
+};
+
+}
+
+
+#endif  // STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
--- a/db/write_batch_test.cc
+++ b/db/write_batch_test.cc
@@ -0,0 +1,110 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "include/db.h"
+
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "include/env.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+static std::string PrintContents(WriteBatch* b) {
+  InternalKeyComparator cmp(BytewiseComparator());
+  MemTable mem(cmp);
+  std::string state;
+  Status s = WriteBatchInternal::InsertInto(b, &mem);
+  Iterator* iter = mem.NewIterator();
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ParsedInternalKey ikey;
+    ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey));
+    switch (ikey.type) {
+      case kTypeValue:
+        state.append("Put(");
+        state.append(ikey.user_key.ToString());
+        state.append(", ");
+        state.append(iter->value().ToString());
+        state.append(")");
+        break;
+      case kTypeLargeValueRef:
+        state.append("PutRef(");
+        state.append(ikey.user_key.ToString());
+        state.append(", ");
+        state.append(iter->value().ToString());
+        state.append(")");
+        break;
+      case kTypeDeletion:
+        state.append("Delete(");
+        state.append(ikey.user_key.ToString());
+        state.append(")");
+        break;
+    }
+    state.append("@");
+    state.append(NumberToString(ikey.sequence));
+  }
+  delete iter;
+  if (!s.ok()) {
+    state.append("ParseError()");
+  }
+  return state;
+}
+
+class WriteBatchTest { };
+
+TEST(WriteBatchTest, Empty) {
+  WriteBatch batch;
+  ASSERT_EQ("", PrintContents(&batch));
+  ASSERT_EQ(0, WriteBatchInternal::Count(&batch));
+}
+
+TEST(WriteBatchTest, Multiple) {
+  WriteBatch batch;
+  batch.Put(Slice("foo"), Slice("bar"));
+  batch.Delete(Slice("box"));
+  batch.Put(Slice("baz"), Slice("boo"));
+  WriteBatchInternal::SetSequence(&batch, 100);
+  ASSERT_EQ(100, WriteBatchInternal::Sequence(&batch));
+  ASSERT_EQ(3, WriteBatchInternal::Count(&batch));
+  ASSERT_EQ("Put(baz, boo)@102"
+            "Delete(box)@101"
+            "Put(foo, bar)@100",
+            PrintContents(&batch));
+}
+
+TEST(WriteBatchTest, PutIndirect) {
+  WriteBatch batch;
+  batch.Put(Slice("baz"), Slice("boo"));
+  LargeValueRef h;
+  for (int i = 0; i < LargeValueRef::ByteSize(); i++) {
+    h.data[i] = (i < 20) ? 'a' : 'b';
+  }
+  WriteBatchInternal::PutLargeValueRef(&batch, Slice("foo"), h);
+  WriteBatchInternal::SetSequence(&batch, 100);
+  ASSERT_EQ(100, WriteBatchInternal::Sequence(&batch));
+  ASSERT_EQ(2, WriteBatchInternal::Count(&batch));
+  ASSERT_EQ("Put(baz, boo)@100"
+            "PutRef(foo, aaaaaaaaaaaaaaaaaaaabbbbbbbbb)@101",
+            PrintContents(&batch));
+}
+
+TEST(WriteBatchTest, Corruption) {
+  WriteBatch batch;
+  batch.Put(Slice("foo"), Slice("bar"));
+  batch.Delete(Slice("box"));
+  WriteBatchInternal::SetSequence(&batch, 200);
+  Slice contents = WriteBatchInternal::Contents(&batch);
+  WriteBatchInternal::SetContents(&batch,
+                                  Slice(contents.data(),contents.size()-1));
+  ASSERT_EQ("Put(foo, bar)@200"
+            "ParseError()",
+            PrintContents(&batch));
+}
+
+}
+
+int main(int argc, char** argv) {
+  return leveldb::test::RunAllTests();
+}