Upgrade LevelDB from 1.12 to 1.14:

* Fix issues 77, 87, 182, 190, 200, and 201 * Fix bug described in https://groups.google.com/d/msg/leveldb/yL6h1mAOc20/vLU64RylIdMJ * Fix a bug where options.max_open_files was not getting clamped properly. * Fix link to bigtable paper in docs. * New sstables will have the file extension .ldb. .sst files will continue to be recognized.
2025-12-06 17:27:52 +00:00 · 2013-08-21 11:12:47 -07:00
parent 474b824902
commit 97346c6618
23 changed files with 498 additions and 78 deletions
--- a/db/autocompact_test.cc
+++ b/db/autocompact_test.cc
@@ -0,0 +1,118 @@
+// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "leveldb/db.h"
+#include "db/db_impl.h"
+#include "leveldb/cache.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace leveldb {
+
+class AutoCompactTest {
+ public:
+  std::string dbname_;
+  Cache* tiny_cache_;
+  Options options_;
+  DB* db_;
+
+  AutoCompactTest() {
+    dbname_ = test::TmpDir() + "/autocompact_test";
+    tiny_cache_ = NewLRUCache(100);
+    options_.block_cache = tiny_cache_;
+    DestroyDB(dbname_, options_);
+    options_.create_if_missing = true;
+    options_.compression = kNoCompression;
+    ASSERT_OK(DB::Open(options_, dbname_, &db_));
+  }
+
+  ~AutoCompactTest() {
+    delete db_;
+    DestroyDB(dbname_, Options());
+    delete tiny_cache_;
+  }
+
+  std::string Key(int i) {
+    char buf[100];
+    snprintf(buf, sizeof(buf), "key%06d", i);
+    return std::string(buf);
+  }
+
+  uint64_t Size(const Slice& start, const Slice& limit) {
+    Range r(start, limit);
+    uint64_t size;
+    db_->GetApproximateSizes(&r, 1, &size);
+    return size;
+  }
+
+  void DoReads(int n);
+};
+
+static const int kValueSize = 200 * 1024;
+static const int kTotalSize = 100 * 1024 * 1024;
+static const int kCount = kTotalSize / kValueSize;
+
+// Read through the first n keys repeatedly and check that they get
+// compacted (verified by checking the size of the key space).
+void AutoCompactTest::DoReads(int n) {
+  std::string value(kValueSize, 'x');
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+
+  // Fill database
+  for (int i = 0; i < kCount; i++) {
+    ASSERT_OK(db_->Put(WriteOptions(), Key(i), value));
+  }
+  ASSERT_OK(dbi->TEST_CompactMemTable());
+
+  // Delete everything
+  for (int i = 0; i < kCount; i++) {
+    ASSERT_OK(db_->Delete(WriteOptions(), Key(i)));
+  }
+  ASSERT_OK(dbi->TEST_CompactMemTable());
+
+  // Get initial measurement of the space we will be reading.
+  const int64_t initial_size = Size(Key(0), Key(n));
+  const int64_t initial_other_size = Size(Key(n), Key(kCount));
+
+  // Read until size drops significantly.
+  std::string limit_key = Key(n);
+  for (int read = 0; true; read++) {
+    ASSERT_LT(read, 100) << "Taking too long to compact";
+    Iterator* iter = db_->NewIterator(ReadOptions());
+    for (iter->SeekToFirst();
+         iter->Valid() && iter->key().ToString() < limit_key;
+         iter->Next()) {
+      // Drop data
+    }
+    delete iter;
+    // Wait a little bit to allow any triggered compactions to complete.
+    Env::Default()->SleepForMicroseconds(1000000);
+    uint64_t size = Size(Key(0), Key(n));
+    fprintf(stderr, "iter %3d => %7.3f MB [other %7.3f MB]\n",
+            read+1, size/1048576.0, Size(Key(n), Key(kCount))/1048576.0);
+    if (size <= initial_size/10) {
+      break;
+    }
+  }
+
+  // Verify that the size of the key space not touched by the reads
+  // is pretty much unchanged.
+  const int64_t final_other_size = Size(Key(n), Key(kCount));
+  ASSERT_LE(final_other_size, initial_other_size + 1048576);
+  ASSERT_GE(final_other_size, initial_other_size/5 - 1048576);
+}
+
+TEST(AutoCompactTest, ReadAll) {
+  DoReads(kCount);
+}
+
+TEST(AutoCompactTest, ReadHalf) {
+  DoReads(kCount/2);
+}
+
+}  // namespace leveldb
+
+int main(int argc, char** argv) {
+  return leveldb::test::RunAllTests();
+}
--- a/issues/issue200_test.cc
+++ b/issues/issue200_test.cc
@@ -0,0 +1,59 @@
+// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// Test for issue 200: when iterator switches direction from backward
+// to forward, the current key can be yielded unexpectedly if a new
+// mutation has been added just before the current key.
+
+#include "leveldb/db.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+class Issue200 { };
+
+TEST(Issue200, Test) {
+  // Get rid of any state from an old run.
+  std::string dbpath = test::TmpDir() + "/leveldb_issue200_test";
+  DestroyDB(dbpath, Options());
+
+  DB *db;
+  Options options;
+  options.create_if_missing = true;
+  ASSERT_OK(DB::Open(options, dbpath, &db));
+
+  WriteOptions write_options;
+  ASSERT_OK(db->Put(write_options, "1", "b"));
+  ASSERT_OK(db->Put(write_options, "2", "c"));
+  ASSERT_OK(db->Put(write_options, "3", "d"));
+  ASSERT_OK(db->Put(write_options, "4", "e"));
+  ASSERT_OK(db->Put(write_options, "5", "f"));
+
+  ReadOptions read_options;
+  Iterator *iter = db->NewIterator(read_options);
+
+  // Add an element that should not be reflected in the iterator.
+  ASSERT_OK(db->Put(write_options, "25", "cd"));
+
+  iter->Seek("5");
+  ASSERT_EQ(iter->key().ToString(), "5");
+  iter->Prev();
+  ASSERT_EQ(iter->key().ToString(), "4");
+  iter->Prev();
+  ASSERT_EQ(iter->key().ToString(), "3");
+  iter->Next();
+  ASSERT_EQ(iter->key().ToString(), "4");
+  iter->Next();
+  ASSERT_EQ(iter->key().ToString(), "5");
+
+  delete iter;
+  delete db;
+  DestroyDB(dbpath, options);
+}
+
+}  // namespace leveldb
+
+int main(int argc, char** argv) {
+  return leveldb::test::RunAllTests();
+}
--- a/src/hyperleveldb/doc/impl.html
+++ b/src/hyperleveldb/doc/impl.html
@@ -11,7 +11,7 @@

 The implementation of leveldb is similar in spirit to the
 representation of a single
-<a href="http://labs.google.com/papers/bigtable.html">
+<a href="http://research.google.com/archive/bigtable.html">
 Bigtable tablet (section 5.3)</a>.
 However the organization of the files that make up the representation
 is somewhat different and is explained below.
--- a/src/leveldb/AUTHORS
+++ b/src/leveldb/AUTHORS
@@ -9,3 +9,4 @@ Sanjay Ghemawat <sanjay@google.com>

 # Partial list of contributors:
 Kevin Regan <kevin.d.regan@gmail.com>
+Johan Bilien <jobi@litl.com>
--- a/src/leveldb/Makefile
+++ b/src/leveldb/Makefile
@@ -31,6 +31,7 @@ TESTHARNESS = ./util/testharness.o $(TESTUTIL)

 TESTS = \
 	arena_test \
+	autocompact_test \
 	bloom_test \
 	c_test \
 	cache_test \
@@ -43,6 +44,7 @@ TESTS = \
 	filename_test \
 	filter_block_test \
 	issue178_test \
+	issue200_test \
 	log_test \
 	memenv_test \
 	skiplist_test \
@@ -70,7 +72,7 @@ SHARED = $(SHARED1)
 else
 # Update db.h if you change these.
 SHARED_MAJOR = 1
-SHARED_MINOR = 12
+SHARED_MINOR = 14
 SHARED1 = libleveldb.$(PLATFORM_SHARED_EXT)
 SHARED2 = $(SHARED1).$(SHARED_MAJOR)
 SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR)
@@ -114,6 +116,9 @@ leveldbutil: db/leveldb_main.o $(LIBOBJECTS)
 arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) $(LDFLAGS) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)

+autocompact_test: db/autocompact_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) $(LDFLAGS) db/autocompact_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
+
 bloom_test: util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) $(LDFLAGS) util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)

@@ -150,6 +155,9 @@ filter_block_test: table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS)
 issue178_test: issues/issue178_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) $(LDFLAGS) issues/issue178_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)

+issue200_test: issues/issue200_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) $(LDFLAGS) issues/issue200_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
+
 log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) $(LDFLAGS) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)

@@ -187,14 +195,14 @@ IOSVERSION=$(shell defaults read $(PLATFORMSROOT)/iPhoneOS.platform/version CFBu
 	mkdir -p ios-x86/$(dir $@)
 	$(CXX) $(CXXFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@
 	mkdir -p ios-arm/$(dir $@)
-	$(DEVICEROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@
+	xcrun -sdk iphoneos $(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@
 	lipo ios-x86/$@ ios-arm/$@ -create -output $@

 .c.o:
 	mkdir -p ios-x86/$(dir $@)
 	$(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@
 	mkdir -p ios-arm/$(dir $@)
-	$(DEVICEROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@
+	xcrun -sdk iphoneos $(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@
 	lipo ios-x86/$@ ios-arm/$@ -create -output $@

 else
--- a/src/leveldb/db/corruption_test.cc
+++ b/src/leveldb/db/corruption_test.cc
@@ -35,6 +35,7 @@ class CorruptionTest {
  CorruptionTest() {
    tiny_cache_ = NewLRUCache(100);
    options_.env = &env_;
+    options_.block_cache = tiny_cache_;
    dbname_ = test::TmpDir() + "/db_test";
    DestroyDB(dbname_, options_);

@@ -50,17 +51,14 @@ class CorruptionTest {
     delete tiny_cache_;
  }

-  Status TryReopen(Options* options = NULL) {
+  Status TryReopen() {
    delete db_;
    db_ = NULL;
-    Options opt = (options ? *options : options_);
-    opt.env = &env_;
-    opt.block_cache = tiny_cache_;
-    return DB::Open(opt, dbname_, &db_);
+    return DB::Open(options_, dbname_, &db_);
  }

-  void Reopen(Options* options = NULL) {
-    ASSERT_OK(TryReopen(options));
+  void Reopen() {
+    ASSERT_OK(TryReopen());
  }

  void RepairDB() {
@@ -92,6 +90,10 @@ class CorruptionTest {
    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
      uint64_t key;
      Slice in(iter->key());
+      if (in == "" || in == "~") {
+        // Ignore boundary keys.
+        continue;
+      }
      if (!ConsumeDecimalNumber(&in, &key) ||
          !in.empty() ||
          key < next_expected) {
@@ -233,7 +235,7 @@ TEST(CorruptionTest, TableFile) {
  dbi->TEST_CompactRange(1, NULL, NULL);

  Corrupt(kTableFile, 100, 1);
-  Check(99, 99);
+  Check(90, 99);
 }

 TEST(CorruptionTest, TableFileIndexData) {
@@ -299,7 +301,7 @@ TEST(CorruptionTest, CompactionInputError) {
  ASSERT_EQ(1, Property("leveldb.num-files-at-level" + NumberToString(last)));

  Corrupt(kTableFile, 100, 1);
-  Check(9, 9);
+  Check(5, 9);

  // Force compactions by writing lots of values
  Build(10000);
@@ -307,32 +309,23 @@ TEST(CorruptionTest, CompactionInputError) {
 }

 TEST(CorruptionTest, CompactionInputErrorParanoid) {
-  Options options;
-  options.paranoid_checks = true;
-  options.write_buffer_size = 1048576;
-  Reopen(&options);
+  options_.paranoid_checks = true;
+  options_.write_buffer_size = 512 << 10;
+  Reopen();
  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);

-  // Fill levels >= 1 so memtable compaction outputs to level 1
-  for (int level = 1; level < config::kNumLevels; level++) {
-    dbi->Put(WriteOptions(), "", "begin");
-    dbi->Put(WriteOptions(), "~", "end");
+  // Make multiple inputs so we need to compact.
+  for (int i = 0; i < 2; i++) {
+    Build(10);
    dbi->TEST_CompactMemTable();
+    Corrupt(kTableFile, 100, 1);
+    env_.SleepForMicroseconds(100000);
  }
+  dbi->CompactRange(NULL, NULL);

-  Build(10);
-  dbi->TEST_CompactMemTable();
-  ASSERT_EQ(1, Property("leveldb.num-files-at-level0"));
-
-  Corrupt(kTableFile, 100, 1);
-  Check(9, 9);
-
-  // Write must eventually fail because of corrupted table
-  Status s;
+  // Write must fail because of corrupted table
  std::string tmp1, tmp2;
-  for (int i = 0; i < 10000 && s.ok(); i++) {
-    s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2));
-  }
+  Status s = db_->Put(WriteOptions(), Key(5, &tmp1), Value(5, &tmp2));
  ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db";
 }

--- a/src/leveldb/db/db_impl.cc
+++ b/src/leveldb/db/db_impl.cc
@@ -113,14 +113,14 @@ Options SanitizeOptions(const std::string& dbname,
  return result;
 }

-DBImpl::DBImpl(const Options& options, const std::string& dbname)
-    : env_(options.env),
-      internal_comparator_(options.comparator),
-      internal_filter_policy_(options.filter_policy),
-      options_(SanitizeOptions(
-          dbname, &internal_comparator_, &internal_filter_policy_, options)),
-      owns_info_log_(options_.info_log != options.info_log),
-      owns_cache_(options_.block_cache != options.block_cache),
+DBImpl::DBImpl(const Options& raw_options, const std::string& dbname)
+    : env_(raw_options.env),
+      internal_comparator_(raw_options.comparator),
+      internal_filter_policy_(raw_options.filter_policy),
+      options_(SanitizeOptions(dbname, &internal_comparator_,
+                               &internal_filter_policy_, raw_options)),
+      owns_info_log_(options_.info_log != raw_options.info_log),
+      owns_cache_(options_.block_cache != raw_options.block_cache),
      dbname_(dbname),
      db_lock_(NULL),
      shutting_down_(NULL),
@@ -130,6 +130,7 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname)
      logfile_(NULL),
      logfile_number_(0),
      log_(NULL),
+      seed_(0),
      tmp_batch_(new WriteBatch),
      bg_compaction_scheduled_(false),
      manual_compaction_(NULL),
@@ -138,7 +139,7 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname)
  has_imm_.Release_Store(NULL);

  // Reserve ten files or so for other uses and give the rest to TableCache.
-  const int table_cache_size = options.max_open_files - kNumNonTableCacheFiles;
+  const int table_cache_size = options_.max_open_files - kNumNonTableCacheFiles;
  table_cache_ = new TableCache(dbname_, &options_, table_cache_size);

  versions_ = new VersionSet(dbname_, &options_, table_cache_,
@@ -1027,7 +1028,8 @@ static void CleanupIteratorState(void* arg1, void* arg2) {
 }  // namespace

 Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
-                                      SequenceNumber* latest_snapshot) {
+                                      SequenceNumber* latest_snapshot,
+                                      uint32_t* seed) {
  IterState* cleanup = new IterState;
  mutex_.Lock();
  *latest_snapshot = versions_->LastSequence();
@@ -1051,13 +1053,15 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
  cleanup->version = versions_->current();
  internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, NULL);

+  *seed = ++seed_;
  mutex_.Unlock();
  return internal_iter;
 }

 Iterator* DBImpl::TEST_NewInternalIterator() {
  SequenceNumber ignored;
-  return NewInternalIterator(ReadOptions(), &ignored);
+  uint32_t ignored_seed;
+  return NewInternalIterator(ReadOptions(), &ignored, &ignored_seed);
 }

 int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() {
@@ -1114,12 +1118,21 @@ Status DBImpl::Get(const ReadOptions& options,

 Iterator* DBImpl::NewIterator(const ReadOptions& options) {
  SequenceNumber latest_snapshot;
-  Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot);
+  uint32_t seed;
+  Iterator* iter = NewInternalIterator(options, &latest_snapshot, &seed);
  return NewDBIterator(
-      &dbname_, env_, user_comparator(), internal_iter,
+      this, user_comparator(), iter,
      (options.snapshot != NULL
       ? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
-       : latest_snapshot));
+       : latest_snapshot),
+      seed);
+}
+
+void DBImpl::RecordReadSample(Slice key) {
+  MutexLock l(&mutex_);
+  if (versions_->current()->RecordReadSample(key)) {
+    MaybeScheduleCompaction();
+  }
 }

 const Snapshot* DBImpl::GetSnapshot() {
--- a/src/leveldb/db/db_impl.h
+++ b/src/leveldb/db/db_impl.h
@@ -59,13 +59,19 @@ class DBImpl : public DB {
  // file at a level >= 1.
  int64_t TEST_MaxNextLevelOverlappingBytes();

+  // Record a sample of bytes read at the specified internal key.
+  // Samples are taken approximately once every config::kReadBytesPeriod
+  // bytes.
+  void RecordReadSample(Slice key);
+
 private:
  friend class DB;
  struct CompactionState;
  struct Writer;

  Iterator* NewInternalIterator(const ReadOptions&,
-                                SequenceNumber* latest_snapshot);
+                                SequenceNumber* latest_snapshot,
+                                uint32_t* seed);

  Status NewDB();

@@ -135,6 +141,7 @@ class DBImpl : public DB {
  WritableFile* logfile_;
  uint64_t logfile_number_;
  log::Writer* log_;
+  uint32_t seed_;                // For sampling.

  // Queue of writers.
  std::deque<Writer*> writers_;
--- a/src/leveldb/db/db_iter.cc
+++ b/src/leveldb/db/db_iter.cc
@@ -5,12 +5,14 @@
 #include "db/db_iter.h"

 #include "db/filename.h"
+#include "db/db_impl.h"
 #include "db/dbformat.h"
 #include "leveldb/env.h"
 #include "leveldb/iterator.h"
 #include "port/port.h"
 #include "util/logging.h"
 #include "util/mutexlock.h"
+#include "util/random.h"

 namespace leveldb {

@@ -46,15 +48,16 @@ class DBIter: public Iterator {
    kReverse
  };

-  DBIter(const std::string* dbname, Env* env,
-         const Comparator* cmp, Iterator* iter, SequenceNumber s)
-      : dbname_(dbname),
-        env_(env),
+  DBIter(DBImpl* db, const Comparator* cmp, Iterator* iter, SequenceNumber s,
+         uint32_t seed)
+      : db_(db),
        user_comparator_(cmp),
        iter_(iter),
        sequence_(s),
        direction_(kForward),
-        valid_(false) {
+        valid_(false),
+        rnd_(seed),
+        bytes_counter_(RandomPeriod()) {
  }
  virtual ~DBIter() {
    delete iter_;
@@ -100,8 +103,12 @@ class DBIter: public Iterator {
    }
  }

-  const std::string* const dbname_;
-  Env* const env_;
+  // Pick next gap with average value of config::kReadBytesPeriod.
+  ssize_t RandomPeriod() {
+    return rnd_.Uniform(2*config::kReadBytesPeriod);
+  }
+
+  DBImpl* db_;
  const Comparator* const user_comparator_;
  Iterator* const iter_;
  SequenceNumber const sequence_;
@@ -112,13 +119,23 @@ class DBIter: public Iterator {
  Direction direction_;
  bool valid_;

+  Random rnd_;
+  ssize_t bytes_counter_;
+
  // No copying allowed
  DBIter(const DBIter&);
  void operator=(const DBIter&);
 };

 inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
-  if (!ParseInternalKey(iter_->key(), ikey)) {
+  Slice k = iter_->key();
+  ssize_t n = k.size() + iter_->value().size();
+  bytes_counter_ -= n;
+  while (bytes_counter_ < 0) {
+    bytes_counter_ += RandomPeriod();
+    db_->RecordReadSample(k);
+  }
+  if (!ParseInternalKey(k, ikey)) {
    status_ = Status::Corruption("corrupted internal key in DBIter");
    return false;
  } else {
@@ -144,12 +161,13 @@ void DBIter::Next() {
      saved_key_.clear();
      return;
    }
+    // saved_key_ already contains the key to skip past.
+  } else {
+    // Store in saved_key_ the current key so we skip it below.
+    SaveKey(ExtractUserKey(iter_->key()), &saved_key_);
  }

-  // Temporarily use saved_key_ as storage for key to skip.
-  std::string* skip = &saved_key_;
-  SaveKey(ExtractUserKey(iter_->key()), skip);
-  FindNextUserEntry(true, skip);
+  FindNextUserEntry(true, &saved_key_);
 }

 void DBIter::FindNextUserEntry(bool skipping, std::string* skip) {
@@ -288,12 +306,12 @@ void DBIter::SeekToLast() {
 }  // anonymous namespace

 Iterator* NewDBIterator(
-    const std::string* dbname,
-    Env* env,
+    DBImpl* db,
    const Comparator* user_key_comparator,
    Iterator* internal_iter,
-    const SequenceNumber& sequence) {
-  return new DBIter(dbname, env, user_key_comparator, internal_iter, sequence);
+    SequenceNumber sequence,
+    uint32_t seed) {
+  return new DBIter(db, user_key_comparator, internal_iter, sequence, seed);
 }

 }  // namespace leveldb
--- a/src/leveldb/db/db_iter.h
+++ b/src/leveldb/db/db_iter.h
@@ -11,15 +11,17 @@

 namespace leveldb {

+class DBImpl;
+
 // Return a new iterator that converts internal keys (yielded by
 // "*internal_iter") that were live at the specified "sequence" number
 // into appropriate user keys.
 extern Iterator* NewDBIterator(
-    const std::string* dbname,
-    Env* env,
+    DBImpl* db,
    const Comparator* user_key_comparator,
    Iterator* internal_iter,
-    const SequenceNumber& sequence);
+    SequenceNumber sequence,
+    uint32_t seed);

 }  // namespace leveldb

--- a/src/leveldb/db/db_test.cc
+++ b/src/leveldb/db/db_test.cc
@@ -147,7 +147,7 @@ class SpecialEnv : public EnvWrapper {

    Status s = target()->NewWritableFile(f, r);
    if (s.ok()) {
-      if (strstr(f.c_str(), ".sst") != NULL) {
+      if (strstr(f.c_str(), ".ldb") != NULL) {
        *r = new SSTableFile(this, *r);
      } else if (strstr(f.c_str(), "MANIFEST") != NULL) {
        *r = new ManifestFile(this, *r);
@@ -484,6 +484,24 @@ class DBTest {
    }
    return false;
  }
+
+  // Returns number of files renamed.
+  int RenameLDBToSST() {
+    std::vector<std::string> filenames;
+    ASSERT_OK(env_->GetChildren(dbname_, &filenames));
+    uint64_t number;
+    FileType type;
+    int files_renamed = 0;
+    for (size_t i = 0; i < filenames.size(); i++) {
+      if (ParseFileName(filenames[i], &number, &type) && type == kTableFile) {
+        const std::string from = TableFileName(dbname_, number);
+        const std::string to = SSTTableFileName(dbname_, number);
+        ASSERT_OK(env_->RenameFile(from, to));
+        files_renamed++;
+      }
+    }
+    return files_renamed;
+  }
 };

 TEST(DBTest, Empty) {
@@ -1632,6 +1650,22 @@ TEST(DBTest, MissingSSTFile) {
      << s.ToString();
 }

+TEST(DBTest, StillReadSST) {
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_EQ("bar", Get("foo"));
+
+  // Dump the memtable to disk.
+  dbfull()->TEST_CompactMemTable();
+  ASSERT_EQ("bar", Get("foo"));
+  Close();
+  ASSERT_GT(RenameLDBToSST(), 0);
+  Options options = CurrentOptions();
+  options.paranoid_checks = true;
+  Status s = TryReopen(&options);
+  ASSERT_TRUE(s.ok());
+  ASSERT_EQ("bar", Get("foo"));
+}
+
 TEST(DBTest, FilesDeletedAfterCompaction) {
  ASSERT_OK(Put("foo", "v2"));
  Compact("a", "z");
--- a/src/leveldb/db/dbformat.h
+++ b/src/leveldb/db/dbformat.h
@@ -38,6 +38,9 @@ static const int kL0_StopWritesTrigger = 12;
 // space if the same key space is being repeatedly overwritten.
 static const int kMaxMemCompactLevel = 2;

+// Approximate gap in bytes between samples of data read during iteration.
+static const int kReadBytesPeriod = 1048576;
+
 }  // namespace config

 class InternalKey;
--- a/src/leveldb/db/filename.cc
+++ b/src/leveldb/db/filename.cc
@@ -30,6 +30,11 @@ std::string LogFileName(const std::string& name, uint64_t number) {
 }

 std::string TableFileName(const std::string& name, uint64_t number) {
+  assert(number > 0);
+  return MakeFileName(name, number, "ldb");
+}
+
+std::string SSTTableFileName(const std::string& name, uint64_t number) {
  assert(number > 0);
  return MakeFileName(name, number, "sst");
 }
@@ -71,7 +76,7 @@ std::string OldInfoLogFileName(const std::string& dbname) {
 //    dbname/LOG
 //    dbname/LOG.old
 //    dbname/MANIFEST-[0-9]+
-//    dbname/[0-9]+.(log|sst)
+//    dbname/[0-9]+.(log|sst|ldb)
 bool ParseFileName(const std::string& fname,
                   uint64_t* number,
                   FileType* type) {
@@ -106,7 +111,7 @@ bool ParseFileName(const std::string& fname,
    Slice suffix = rest;
    if (suffix == Slice(".log")) {
      *type = kLogFile;
-    } else if (suffix == Slice(".sst")) {
+    } else if (suffix == Slice(".sst") || suffix == Slice(".ldb")) {
      *type = kTableFile;
    } else if (suffix == Slice(".dbtmp")) {
      *type = kTempFile;
--- a/src/leveldb/db/filename.h
+++ b/src/leveldb/db/filename.h
@@ -37,6 +37,11 @@ extern std::string LogFileName(const std::string& dbname, uint64_t number);
 // "dbname".
 extern std::string TableFileName(const std::string& dbname, uint64_t number);

+// Return the legacy file name for an sstable with the specified number
+// in the db named by "dbname". The result will be prefixed with
+// "dbname".
+extern std::string SSTTableFileName(const std::string& dbname, uint64_t number);
+
 // Return the name of the descriptor file for the db named by
 // "dbname" and the specified incarnation number.  The result will be
 // prefixed with "dbname".
--- a/src/leveldb/db/filename_test.cc
+++ b/src/leveldb/db/filename_test.cc
@@ -27,6 +27,7 @@ TEST(FileNameTest, Parse) {
    { "100.log",            100,   kLogFile },
    { "0.log",              0,     kLogFile },
    { "0.sst",              0,     kTableFile },
+    { "0.ldb",              0,     kTableFile },
    { "CURRENT",            0,     kCurrentFile },
    { "LOCK",               0,     kDBLockFile },
    { "MANIFEST-2",         2,     kDescriptorFile },
--- a/src/leveldb/db/repair.cc
+++ b/src/leveldb/db/repair.cc
@@ -263,6 +263,12 @@ class Repairer {
    std::string fname = TableFileName(dbname_, t->meta.number);
    int counter = 0;
    Status status = env_->GetFileSize(fname, &t->meta.file_size);
+    if (!status.ok()) {
+      fname = SSTTableFileName(dbname_, t->meta.number);
+      Status s2 = env_->GetFileSize(fname, &t->meta.file_size);
+      if (s2.ok())
+        status = Status::OK();
+    }
    if (status.ok()) {
      Iterator* iter = table_cache_->NewIterator(
          ReadOptions(), t->meta.number, t->meta.file_size);
@@ -293,6 +299,8 @@ class Repairer {
      }
      delete iter;
    }
+    // If there was trouble opening an .sst file this will report that the .ldb
+    // file was not found, which is kind of lame but shouldn't happen often.
    Log(options_.info_log, "Table #%llu: %d entries %s",
        (unsigned long long) t->meta.number,
        counter,
--- a/src/leveldb/db/table_cache.cc
+++ b/src/leveldb/db/table_cache.cc
@@ -54,6 +54,12 @@ Status TableCache::FindTable(uint64_t file_number, uint64_t file_size,
    RandomAccessFile* file = NULL;
    Table* table = NULL;
    s = env_->NewRandomAccessFile(fname, &file);
+    if (!s.ok()) {
+      std::string old_fname = SSTTableFileName(dbname_, file_number);
+      if (env_->NewRandomAccessFile(old_fname, &file).ok()) {
+        s = Status::OK();
+      }
+    }
    if (s.ok()) {
      s = Table::Open(*options_, file, file_size, &table);
    }
--- a/src/leveldb/db/version_set.cc
+++ b/src/leveldb/db/version_set.cc
@@ -289,6 +289,51 @@ static bool NewestFirst(FileMetaData* a, FileMetaData* b) {
  return a->number > b->number;
 }

+void Version::ForEachOverlapping(Slice user_key, Slice internal_key,
+                                 void* arg,
+                                 bool (*func)(void*, int, FileMetaData*)) {
+  // TODO(sanjay): Change Version::Get() to use this function.
+  const Comparator* ucmp = vset_->icmp_.user_comparator();
+
+  // Search level-0 in order from newest to oldest.
+  std::vector<FileMetaData*> tmp;
+  tmp.reserve(files_[0].size());
+  for (uint32_t i = 0; i < files_[0].size(); i++) {
+    FileMetaData* f = files_[0][i];
+    if (ucmp->Compare(user_key, f->smallest.user_key()) >= 0 &&
+        ucmp->Compare(user_key, f->largest.user_key()) <= 0) {
+      tmp.push_back(f);
+    }
+  }
+  if (!tmp.empty()) {
+    std::sort(tmp.begin(), tmp.end(), NewestFirst);
+    for (uint32_t i = 0; i < tmp.size(); i++) {
+      if (!(*func)(arg, 0, tmp[i])) {
+        return;
+      }
+    }
+  }
+
+  // Search other levels.
+  for (int level = 1; level < config::kNumLevels; level++) {
+    size_t num_files = files_[level].size();
+    if (num_files == 0) continue;
+
+    // Binary search to find earliest index whose largest key >= internal_key.
+    uint32_t index = FindFile(vset_->icmp_, files_[level], internal_key);
+    if (index < num_files) {
+      FileMetaData* f = files_[level][index];
+      if (ucmp->Compare(user_key, f->smallest.user_key()) < 0) {
+        // All of "f" is past any data for user_key
+      } else {
+        if (!(*func)(arg, level, f)) {
+          return;
+        }
+      }
+    }
+  }
+}
+
 Status Version::Get(const ReadOptions& options,
                    const LookupKey& k,
                    std::string* value,
@@ -401,6 +446,44 @@ bool Version::UpdateStats(const GetStats& stats) {
  return false;
 }

+bool Version::RecordReadSample(Slice internal_key) {
+  ParsedInternalKey ikey;
+  if (!ParseInternalKey(internal_key, &ikey)) {
+    return false;
+  }
+
+  struct State {
+    GetStats stats;  // Holds first matching file
+    int matches;
+
+    static bool Match(void* arg, int level, FileMetaData* f) {
+      State* state = reinterpret_cast<State*>(arg);
+      state->matches++;
+      if (state->matches == 1) {
+        // Remember first match.
+        state->stats.seek_file = f;
+        state->stats.seek_file_level = level;
+      }
+      // We can stop iterating once we have a second match.
+      return state->matches < 2;
+    }
+  };
+
+  State state;
+  state.matches = 0;
+  ForEachOverlapping(ikey.user_key, internal_key, &state, &State::Match);
+
+  // Must have at least two matches since we want to merge across
+  // files. But what if we have a single file that contains many
+  // overwrites and deletions?  Should we have another mechanism for
+  // finding such files?
+  if (state.matches >= 2) {
+    // 1MB cost is about 1 seek (see comment in Builder::Apply).
+    return UpdateStats(state.stats);
+  }
+  return false;
+}
+
 void Version::Ref() {
  ++refs_;
 }
@@ -435,10 +518,13 @@ int Version::PickLevelForMemTableOutput(
      if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) {
        break;
      }
-      GetOverlappingInputs(level + 2, &start, &limit, &overlaps);
-      const int64_t sum = TotalFileSize(overlaps);
-      if (sum > kMaxGrandParentOverlapBytes) {
-        break;
+      if (level + 2 < config::kNumLevels) {
+        // Check that file does not overlap too many grandparent bytes.
+        GetOverlappingInputs(level + 2, &start, &limit, &overlaps);
+        const int64_t sum = TotalFileSize(overlaps);
+        if (sum > kMaxGrandParentOverlapBytes) {
+          break;
+        }
      }
      level++;
    }
@@ -452,6 +538,8 @@ void Version::GetOverlappingInputs(
    const InternalKey* begin,
    const InternalKey* end,
    std::vector<FileMetaData*>* inputs) {
+  assert(level >= 0);
+  assert(level < config::kNumLevels);
  inputs->clear();
  Slice user_begin, user_end;
  if (begin != NULL) {
--- a/src/leveldb/db/version_set.h
+++ b/src/leveldb/db/version_set.h
@@ -78,6 +78,12 @@ class Version {
  // REQUIRES: lock is held
  bool UpdateStats(const GetStats& stats);

+  // Record a sample of bytes read at the specified internal key.
+  // Samples are taken approximately once every config::kReadBytesPeriod
+  // bytes.  Returns true if a new compaction may need to be triggered.
+  // REQUIRES: lock is held
+  bool RecordReadSample(Slice key);
+
  // Reference count management (so Versions do not disappear out from
  // under live iterators)
  void Ref();
@@ -114,6 +120,15 @@ class Version {
  class LevelFileNumIterator;
  Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const;

+  // Call func(arg, level, f) for every file that overlaps user_key in
+  // order from newest to oldest.  If an invocation of func returns
+  // false, makes no more calls.
+  //
+  // REQUIRES: user portion of internal_key == user_key.
+  void ForEachOverlapping(Slice user_key, Slice internal_key,
+                          void* arg,
+                          bool (*func)(void*, int, FileMetaData*));
+
  VersionSet* vset_;            // VersionSet to which this Version belongs
  Version* next_;               // Next version in linked list
  Version* prev_;               // Previous version in linked list
--- a/src/leveldb/include/leveldb/db.h
+++ b/src/leveldb/include/leveldb/db.h
@@ -14,7 +14,7 @@ namespace leveldb {

 // Update Makefile if you change these
 static const int kMajorVersion = 1;
-static const int kMinorVersion = 12;
+static const int kMinorVersion = 14;

 struct Options;
 struct ReadOptions;
--- a/src/leveldb/util/arena.cc
+++ b/src/leveldb/util/arena.cc
@@ -40,7 +40,7 @@ char* Arena::AllocateFallback(size_t bytes) {
 }

 char* Arena::AllocateAligned(size_t bytes) {
-  const int align = sizeof(void*);    // We'll align to pointer size
+  const int align = (sizeof(void*) > 8) ? sizeof(void*) : 8;
  assert((align & (align-1)) == 0);   // Pointer size should be a power of 2
  size_t current_mod = reinterpret_cast<uintptr_t>(alloc_ptr_) & (align-1);
  size_t slop = (current_mod == 0 ? 0 : align - current_mod);
--- a/src/leveldb/util/env_posix.cc
+++ b/src/leveldb/util/env_posix.cc
@@ -320,8 +320,39 @@ class PosixMmapFile : public WritableFile {
    return Status::OK();
  }

-  virtual Status Sync() {
+  Status SyncDirIfManifest() {
+    const char* f = filename_.c_str();
+    const char* sep = strrchr(f, '/');
+    Slice basename;
+    std::string dir;
+    if (sep == NULL) {
+      dir = ".";
+      basename = f;
+    } else {
+      dir = std::string(f, sep - f);
+      basename = sep + 1;
+    }
    Status s;
+    if (basename.starts_with("MANIFEST")) {
+      int fd = open(dir.c_str(), O_RDONLY);
+      if (fd < 0) {
+        s = IOError(dir, errno);
+      } else {
+        if (fsync(fd) < 0) {
+          s = IOError(dir, errno);
+        }
+        close(fd);
+      }
+    }
+    return s;
+  }
+
+  virtual Status Sync() {
+    // Ensure new files referred to by the manifest are in the filesystem.
+    Status s = SyncDirIfManifest();
+    if (!s.ok()) {
+      return s;
+    }

    if (pending_sync_) {
      // Some unmapped data was not synced
--- a/src/leveldb/util/random.h
+++ b/src/leveldb/util/random.h
@@ -16,7 +16,12 @@ class Random {
 private:
  uint32_t seed_;
 public:
-  explicit Random(uint32_t s) : seed_(s & 0x7fffffffu) { }
+  explicit Random(uint32_t s) : seed_(s & 0x7fffffffu) {
+    // Avoid bad seeds.
+    if (seed_ == 0 || seed_ == 2147483647L) {
+      seed_ = 1;
+    }
+  }
  uint32_t Next() {
    static const uint32_t M = 2147483647L;   // 2^31-1
    static const uint64_t A = 16807;  // bits 14, 8, 7, 5, 2, 1, 0