Merge branch 'master' into columnfamilies

Conflicts: HISTORY.md db/db_impl.cc db/db_impl.h db/db_iter.cc db/db_test.cc db/dbformat.h db/memtable.cc db/memtable_list.cc db/memtable_list.h db/table_cache.cc db/table_cache.h db/version_edit.h db/version_set.cc db/version_set.h db/write_batch.cc db/write_batch_test.cc include/rocksdb/options.h util/options.cc
2025-12-06 17:27:55 +00:00 · 2014-02-06 15:42:16 -08:00
parent 0b4ccf765c 4564b2e8f9
commit 0143abdbb0
104 changed files with 6225 additions and 2358 deletions
--- a/table/block_based_table_builder.cc
+++ b/table/block_based_table_builder.cc
@@ -17,15 +17,17 @@
 #include "rocksdb/flush_block_policy.h"
 #include "rocksdb/cache.h"
 #include "rocksdb/comparator.h"
-#include "rocksdb/table.h"
+#include "table/table_builder.h"
 #include "rocksdb/env.h"
 #include "rocksdb/filter_policy.h"
 #include "rocksdb/options.h"
+#include "db/dbformat.h"
 #include "table/block_based_table_reader.h"
 #include "table/block.h"
 #include "table/block_builder.h"
 #include "table/filter_block.h"
 #include "table/format.h"
+#include "table/meta_blocks.h"
 #include "util/coding.h"
 #include "util/crc32c.h"
 #include "util/stop_watch.h"
@@ -34,51 +36,24 @@ namespace rocksdb {

 namespace {

-struct BytewiseLessThan {
-  bool operator()(const std::string& key1, const std::string& key2) const {
-    // smaller entries will be placed in front.
-    return comparator->Compare(key1, key2) <= 0;
-  }
-  const Comparator* comparator = BytewiseComparator();
-};
-
-// When writing to a block that requires entries to be sorted by
-// `BytewiseComparator`, we can buffer the content to `BytewiseSortedMap`
-// before writng to store.
-typedef std::map<std::string, std::string, BytewiseLessThan> BytewiseSortedMap;
-
-void AddProperties(BytewiseSortedMap& props, std::string name, uint64_t val) {
-  assert(props.find(name) == props.end());
-
-  std::string dst;
-  PutVarint64(&dst, val);
-
-  props.insert(
-      std::make_pair(name, dst)
-  );
-}
-
 static bool GoodCompressionRatio(size_t compressed_size, size_t raw_size) {
  // Check to see if compressed less than 12.5%
  return compressed_size < raw_size - (raw_size / 8u);
 }

-// Were we encounter any error occurs during user-defined statistics collection,
-// we'll write the warning message to info log.
-void LogPropertiesCollectionError(
-    Logger* info_log, const std::string& method, const std::string& name) {
-  assert(method == "Add" || method == "Finish");
-
-  std::string msg =
-    "[Warning] encountered error when calling TablePropertiesCollector::" +
-    method + "() with collector name: " + name;
-  Log(info_log, "%s", msg.c_str());
-}
-
 }  // anonymous namespace

+// kBlockBasedTableMagicNumber was picked by running
+//    echo http://code.google.com/p/leveldb/ | sha1sum
+// and taking the leading 64 bits.
+// Please note that kBlockBasedTableMagicNumber may also be accessed by
+// other .cc files, so it has to be explicitly declared with "extern".
+extern const uint64_t kBlockBasedTableMagicNumber
+    = 0xdb4775248b80fb57ull;
+
 struct BlockBasedTableBuilder::Rep {
  Options options;
+  const InternalKeyComparator& internal_comparator;
  WritableFile* file;
  uint64_t offset = 0;
  Status status;
@@ -98,31 +73,30 @@ struct BlockBasedTableBuilder::Rep {
  std::string compressed_output;
  std::unique_ptr<FlushBlockPolicy> flush_block_policy;

-  Rep(const Options& opt,
-      WritableFile* f,
-      FlushBlockPolicyFactory* flush_block_policy_factory,
+  Rep(const Options& opt, const InternalKeyComparator& icomparator,
+      WritableFile* f, FlushBlockPolicyFactory* flush_block_policy_factory,
      CompressionType compression_type)
      : options(opt),
+        internal_comparator(icomparator),
        file(f),
-        data_block(options),
+        data_block(options, &internal_comparator),
        // To avoid linear scan, we make the block_restart_interval to be `1`
        // in index block builder
-        index_block(1 /* block_restart_interval */, options.comparator),
+        index_block(1 /* block_restart_interval */, &internal_comparator),
        compression_type(compression_type),
-        filter_block(opt.filter_policy == nullptr ? nullptr
-                     : new FilterBlockBuilder(opt)),
+        filter_block(opt.filter_policy == nullptr
+                         ? nullptr
+                         : new FilterBlockBuilder(opt, &internal_comparator)),
        flush_block_policy(
-            flush_block_policy_factory->NewFlushBlockPolicy(data_block)) {
-  }
+            flush_block_policy_factory->NewFlushBlockPolicy(data_block)) {}
 };

 BlockBasedTableBuilder::BlockBasedTableBuilder(
-    const Options& options,
-    WritableFile* file,
-    FlushBlockPolicyFactory* flush_block_policy_factory,
+    const Options& options, const InternalKeyComparator& internal_comparator,
+    WritableFile* file, FlushBlockPolicyFactory* flush_block_policy_factory,
    CompressionType compression_type)
-    : rep_(new Rep(options,
-                   file, flush_block_policy_factory, compression_type)) {
+    : rep_(new Rep(options, internal_comparator, file,
+                   flush_block_policy_factory, compression_type)) {
  if (rep_->filter_block != nullptr) {
    rep_->filter_block->StartBlock(0);
  }
@@ -145,7 +119,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
  assert(!r->closed);
  if (!ok()) return;
  if (r->props.num_entries > 0) {
-    assert(r->options.comparator->Compare(key, Slice(r->last_key)) > 0);
+    assert(r->internal_comparator.Compare(key, Slice(r->last_key)) > 0);
  }

  auto should_flush = r->flush_block_policy->Update(key, value);
@@ -162,7 +136,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
    // entries in the first block and < all entries in subsequent
    // blocks.
    if (ok()) {
-      r->options.comparator->FindShortestSeparator(&r->last_key, key);
+      r->internal_comparator.FindShortestSeparator(&r->last_key, key);
      std::string handle_encoding;
      r->pending_handle.EncodeTo(&handle_encoding);
      r->index_block.Add(r->last_key, Slice(handle_encoding));
@@ -179,16 +153,12 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
  r->props.raw_key_size += key.size();
  r->props.raw_value_size += value.size();

-  for (auto collector : r->options.table_properties_collectors) {
-    Status s = collector->Add(key, value);
-    if (!s.ok()) {
-      LogPropertiesCollectionError(
-          r->options.info_log.get(),
-          "Add", /* method */
-          collector->Name()
-      );
-    }
-  }
+  NotifyCollectTableCollectorsOnAdd(
+      key,
+      value,
+      r->options.table_properties_collectors,
+      r->options.info_log.get()
+  );
 }

 void BlockBasedTableBuilder::Flush() {
@@ -370,7 +340,7 @@ Status BlockBasedTableBuilder::Finish() {
  // block, we will finish writing all index entries here and flush them
  // to storage after metaindex block is written.
  if (ok() && !empty_data_block) {
-    r->options.comparator->FindShortSuccessor(&r->last_key);
+    r->internal_comparator.FindShortSuccessor(&r->last_key);

    std::string handle_encoding;
    r->pending_handle.EncodeTo(&handle_encoding);
@@ -382,14 +352,7 @@ Status BlockBasedTableBuilder::Finish() {
  //    2. [meta block: properties]
  //    3. [metaindex block]
  if (ok()) {
-    // We use `BytewiseComparator` as the comparator for meta block.
-    BlockBuilder meta_index_block(
-        r->options.block_restart_interval,
-        BytewiseComparator()
-    );
-    // Key: meta block name
-    // Value: block handle to that meta block
-    BytewiseSortedMap meta_block_handles;
+    MetaIndexBuilder meta_index_builer;

    // Write filter block.
    if (r->filter_block != nullptr) {
@@ -397,104 +360,43 @@ Status BlockBasedTableBuilder::Finish() {
      // of filter data.
      std::string key = BlockBasedTable::kFilterBlockPrefix;
      key.append(r->options.filter_policy->Name());
-      std::string handle_encoding;
-      filter_block_handle.EncodeTo(&handle_encoding);
-      meta_block_handles.insert(
-          std::make_pair(key, handle_encoding)
-      );
+      meta_index_builer.Add(key, filter_block_handle);
    }

    // Write properties block.
    {
-      BlockBuilder properties_block(
-          r->options.block_restart_interval,
-          BytewiseComparator()
-      );
-
-      BytewiseSortedMap properties;
-
-      // Add basic properties
-      AddProperties(
-          properties,
-          BlockBasedTablePropertiesNames::kRawKeySize,
-          r->props.raw_key_size
-      );
-      AddProperties(
-          properties,
-          BlockBasedTablePropertiesNames::kRawValueSize,
-          r->props.raw_value_size
-      );
-      AddProperties(
-          properties,
-          BlockBasedTablePropertiesNames::kDataSize,
-          r->props.data_size
-      );
+      PropertyBlockBuilder property_block_builder;
+      std::vector<std::string> failed_user_prop_collectors;
+      r->props.filter_policy_name = r->options.filter_policy != nullptr ?
+          r->options.filter_policy->Name() : "";
      r->props.index_size =
        r->index_block.CurrentSizeEstimate() + kBlockTrailerSize;
-      AddProperties(
-          properties,
-          BlockBasedTablePropertiesNames::kIndexSize,
-          r->props.index_size
-      );
-      AddProperties(
-          properties,
-          BlockBasedTablePropertiesNames::kNumEntries,
-          r->props.num_entries
-      );
-      AddProperties(
-          properties,
-          BlockBasedTablePropertiesNames::kNumDataBlocks,
-          r->props.num_data_blocks);
-      if (r->filter_block != nullptr) {
-        properties.insert({
-              BlockBasedTablePropertiesNames::kFilterPolicy,
-              r->options.filter_policy->Name()
-        });
-      }
-      AddProperties(
-          properties,
-          BlockBasedTablePropertiesNames::kFilterSize,
-          r->props.filter_size
-      );

-      for (auto collector : r->options.table_properties_collectors) {
-        TableProperties::UserCollectedProperties user_collected_properties;
-        Status s =
-          collector->Finish(&user_collected_properties);
+      // Add basic properties
+      property_block_builder.AddTableProperty(r->props);

-        if (!s.ok()) {
-          LogPropertiesCollectionError(
-              r->options.info_log.get(),
-              "Finish", /* method */
-              collector->Name()
-          );
-        } else {
-          properties.insert(
-              user_collected_properties.begin(),
-              user_collected_properties.end()
-          );
-        }
-      }
-
-      for (const auto& stat : properties) {
-        properties_block.Add(stat.first, stat.second);
-      }
+      NotifyCollectTableCollectorsOnFinish(
+          r->options.table_properties_collectors,
+          r->options.info_log.get(),
+          &property_block_builder
+      );

      BlockHandle properties_block_handle;
-      WriteBlock(&properties_block, &properties_block_handle);
-
-      std::string handle_encoding;
-      properties_block_handle.EncodeTo(&handle_encoding);
-      meta_block_handles.insert(
-          { BlockBasedTable::kPropertiesBlock, handle_encoding }
+      WriteRawBlock(
+          property_block_builder.Finish(),
+          kNoCompression,
+          &properties_block_handle
      );
+
+      meta_index_builer.Add(kPropertiesBlock,
+                            properties_block_handle);
    }  // end of properties block writing

-    for (const auto& metablock : meta_block_handles) {
-      meta_index_block.Add(metablock.first, metablock.second);
-    }
-
-    WriteBlock(&meta_index_block, &metaindex_block_handle);
+    WriteRawBlock(
+        meta_index_builer.Finish(),
+        kNoCompression,
+        &metaindex_block_handle
+    );
  }  // meta blocks and metaindex block.

  // Write index block
@@ -504,7 +406,7 @@ Status BlockBasedTableBuilder::Finish() {

  // Write footer
  if (ok()) {
-    Footer footer;
+    Footer footer(kBlockBasedTableMagicNumber);
    footer.set_metaindex_handle(metaindex_block_handle);
    footer.set_index_handle(index_block_handle);
    std::string footer_encoding;
@@ -556,4 +458,7 @@ uint64_t BlockBasedTableBuilder::FileSize() const {
  return rep_->offset;
 }

+const std::string BlockBasedTable::kFilterBlockPrefix =
+    "filter.";
+
 }  // namespace rocksdb
--- a/table/block_based_table_builder.h
+++ b/table/block_based_table_builder.h
@@ -12,7 +12,7 @@
 #include "rocksdb/flush_block_policy.h"
 #include "rocksdb/options.h"
 #include "rocksdb/status.h"
-#include "rocksdb/table.h"
+#include "table/table_builder.h"

 namespace rocksdb {

@@ -20,13 +20,13 @@ class BlockBuilder;
 class BlockHandle;
 class WritableFile;

-
 class BlockBasedTableBuilder : public TableBuilder {
 public:
  // Create a builder that will store the contents of the table it is
  // building in *file.  Does not close the file.  It is up to the
  // caller to close the file after calling Finish().
  BlockBasedTableBuilder(const Options& options,
+                         const InternalKeyComparator& internal_comparator,
                         WritableFile* file,
                         FlushBlockPolicyFactory* flush_block_policy_factory,
                         CompressionType compression_type);
--- a/table/block_based_table_factory.cc
+++ b/table/block_based_table_factory.cc
@@ -18,17 +18,19 @@

 namespace rocksdb {

-Status BlockBasedTableFactory::GetTableReader(
+Status BlockBasedTableFactory::NewTableReader(
    const Options& options, const EnvOptions& soptions,
+    const InternalKeyComparator& internal_comparator,
    unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
    unique_ptr<TableReader>* table_reader) const {
  return BlockBasedTable::Open(options, soptions, table_options_,
-                               std::move(file), file_size, table_reader);
+                               internal_comparator, std::move(file), file_size,
+                               table_reader);
 }

-TableBuilder* BlockBasedTableFactory::GetTableBuilder(
-    const Options& options, WritableFile* file,
-    CompressionType compression_type) const {
+TableBuilder* BlockBasedTableFactory::NewTableBuilder(
+    const Options& options, const InternalKeyComparator& internal_comparator,
+    WritableFile* file, CompressionType compression_type) const {
  auto flush_block_policy_factory = 
    table_options_.flush_block_policy_factory.get();

@@ -45,11 +47,9 @@ TableBuilder* BlockBasedTableFactory::GetTableBuilder(
                                          options.block_size_deviation);
  }

-  auto table_builder =  new BlockBasedTableBuilder(
-      options,
-      file,
-      flush_block_policy_factory,
-      compression_type);
+  auto table_builder =
+      new BlockBasedTableBuilder(options, internal_comparator, file,
+                                 flush_block_policy_factory, compression_type);

  // Delete flush_block_policy_factory only when it's just created from the
  // options.
@@ -63,4 +63,9 @@ TableBuilder* BlockBasedTableFactory::GetTableBuilder(
  return table_builder;
 }

+TableFactory* NewBlockBasedTableFactory(
+    const BlockBasedTableOptions& table_options) {
+  return new BlockBasedTableFactory(table_options);
+}
+
 }  // namespace rocksdb
--- a/table/block_based_table_factory.h
+++ b/table/block_based_table_factory.h
@@ -14,7 +14,6 @@
 #include "rocksdb/flush_block_policy.h"
 #include "rocksdb/options.h"
 #include "rocksdb/table.h"
-#include "table/block_based_table_options.h"

 namespace rocksdb {

@@ -22,31 +21,26 @@ struct Options;
 struct EnvOptions;

 using std::unique_ptr;
-class Status;
-class RandomAccessFile;
-class WritableFile;
-class Table;
-class TableBuilder;
-class BlockBasedTable;
 class BlockBasedTableBuilder;

-class BlockBasedTableFactory: public TableFactory {
+class BlockBasedTableFactory : public TableFactory {
 public:
-  BlockBasedTableFactory() : BlockBasedTableFactory(BlockBasedTableOptions()) {}
-  explicit BlockBasedTableFactory(const BlockBasedTableOptions& table_options)
+  explicit BlockBasedTableFactory(
+      const BlockBasedTableOptions& table_options = BlockBasedTableOptions())
      : table_options_(table_options) {}

  ~BlockBasedTableFactory() {}

  const char* Name() const override { return "BlockBasedTable"; }

-  Status GetTableReader(const Options& options, const EnvOptions& soptions,
+  Status NewTableReader(const Options& options, const EnvOptions& soptions,
+                        const InternalKeyComparator& internal_comparator,
                        unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
                        unique_ptr<TableReader>* table_reader) const override;

-  TableBuilder* GetTableBuilder(const Options& options, WritableFile* file,
-                                CompressionType compression_type)
-      const override;
+  TableBuilder* NewTableBuilder(
+      const Options& options, const InternalKeyComparator& internal_comparator,
+      WritableFile* file, CompressionType compression_type) const override;

 private:
  BlockBasedTableOptions table_options_;
--- a/table/block_based_table_options.h
+++ b/table/block_based_table_options.h
@@ -1,31 +0,0 @@
-//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under the BSD-style license found in the
-//  LICENSE file in the root directory of this source tree. An additional grant
-//  of patent rights can be found in the PATENTS file in the same directory.
-
-#pragma once
-#include <memory>
-
-namespace rocksdb {
-
-class FlushBlockPolicyFactory;
-
-struct BlockBasedTableOptions {
-  // @flush_block_policy_factory creates the instances of flush block policy.
-  // which provides a configurable way to determine when to flush a block in
-  // the block based tables.  If not set, table builder will use the default
-  // block flush policy, which cut blocks by block size (please refer to
-  // `FlushBlockBySizePolicy`).
-  std::shared_ptr<FlushBlockPolicyFactory> flush_block_policy_factory;
-
-  // TODO(kailiu) Temporarily disable this feature by making the default value
-  // to be false. Also in master branch, this file is non-public so no user
-  // will be able to change the value of `cache_index_and_filter_blocks`.
-  //
-  // Indicating if we'd put index/filter blocks to the block cache.
-  // If not specified, each "table reader" object will pre-load index/filter
-  // block during table initialization.
-  bool cache_index_and_filter_blocks = false;
-};
-
-}  // namespace rocksdb
--- a/table/block_based_table_reader.cc
+++ b/table/block_based_table_reader.cc
@@ -21,15 +21,17 @@
 #include "table/block.h"
 #include "table/filter_block.h"
 #include "table/format.h"
+#include "table/meta_blocks.h"
 #include "table/two_level_iterator.h"

 #include "util/coding.h"
 #include "util/perf_context_imp.h"
 #include "util/stop_watch.h"
-#include "table/block_based_table_options.h"

 namespace rocksdb {

+extern const uint64_t kBlockBasedTableMagicNumber;  // matches definition
+
 // The longest the prefix of the cache key used to identify blocks can be.
 // We are using the fact that we know for Posix files the unique ID is three
 // varints.
@@ -37,12 +39,13 @@ const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length*3+1;
 using std::unique_ptr;

 struct BlockBasedTable::Rep {
-  Rep(const EnvOptions& storage_options) :
-    soptions(storage_options) {
-  }
+  Rep(const EnvOptions& storage_options,
+      const InternalKeyComparator& internal_comparator)
+      : soptions(storage_options), internal_comparator_(internal_comparator) {}

  Options options;
  const EnvOptions& soptions;
+  const InternalKeyComparator& internal_comparator_;
  Status status;
  unique_ptr<RandomAccessFile> file;
  char cache_key_prefix[kMaxCacheKeyPrefixSize];
@@ -223,34 +226,19 @@ Cache::Handle* GetFromBlockCache(

 Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions,
                             const BlockBasedTableOptions& table_options,
+                             const InternalKeyComparator& internal_comparator,
                             unique_ptr<RandomAccessFile>&& file,
                             uint64_t file_size,
                             unique_ptr<TableReader>* table_reader) {
  table_reader->reset();

-  if (file_size < Footer::kEncodedLength) {
-    return Status::InvalidArgument("file is too short to be an sstable");
-  }
-
-  char footer_space[Footer::kEncodedLength];
-  Slice footer_input;
-  Status s = file->Read(file_size - Footer::kEncodedLength,
-                        Footer::kEncodedLength, &footer_input, footer_space);
-  if (!s.ok()) return s;
-
-  // Check that we actually read the whole footer from the file. It may be
-  // that size isn't correct.
-  if (footer_input.size() != Footer::kEncodedLength) {
-    return Status::InvalidArgument("file is too short to be an sstable");
-  }
-
-  Footer footer;
-  s = footer.DecodeFrom(&footer_input);
+  Footer footer(kBlockBasedTableMagicNumber);
+  auto s = ReadFooterFromFile(file.get(), file_size, &footer);
  if (!s.ok()) return s;

  // We've successfully read the footer and the index block: we're
  // ready to serve requests.
-  Rep* rep = new BlockBasedTable::Rep(soptions);
+  Rep* rep = new BlockBasedTable::Rep(soptions, internal_comparator);
  rep->options = options;
  rep->file = std::move(file);
  rep->metaindex_handle = footer.metaindex_handle();
@@ -265,10 +253,11 @@ Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions,

  // Read the properties
  meta_iter->Seek(kPropertiesBlock);
-  if (meta_iter->Valid() && meta_iter->key() == Slice(kPropertiesBlock)) {
+  if (meta_iter->Valid() && meta_iter->key() == kPropertiesBlock) {
    s = meta_iter->status();
    if (s.ok()) {
-      s = ReadProperties(meta_iter->value(), rep, &rep->table_properties);
+      s = ReadProperties(meta_iter->value(), rep->file.get(), rep->options.env,
+                         rep->options.info_log.get(), &rep->table_properties);
    }

    if (!s.ok()) {
@@ -350,7 +339,7 @@ void BlockBasedTable::SetupForCompaction() {
  compaction_optimized_ = true;
 }

-TableProperties& BlockBasedTable::GetTableProperties() {
+const TableProperties& BlockBasedTable::GetTableProperties() {
  return rep_->table_properties;
 }

@@ -415,96 +404,6 @@ FilterBlockReader* BlockBasedTable::ReadFilter (
       rep->options, block.data, block.heap_allocated);
 }

-Status BlockBasedTable::ReadProperties(
-    const Slice& handle_value, Rep* rep, TableProperties* table_properties) {
-  assert(table_properties);
-
-  Slice v = handle_value;
-  BlockHandle handle;
-  if (!handle.DecodeFrom(&v).ok()) {
-    return Status::InvalidArgument("Failed to decode properties block handle");
-  }
-
-  BlockContents block_contents;
-  Status s = ReadBlockContents(
-      rep->file.get(),
-      ReadOptions(),
-      handle,
-      &block_contents,
-      rep->options.env,
-      false
-  );
-
-  if (!s.ok()) {
-    return s;
-  }
-
-  Block properties_block(block_contents);
-  std::unique_ptr<Iterator> iter(
-      properties_block.NewIterator(BytewiseComparator())
-  );
-
-  // All pre-defined properties of type uint64_t
-  std::unordered_map<std::string, uint64_t*> predefined_uint64_properties = {
-    { BlockBasedTablePropertiesNames::kDataSize,
-      &table_properties->data_size },
-    { BlockBasedTablePropertiesNames::kIndexSize,
-      &table_properties->index_size },
-    { BlockBasedTablePropertiesNames::kFilterSize,
-      &table_properties->filter_size },
-    { BlockBasedTablePropertiesNames::kRawKeySize,
-      &table_properties->raw_key_size },
-    { BlockBasedTablePropertiesNames::kRawValueSize,
-      &table_properties->raw_value_size },
-    { BlockBasedTablePropertiesNames::kNumDataBlocks,
-      &table_properties->num_data_blocks },
-    { BlockBasedTablePropertiesNames::kNumEntries,
-      &table_properties->num_entries },
-  };
-
-  std::string last_key;
-  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
-    s = iter->status();
-    if (!s.ok()) {
-      break;
-    }
-
-    auto key = iter->key().ToString();
-    // properties block is strictly sorted with no duplicate key.
-    assert(
-        last_key.empty() ||
-        BytewiseComparator()->Compare(key, last_key) > 0
-    );
-    last_key = key;
-
-    auto raw_val = iter->value();
-    auto pos = predefined_uint64_properties.find(key);
-
-    if (pos != predefined_uint64_properties.end()) {
-      // handle predefined rocksdb properties
-      uint64_t val;
-      if (!GetVarint64(&raw_val, &val)) {
-        // skip malformed value
-        auto error_msg =
-          "[Warning] detect malformed value in properties meta-block:"
-          "\tkey: " + key + "\tval: " + raw_val.ToString();
-        Log(rep->options.info_log, "%s", error_msg.c_str());
-        continue;
-      }
-      *(pos->second) = val;
-    } else if (key == BlockBasedTablePropertiesNames::kFilterPolicy) {
-      table_properties->filter_policy_name = raw_val.ToString();
-    } else {
-      // handle user-collected
-      table_properties->user_collected_properties.insert(
-          std::make_pair(key, raw_val.ToString())
-      );
-    }
-  }
-
-  return s;
-}
-
 Status BlockBasedTable::GetBlock(
    const BlockBasedTable* table,
    const BlockHandle& handle,
@@ -764,7 +663,7 @@ Iterator* BlockBasedTable::BlockReader(void* arg,

  Iterator* iter;
  if (block != nullptr) {
-    iter = block->NewIterator(table->rep_->options.comparator);
+    iter = block->NewIterator(&(table->rep_->internal_comparator_));
    if (cache_handle != nullptr) {
      iter->RegisterCleanup(&ReleaseBlock, block_cache, cache_handle);
    } else {
@@ -837,7 +736,7 @@ BlockBasedTable::GetFilter(bool no_io) const {
 // Get the iterator from the index block.
 Iterator* BlockBasedTable::IndexBlockReader(const ReadOptions& options) const {
  if (rep_->index_block) {
-    return rep_->index_block->NewIterator(rep_->options.comparator);
+    return rep_->index_block->NewIterator(&(rep_->internal_comparator_));
  }

  // get index block from cache
@@ -858,7 +757,7 @@ Iterator* BlockBasedTable::IndexBlockReader(const ReadOptions& options) const {

  Iterator* iter;
  if (entry.value != nullptr) {
-    iter = entry.value->NewIterator(rep_->options.comparator);
+    iter = entry.value->NewIterator(&(rep_->internal_comparator_));
    if (entry.cache_handle) {
      iter->RegisterCleanup(
          &ReleaseBlock, rep_->options.block_cache.get(), entry.cache_handle
@@ -872,9 +771,9 @@ Iterator* BlockBasedTable::IndexBlockReader(const ReadOptions& options) const {
  return iter;
 }

-Iterator* BlockBasedTable::BlockReader(void* arg,
-                                       const ReadOptions& options,
+Iterator* BlockBasedTable::BlockReader(void* arg, const ReadOptions& options,
                                       const EnvOptions& soptions,
+                                       const InternalKeyComparator& icomparator,
                                       const Slice& index_value,
                                       bool for_compaction) {
  return BlockReader(arg, options, index_value, nullptr, for_compaction);
@@ -965,20 +864,15 @@ Iterator* BlockBasedTable::NewIterator(const ReadOptions& options) {
    }
  }

-  return NewTwoLevelIterator(
-           IndexBlockReader(options),
-           &BlockBasedTable::BlockReader,
-           const_cast<BlockBasedTable*>(this),
-           options,
-           rep_->soptions
-         );
+  return NewTwoLevelIterator(IndexBlockReader(options),
+                             &BlockBasedTable::BlockReader,
+                             const_cast<BlockBasedTable*>(this), options,
+                             rep_->soptions, rep_->internal_comparator_);
 }

 Status BlockBasedTable::Get(
-    const ReadOptions& readOptions,
-    const Slice& key,
-    void* handle_context,
-    bool (*result_handler)(void* handle_context, const Slice& k,
+    const ReadOptions& readOptions, const Slice& key, void* handle_context,
+    bool (*result_handler)(void* handle_context, const ParsedInternalKey& k,
                           const Slice& v, bool didIO),
    void (*mark_key_may_exist_handler)(void* handle_context)) {
  Status s;
@@ -1016,8 +910,13 @@ Status BlockBasedTable::Get(

      // Call the *saver function on each entry/block until it returns false
      for (block_iter->Seek(key); block_iter->Valid(); block_iter->Next()) {
-        if (!(*result_handler)(handle_context, block_iter->key(),
-                               block_iter->value(), didIO)) {
+        ParsedInternalKey parsed_key;
+        if (!ParseInternalKey(block_iter->key(), &parsed_key)) {
+          s = Status::Corruption(Slice());
+        }
+
+        if (!(*result_handler)(handle_context, parsed_key, block_iter->value(),
+                               didIO)) {
          done = true;
          break;
        }
@@ -1034,7 +933,8 @@ Status BlockBasedTable::Get(
  return s;
 }

-bool SaveDidIO(void* arg, const Slice& key, const Slice& value, bool didIO) {
+bool SaveDidIO(void* arg, const ParsedInternalKey& key, const Slice& value,
+               bool didIO) {
  *reinterpret_cast<bool*>(arg) = didIO;
  return false;
 }
@@ -1075,25 +975,4 @@ uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key) {
  return result;
 }

-const std::string BlockBasedTable::kFilterBlockPrefix =
-    "filter.";
-const std::string BlockBasedTable::kPropertiesBlock =
-    "rocksdb.properties";
-const std::string BlockBasedTablePropertiesNames::kDataSize  =
-    "rocksdb.data.size";
-const std::string BlockBasedTablePropertiesNames::kIndexSize =
-    "rocksdb.index.size";
-const std::string BlockBasedTablePropertiesNames::kFilterSize =
-    "rocksdb.filter.size";
-const std::string BlockBasedTablePropertiesNames::kRawKeySize =
-    "rocksdb.raw.key.size";
-const std::string BlockBasedTablePropertiesNames::kRawValueSize =
-    "rocksdb.raw.value.size";
-const std::string BlockBasedTablePropertiesNames::kNumDataBlocks =
-    "rocksdb.num.data.blocks";
-const std::string BlockBasedTablePropertiesNames::kNumEntries =
-    "rocksdb.num.entries";
-const std::string BlockBasedTablePropertiesNames::kFilterPolicy =
-    "rocksdb.filter.policy";
-
 }  // namespace rocksdb
--- a/table/block_based_table_reader.h
+++ b/table/block_based_table_reader.h
@@ -14,8 +14,7 @@
 #include "rocksdb/env.h"
 #include "rocksdb/iterator.h"
 #include "rocksdb/statistics.h"
-#include "rocksdb/table_properties.h"
-#include "rocksdb/table.h"
+#include "table/table_reader.h"
 #include "util/coding.h"

 namespace rocksdb {
@@ -39,7 +38,6 @@ using std::unique_ptr;
 class BlockBasedTable : public TableReader {
 public:
  static const std::string kFilterBlockPrefix;
-  static const std::string kPropertiesBlock;

  // Attempt to open the table that is stored in bytes [0..file_size)
  // of "file", and read the metadata entries necessary to allow
@@ -53,6 +51,7 @@ class BlockBasedTable : public TableReader {
  // *file must remain live while this Table is in use.
  static Status Open(const Options& db_options, const EnvOptions& env_options,
                     const BlockBasedTableOptions& table_options,
+                     const InternalKeyComparator& internal_key_comparator,
                     unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
                     unique_ptr<TableReader>* table_reader);

@@ -63,14 +62,13 @@ class BlockBasedTable : public TableReader {
  // call one of the Seek methods on the iterator before using it).
  Iterator* NewIterator(const ReadOptions&) override;

-  Status Get(
-        const ReadOptions& readOptions,
-        const Slice& key,
-        void* handle_context,
-        bool (*result_handler)(void* handle_context, const Slice& k,
-                               const Slice& v, bool didIO),
-        void (*mark_key_may_exist_handler)(void* handle_context) = nullptr)
-    override;
+  Status Get(const ReadOptions& readOptions, const Slice& key,
+             void* handle_context,
+             bool (*result_handler)(void* handle_context,
+                                    const ParsedInternalKey& k, const Slice& v,
+                                    bool didIO),
+             void (*mark_key_may_exist_handler)(void* handle_context) =
+                 nullptr) override;

  // Given a key, return an approximate byte offset in the file where
  // the data for that key begins (or would begin if the key were
@@ -82,13 +80,13 @@ class BlockBasedTable : public TableReader {

  // Returns true if the block for the specified key is in cache.
  // REQUIRES: key is in this table.
-  bool TEST_KeyInCache(const ReadOptions& options, const Slice& key) override;
+  bool TEST_KeyInCache(const ReadOptions& options, const Slice& key);

  // Set up the table for Compaction. Might change some parameters with
  // posix_fadvise
  void SetupForCompaction() override;

-  TableProperties& GetTableProperties() override;
+  const TableProperties& GetTableProperties() override;

  ~BlockBasedTable();

@@ -101,8 +99,9 @@ class BlockBasedTable : public TableReader {
  bool compaction_optimized_;

  static Iterator* BlockReader(void*, const ReadOptions&,
-                               const EnvOptions& soptions, const Slice&,
-                               bool for_compaction);
+                               const EnvOptions& soptions,
+                               const InternalKeyComparator& icomparator,
+                               const Slice&, bool for_compaction);

  static Iterator* BlockReader(void*, const ReadOptions&, const Slice&,
                               bool* didIO, bool for_compaction = false);
@@ -142,7 +141,6 @@ class BlockBasedTable : public TableReader {

  void ReadMeta(const Footer& footer);
  void ReadFilter(const Slice& filter_handle_value);
-  static Status ReadProperties(const Slice& handle_value, Rep* rep);

  // Read the meta block from sst.
  static Status ReadMetaBlock(
@@ -156,10 +154,6 @@ class BlockBasedTable : public TableReader {
      Rep* rep,
      size_t* filter_size = nullptr);

-  // Read the table properties from properties block.
-  static Status ReadProperties(
-      const Slice& handle_value, Rep* rep, TableProperties* properties);
-
  static void SetupCacheKeyPrefix(Rep* rep);

  explicit BlockBasedTable(Rep* rep) :
@@ -181,15 +175,4 @@ class BlockBasedTable : public TableReader {
  void operator=(const TableReader&) = delete;
 };

-struct BlockBasedTablePropertiesNames {
-  static const std::string kDataSize;
-  static const std::string kIndexSize;
-  static const std::string kFilterSize;
-  static const std::string kRawKeySize;
-  static const std::string kRawValueSize;
-  static const std::string kNumDataBlocks;
-  static const std::string kNumEntries;
-  static const std::string kFilterPolicy;
-};
-
 }  // namespace rocksdb
--- a/table/block_builder.cc
+++ b/table/block_builder.cc
@@ -36,6 +36,7 @@
 #include <algorithm>
 #include <assert.h>
 #include "rocksdb/comparator.h"
+#include "db/dbformat.h"
 #include "util/coding.h"

 namespace rocksdb {
@@ -51,9 +52,8 @@ BlockBuilder::BlockBuilder(int block_restart_interval,
  restarts_.push_back(0);       // First restart point is at offset 0
 }

-BlockBuilder::BlockBuilder(const Options& options)
-    : BlockBuilder(options.block_restart_interval, options.comparator) {
-}
+BlockBuilder::BlockBuilder(const Options& options, const Comparator* comparator)
+    : BlockBuilder(options.block_restart_interval, comparator) {}

 void BlockBuilder::Reset() {
  buffer_.clear();
--- a/table/block_builder.h
+++ b/table/block_builder.h
@@ -21,7 +21,7 @@ class Comparator;
 class BlockBuilder {
 public:
  BlockBuilder(int block_builder, const Comparator* comparator);
-  explicit BlockBuilder(const Options& options);
+  explicit BlockBuilder(const Options& options, const Comparator* comparator);

  // Reset the contents as if the BlockBuilder was just constructed.
  void Reset();
--- a/table/block_test.cc
+++ b/table/block_test.cc
@@ -32,9 +32,12 @@ class BlockTest {};
 TEST(BlockTest, SimpleTest) {
  Random rnd(301);
  Options options = Options();
+  std::unique_ptr<InternalKeyComparator> ic;
+  ic.reset(new test::PlainInternalKeyComparator(options.comparator));
+
  std::vector<std::string> keys;
  std::vector<std::string> values;
-  BlockBuilder builder(options);
+  BlockBuilder builder(options, ic.get());
  int num_records = 100000;
  char buf[10];
  char* p = &buf[0];
--- a/table/filter_block.cc
+++ b/table/filter_block.cc
@@ -21,11 +21,12 @@ namespace rocksdb {
 static const size_t kFilterBaseLg = 11;
 static const size_t kFilterBase = 1 << kFilterBaseLg;

-FilterBlockBuilder::FilterBlockBuilder(const Options& opt)
-                 : policy_(opt.filter_policy),
-                   prefix_extractor_(opt.prefix_extractor),
-                   whole_key_filtering_(opt.whole_key_filtering),
-                   comparator_(opt.comparator){}
+FilterBlockBuilder::FilterBlockBuilder(const Options& opt,
+                                       const Comparator* internal_comparator)
+    : policy_(opt.filter_policy),
+      prefix_extractor_(opt.prefix_extractor),
+      whole_key_filtering_(opt.whole_key_filtering),
+      comparator_(internal_comparator) {}

 void FilterBlockBuilder::StartBlock(uint64_t block_offset) {
  uint64_t filter_index = (block_offset / kFilterBase);
--- a/table/filter_block.h
+++ b/table/filter_block.h
@@ -35,7 +35,8 @@ class FilterPolicy;
 //      (StartBlock AddKey*)* Finish
 class FilterBlockBuilder {
 public:
-  explicit FilterBlockBuilder(const Options& opt);
+  explicit FilterBlockBuilder(const Options& opt,
+                              const Comparator* internal_comparator);

  void StartBlock(uint64_t block_offset);
  void AddKey(const Slice& key);
--- a/table/filter_block_test.cc
+++ b/table/filter_block_test.cc
@@ -55,7 +55,7 @@ class FilterBlockTest {
 };

 TEST(FilterBlockTest, EmptyBuilder) {
-  FilterBlockBuilder builder(options_);
+  FilterBlockBuilder builder(options_, options_.comparator);
  Slice block = builder.Finish();
  ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block));
  FilterBlockReader reader(options_, block);
@@ -64,7 +64,7 @@ TEST(FilterBlockTest, EmptyBuilder) {
 }

 TEST(FilterBlockTest, SingleChunk) {
-  FilterBlockBuilder builder(options_);
+  FilterBlockBuilder builder(options_, options_.comparator);
  builder.StartBlock(100);
  builder.AddKey("foo");
  builder.AddKey("bar");
@@ -85,7 +85,7 @@ TEST(FilterBlockTest, SingleChunk) {
 }

 TEST(FilterBlockTest, MultiChunk) {
-  FilterBlockBuilder builder(options_);
+  FilterBlockBuilder builder(options_, options_.comparator);

  // First filter
  builder.StartBlock(0);
--- a/table/format.cc
+++ b/table/format.cc
@@ -34,6 +34,7 @@ Status BlockHandle::DecodeFrom(Slice* input) {
    return Status::Corruption("bad block handle");
  }
 }
+const BlockHandle BlockHandle::kNullBlockHandle(0, 0);

 void Footer::EncodeTo(std::string* dst) const {
 #ifndef NDEBUG
@@ -72,6 +73,30 @@ Status Footer::DecodeFrom(Slice* input) {
  return result;
 }

+Status ReadFooterFromFile(RandomAccessFile* file,
+                          uint64_t file_size,
+                          Footer* footer) {
+  if (file_size < Footer::kEncodedLength) {
+    return Status::InvalidArgument("file is too short to be an sstable");
+  }
+
+  char footer_space[Footer::kEncodedLength];
+  Slice footer_input;
+  Status s = file->Read(file_size - Footer::kEncodedLength,
+                        Footer::kEncodedLength,
+                        &footer_input,
+                        footer_space);
+  if (!s.ok()) return s;
+
+  // Check that we actually read the whole footer from the file. It may be
+  // that size isn't correct.
+  if (footer_input.size() != Footer::kEncodedLength) {
+    return Status::InvalidArgument("file is too short to be an sstable");
+  }
+
+  return footer->DecodeFrom(&footer_input);
+}
+
 Status ReadBlockContents(RandomAccessFile* file,
                         const ReadOptions& options,
                         const BlockHandle& handle,
--- a/table/format.h
+++ b/table/format.h
@@ -26,6 +26,7 @@ struct ReadOptions;
 class BlockHandle {
 public:
  BlockHandle();
+  BlockHandle(uint64_t offset, uint64_t size);

  // The offset of the block in the file.
  uint64_t offset() const { return offset_; }
@@ -38,19 +39,36 @@ class BlockHandle {
  void EncodeTo(std::string* dst) const;
  Status DecodeFrom(Slice* input);

+  // if the block handle's offset and size are both "0", we will view it
+  // as a null block handle that points to nowhere.
+  bool IsNull() const {
+    return offset_ == 0 && size_ == 0;
+  }
+
+  static const BlockHandle& NullBlockHandle() {
+    return kNullBlockHandle;
+  }
+
  // Maximum encoding length of a BlockHandle
  enum { kMaxEncodedLength = 10 + 10 };

 private:
-  uint64_t offset_;
-  uint64_t size_;
+  uint64_t offset_ = 0;
+  uint64_t size_ = 0;
+
+  static const BlockHandle kNullBlockHandle;
 };

 // Footer encapsulates the fixed information stored at the tail
 // end of every table file.
 class Footer {
 public:
-  Footer() { }
+  // @table_magic_number serves two purposes:
+  //  1. Identify different types of the tables.
+  //  2. Help us to identify if a given file is a valid sst.
+  Footer(uint64_t table_magic_number) :
+      kTableMagicNumber(table_magic_number) {
+  }

  // The block handle for the metaindex block of the table
  const BlockHandle& metaindex_handle() const { return metaindex_handle_; }
@@ -77,12 +95,13 @@ class Footer {
 private:
  BlockHandle metaindex_handle_;
  BlockHandle index_handle_;
+  const uint64_t kTableMagicNumber;
 };

-// kTableMagicNumber was picked by running
-//    echo http://code.google.com/p/leveldb/ | sha1sum
-// and taking the leading 64 bits.
-static const uint64_t kTableMagicNumber = 0xdb4775248b80fb57ull;
+// Read the footer from file
+Status ReadFooterFromFile(RandomAccessFile* file,
+                          uint64_t file_size,
+                          Footer* footer);

 // 1-byte type + 32-bit crc
 static const size_t kBlockTrailerSize = 5;
@@ -115,8 +134,13 @@ extern Status UncompressBlockContents(const char* data,
 // Implementation details follow.  Clients should ignore,

 inline BlockHandle::BlockHandle()
-    : offset_(~static_cast<uint64_t>(0)),
-      size_(~static_cast<uint64_t>(0)) {
+    : BlockHandle(~static_cast<uint64_t>(0),
+                  ~static_cast<uint64_t>(0)) {
+}
+
+inline BlockHandle::BlockHandle(uint64_t offset, uint64_t size)
+    : offset_(offset),
+      size_(size) {
 }

 }  // namespace rocksdb
--- a/table/merger.cc
+++ b/table/merger.cc
@@ -11,8 +11,11 @@

 #include "rocksdb/comparator.h"
 #include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
 #include "table/iter_heap.h"
 #include "table/iterator_wrapper.h"
+#include "util/stop_watch.h"
+#include "util/perf_context_imp.h"

 #include <vector>

@@ -22,10 +25,13 @@ namespace {

 class MergingIterator : public Iterator {
 public:
-  MergingIterator(const Comparator* comparator, Iterator** children, int n)
+  MergingIterator(Env* const env, const Comparator* comparator,
+                  Iterator** children, int n)
      : comparator_(comparator),
        children_(n),
        current_(nullptr),
+        use_heap_(true),
+        env_(env),
        direction_(kForward),
        maxHeap_(NewMaxIterHeap(comparator_)),
        minHeap_ (NewMinIterHeap(comparator_)) {
@@ -70,15 +76,52 @@ class MergingIterator : public Iterator {
  }

  virtual void Seek(const Slice& target) {
-    ClearHeaps();
+    // Invalidate the heap.
+    use_heap_ = false;
+    IteratorWrapper* first_child = nullptr;
+    StopWatchNano child_seek_timer(env_, false);
+    StopWatchNano min_heap_timer(env_, false);
    for (auto& child : children_) {
+      StartPerfTimer(&child_seek_timer);
      child.Seek(target);
+      BumpPerfTime(&perf_context.seek_child_seek_time, &child_seek_timer);
+      BumpPerfCount(&perf_context.seek_child_seek_count);
+
      if (child.Valid()) {
-        minHeap_.push(&child);
+        // This child has valid key
+        if (!use_heap_) {
+          if (first_child == nullptr) {
+            // It's the first child has valid key. Only put it int
+            // current_. Now the values in the heap should be invalid.
+            first_child = &child;
+          } else {
+            // We have more than one children with valid keys. Initialize
+            // the heap and put the first child into the heap.
+            StartPerfTimer(&min_heap_timer);
+            ClearHeaps();
+            BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer);
+            StartPerfTimer(&min_heap_timer);
+            minHeap_.push(first_child);
+            BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer);
+          }
+        }
+        if (use_heap_) {
+          StartPerfTimer(&min_heap_timer);
+          minHeap_.push(&child);
+          BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer);
+        }
      }
    }
-    FindSmallest();
-    direction_ = kForward;
+    if (use_heap_) {
+      // If heap is valid, need to put the smallest key to curent_.
+      StartPerfTimer(&min_heap_timer);
+      FindSmallest();
+      BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer);
+    } else {
+      // The heap is not valid, then the current_ iterator is the first
+      // one, or null if there is no first child.
+      current_ = first_child;
+    }
  }

  virtual void Next() {
@@ -109,10 +152,14 @@ class MergingIterator : public Iterator {
    // as the current points to the current record. move the iterator forward.
    // and if it is valid add it to the heap.
    current_->Next();
-    if (current_->Valid()){
-      minHeap_.push(current_);
+    if (use_heap_) {
+      if (current_->Valid()) {
+        minHeap_.push(current_);
+      }
+      FindSmallest();
+    } else if (!current_->Valid()) {
+      current_ = nullptr;
    }
-    FindSmallest();
  }

  virtual void Prev() {
@@ -178,6 +225,11 @@ class MergingIterator : public Iterator {
  const Comparator* comparator_;
  std::vector<IteratorWrapper> children_;
  IteratorWrapper* current_;
+  // If the value is true, both the iterators in the heap and current_ may
+  // contain valid rows. If it is false, only current_ can possibly contain
+  // valid rows.
+  bool use_heap_;
+  Env* const env_;
  // Which direction is the iterator moving?
  enum Direction {
    kForward,
@@ -189,6 +241,7 @@ class MergingIterator : public Iterator {
 };

 void MergingIterator::FindSmallest() {
+  assert(use_heap_);
  if (minHeap_.empty()) {
    current_ = nullptr;
  } else {
@@ -199,6 +252,7 @@ void MergingIterator::FindSmallest() {
 }

 void MergingIterator::FindLargest() {
+  assert(use_heap_);
  if (maxHeap_.empty()) {
    current_ = nullptr;
  } else {
@@ -209,19 +263,21 @@ void MergingIterator::FindLargest() {
 }

 void MergingIterator::ClearHeaps() {
+  use_heap_ = true;
  maxHeap_ = NewMaxIterHeap(comparator_);
  minHeap_ = NewMinIterHeap(comparator_);
 }
 }  // namespace

-Iterator* NewMergingIterator(const Comparator* cmp, Iterator** list, int n) {
+Iterator* NewMergingIterator(Env* const env, const Comparator* cmp,
+                             Iterator** list, int n) {
  assert(n >= 0);
  if (n == 0) {
    return NewEmptyIterator();
  } else if (n == 1) {
    return list[0];
  } else {
-    return new MergingIterator(cmp, list, n);
+    return new MergingIterator(env, cmp, list, n);
  }
 }

--- a/table/merger.h
+++ b/table/merger.h
@@ -13,6 +13,7 @@ namespace rocksdb {

 class Comparator;
 class Iterator;
+class Env;

 // Return an iterator that provided the union of the data in
 // children[0,n-1].  Takes ownership of the child iterators and
@@ -22,7 +23,8 @@ class Iterator;
 // key is present in K child iterators, it will be yielded K times.
 //
 // REQUIRES: n >= 0
-extern Iterator* NewMergingIterator(
-    const Comparator* comparator, Iterator** children, int n);
+extern Iterator* NewMergingIterator(Env* const env,
+                                    const Comparator* comparator,
+                                    Iterator** children, int n);

 }  // namespace rocksdb
--- a/table/meta_blocks.cc
+++ b/table/meta_blocks.cc
@@ -0,0 +1,286 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "table/meta_blocks.h"
+
+#include <map>
+
+#include "rocksdb/table.h"
+#include "table/block.h"
+#include "table/format.h"
+#include "util/coding.h"
+
+namespace rocksdb {
+
+MetaIndexBuilder::MetaIndexBuilder()
+    : meta_index_block_(
+        new BlockBuilder(1 /* restart interval */, BytewiseComparator())) {
+}
+
+void MetaIndexBuilder::Add(const std::string& key,
+                           const BlockHandle& handle) {
+  std::string handle_encoding;
+  handle.EncodeTo(&handle_encoding);
+  meta_block_handles_.insert({key, handle_encoding});
+}
+
+Slice MetaIndexBuilder::Finish() {
+  for (const auto& metablock : meta_block_handles_) {
+    meta_index_block_->Add(metablock.first, metablock.second);
+  }
+  return meta_index_block_->Finish();
+}
+
+PropertyBlockBuilder::PropertyBlockBuilder()
+  : properties_block_(
+      new BlockBuilder(1 /* restart interval */, BytewiseComparator())) {
+}
+
+void PropertyBlockBuilder::Add(const std::string& name,
+                               const std::string& val) {
+  props_.insert({name, val});
+}
+
+void PropertyBlockBuilder::Add(const std::string& name, uint64_t val) {
+  assert(props_.find(name) == props_.end());
+
+  std::string dst;
+  PutVarint64(&dst, val);
+
+  Add(name, dst);
+}
+
+void PropertyBlockBuilder::Add(
+    const UserCollectedProperties& user_collected_properties) {
+  for (const auto& prop : user_collected_properties) {
+    Add(prop.first, prop.second);
+  }
+}
+
+void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) {
+  Add(TablePropertiesNames::kRawKeySize, props.raw_key_size);
+  Add(TablePropertiesNames::kRawValueSize, props.raw_value_size);
+  Add(TablePropertiesNames::kDataSize, props.data_size);
+  Add(TablePropertiesNames::kIndexSize, props.index_size);
+  Add(TablePropertiesNames::kNumEntries, props.num_entries);
+  Add(TablePropertiesNames::kNumDataBlocks, props.num_data_blocks);
+  Add(TablePropertiesNames::kFilterSize, props.filter_size);
+  Add(TablePropertiesNames::kFormatVersion, props.format_version);
+  Add(TablePropertiesNames::kFixedKeyLen, props.fixed_key_len);
+
+  if (!props.filter_policy_name.empty()) {
+    Add(TablePropertiesNames::kFilterPolicy,
+        props.filter_policy_name);
+  }
+}
+
+Slice PropertyBlockBuilder::Finish() {
+  for (const auto& prop : props_) {
+    properties_block_->Add(prop.first, prop.second);
+  }
+
+  return properties_block_->Finish();
+}
+
+void LogPropertiesCollectionError(
+    Logger* info_log, const std::string& method, const std::string& name) {
+  assert(method == "Add" || method == "Finish");
+
+  std::string msg =
+    "[Warning] encountered error when calling TablePropertiesCollector::" +
+    method + "() with collector name: " + name;
+  Log(info_log, "%s", msg.c_str());
+}
+
+bool NotifyCollectTableCollectorsOnAdd(
+    const Slice& key,
+    const Slice& value,
+    const Options::TablePropertiesCollectors& collectors,
+    Logger* info_log) {
+  bool all_succeeded = true;
+  for (auto collector : collectors) {
+    Status s = collector->Add(key, value);
+    all_succeeded = all_succeeded && s.ok();
+    if (!s.ok()) {
+      LogPropertiesCollectionError(
+          info_log, "Add", /* method */ collector->Name()
+      );
+    }
+  }
+  return all_succeeded;
+}
+
+bool NotifyCollectTableCollectorsOnFinish(
+    const Options::TablePropertiesCollectors& collectors,
+    Logger* info_log,
+    PropertyBlockBuilder* builder) {
+  bool all_succeeded = true;
+  for (auto collector : collectors) {
+    UserCollectedProperties user_collected_properties;
+    Status s = collector->Finish(&user_collected_properties);
+
+    all_succeeded = all_succeeded && s.ok();
+    if (!s.ok()) {
+      LogPropertiesCollectionError(
+          info_log, "Finish", /* method */ collector->Name()
+      );
+    } else {
+      builder->Add(user_collected_properties);
+    }
+  }
+
+  return all_succeeded;
+}
+
+// Decodes the block handle in `handle_value`, reads that block from `file`
+// and fills `*table_properties` from its entries: predefined uint64
+// properties, the filter policy name, and (everything else) user-collected
+// properties. Malformed predefined values are logged to `logger` and skipped.
+// Returns a non-OK status if the handle, the block read or iteration fails.
+Status ReadProperties(
+    const Slice& handle_value,
+    RandomAccessFile* file,
+    Env* env,
+    Logger* logger,
+    TableProperties* table_properties) {
+  assert(table_properties);
+
+  Slice v = handle_value;
+  BlockHandle handle;
+  if (!handle.DecodeFrom(&v).ok()) {
+    return Status::InvalidArgument("Failed to decode properties block handle");
+  }
+
+  BlockContents block_contents;
+  ReadOptions read_options;
+  read_options.verify_checksums = false;
+  Status s = ReadBlockContents(file, read_options, handle, &block_contents,
+                               env, false);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  Block properties_block(block_contents);
+  std::unique_ptr<Iterator> iter(
+      properties_block.NewIterator(BytewiseComparator())
+  );
+
+  // All pre-defined properties of type uint64_t
+  std::unordered_map<std::string, uint64_t*> predefined_uint64_properties = {
+    { TablePropertiesNames::kDataSize, &table_properties->data_size },
+    { TablePropertiesNames::kIndexSize, &table_properties->index_size },
+    { TablePropertiesNames::kFilterSize, &table_properties->filter_size },
+    { TablePropertiesNames::kRawKeySize, &table_properties->raw_key_size },
+    { TablePropertiesNames::kRawValueSize, &table_properties->raw_value_size },
+    { TablePropertiesNames::kNumDataBlocks,
+      &table_properties->num_data_blocks },
+    { TablePropertiesNames::kNumEntries, &table_properties->num_entries },
+    { TablePropertiesNames::kFormatVersion, &table_properties->format_version },
+    { TablePropertiesNames::kFixedKeyLen, &table_properties->fixed_key_len },
+  };
+
+  std::string last_key;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    s = iter->status();
+    if (!s.ok()) {
+      break;
+    }
+
+    auto key = iter->key().ToString();
+    // properties block is strictly sorted with no duplicate key.
+    assert(
+        last_key.empty() ||
+        BytewiseComparator()->Compare(key, last_key) > 0
+    );
+    last_key = key;
+
+    auto raw_val = iter->value();
+    auto pos = predefined_uint64_properties.find(key);
+
+    if (pos != predefined_uint64_properties.end()) {
+      // handle predefined rocksdb properties
+      uint64_t val;
+      if (!GetVarint64(&raw_val, &val)) {
+        // skip malformed value
+        auto error_msg =
+          "[Warning] detect malformed value in properties meta-block:"
+          "\tkey: " + key + "\tval: " + raw_val.ToString();
+        Log(logger, "%s", error_msg.c_str());
+        continue;
+      }
+      *(pos->second) = val;
+    } else if (key == TablePropertiesNames::kFilterPolicy) {
+      table_properties->filter_policy_name = raw_val.ToString();
+    } else {
+      // handle user-collected properties
+      table_properties->user_collected_properties.insert(
+          std::make_pair(key, raw_val.ToString())
+      );
+    }
+  }
+
+  // An iterator that stopped on an error is no longer Valid(); report it too.
+  return s.ok() ? iter->status() : s;
+}
+
+// Reads the table properties straight from `file`: decodes the footer
+// (built with `table_magic_number`), loads the metaindex block it points to,
+// and parses the block registered under kPropertiesBlock into `*properties`.
+// Returns Corruption when the metaindex has no usable properties entry, or
+// the first error reported while reading the footer or the blocks.
+Status ReadTableProperties(
+    RandomAccessFile* file,
+    uint64_t file_size,
+    uint64_t table_magic_number,
+    Env* env,
+    Logger* info_log,
+    TableProperties* properties) {
+  // -- Read metaindex block
+  Footer footer(table_magic_number);
+  auto s = ReadFooterFromFile(file, file_size, &footer);
+  if (!s.ok()) {
+    return s;
+  }
+
+  auto metaindex_handle = footer.metaindex_handle();
+  BlockContents metaindex_contents;
+  ReadOptions read_options;
+  read_options.verify_checksums = false;
+  s = ReadBlockContents(file, read_options, metaindex_handle,
+                        &metaindex_contents, env, false);
+  if (!s.ok()) {
+    return s;
+  }
+  Block metaindex_block(metaindex_contents);
+  std::unique_ptr<Iterator> meta_iter(
+      metaindex_block.NewIterator(BytewiseComparator())
+  );
+
+  // -- Read property block
+  // Seek() presumably stops at the first key >= target, hence the explicit
+  // key check; the block is parsed directly into the caller's `properties`.
+  meta_iter->Seek(kPropertiesBlock);
+  if (meta_iter->Valid() &&
+      meta_iter->key() == kPropertiesBlock &&
+      meta_iter->status().ok()) {
+    s = ReadProperties(
+        meta_iter->value(),
+        file,
+        env,
+        info_log,
+        properties
+    );
+  } else {
+    s = Status::Corruption(
+        "Unable to read the property block from the plain table"
+    );
+  }
+
+  return s;
+}
+
+
+}  // namespace rocksdb
--- a/table/meta_blocks.h
+++ b/table/meta_blocks.h
@@ -0,0 +1,121 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/table_properties.h"
+#include "table/block_builder.h"
+
+namespace rocksdb {
+
+class BlockBuilder;
+class BlockHandle;
+class Env;
+class Logger;
+class RandomAccessFile;
+struct TableProperties;
+
+// An STL-style "less than" functor that orders strings with the bytewise
+// comparator; usable as the ordering of std::map / std::set.
+struct BytewiseLessThan {
+  bool operator()(const std::string& key1, const std::string& key2) const {
+    // Must be strictly "less": ordered containers need comp(a, a) == false.
+    return comparator->Compare(key1, key2) < 0;
+  }
+
+  const Comparator* comparator = BytewiseComparator();
+};
+
+// When writing to a block that requires entries to be sorted by
+// `BytewiseComparator`, we can buffer the content to `BytewiseSortedMap`
+// before writing to store.
+typedef std::map<std::string, std::string, BytewiseLessThan> BytewiseSortedMap;
+
+class MetaIndexBuilder {
+ public:
+  MetaIndexBuilder(const MetaIndexBuilder&) = delete;
+  MetaIndexBuilder& operator=(const MetaIndexBuilder&) = delete;
+
+  MetaIndexBuilder();
+  void Add(const std::string& key, const BlockHandle& handle);
+
+  // Write all the added key/value pairs to the block and return the contents
+  // of the block.
+  Slice Finish();
+
+ private:
+  // store the sorted key/handle of the metablocks.
+  BytewiseSortedMap meta_block_handles_;
+  std::unique_ptr<BlockBuilder> meta_index_block_;
+};
+
+class PropertyBlockBuilder {
+ public:
+  PropertyBlockBuilder(const PropertyBlockBuilder&) = delete;
+  PropertyBlockBuilder& operator=(const PropertyBlockBuilder&) = delete;
+
+  PropertyBlockBuilder();
+
+  void AddTableProperty(const TableProperties& props);
+  void Add(const std::string& key, uint64_t value);
+  void Add(const std::string& key, const std::string& value);
+  void Add(const UserCollectedProperties& user_collected_properties);
+
+  // Write all the added entries to the block and return the block contents
+  Slice Finish();
+
+ private:
+  std::unique_ptr<BlockBuilder> properties_block_;
+  BytewiseSortedMap props_;
+};
+
+// When any error occurs during user-defined property collection, we'll write
+// the warning message to the info log.
+void LogPropertiesCollectionError(
+    Logger* info_log, const std::string& method, const std::string& name);
+
+// Utility functions help table builder to trigger batch events for user
+// defined property collectors.
+// Return value indicates if there is any error occurred; if error occurred,
+// the warning message will be logged.
+// NotifyCollectTableCollectorsOnAdd() triggers the `Add` event for all
+// property collectors.
+bool NotifyCollectTableCollectorsOnAdd(
+    const Slice& key,
+    const Slice& value,
+    const Options::TablePropertiesCollectors& collectors,
+    Logger* info_log);
+
+// NotifyCollectTableCollectorsOnFinish() triggers the `Finish` event for all
+// property collectors. The collected properties will be added to `builder`.
+bool NotifyCollectTableCollectorsOnFinish(
+    const Options::TablePropertiesCollectors& collectors,
+    Logger* info_log,
+    PropertyBlockBuilder* builder);
+
+// Read the properties from the table.
+Status ReadProperties(
+    const Slice& handle_value,
+    RandomAccessFile* file,
+    Env* env,
+    Logger* logger,
+    TableProperties* table_properties);
+
+// Directly read the properties from the properties block of a plain table.
+Status ReadTableProperties(
+    RandomAccessFile* file,
+    uint64_t file_size,
+    uint64_t table_magic_number,
+    Env* env,
+    Logger* info_log,
+    TableProperties* properties);
+
+}  // namespace rocksdb
--- a/table/plain_table_builder.cc
+++ b/table/plain_table_builder.cc
@@ -0,0 +1,198 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/plain_table_builder.h"
+
+#include <assert.h>
+#include <map>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "table/plain_table_factory.h"
+#include "db/dbformat.h"
+#include "table/block_builder.h"
+#include "table/filter_block.h"
+#include "table/format.h"
+#include "table/meta_blocks.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/stop_watch.h"
+
+namespace rocksdb {
+
+namespace {
+
+// Appends `block_contents` to `file` and records where the block was placed.
+//   @offset        in: file position at which the block starts;
+//                  out: moved past the block, only if the append succeeded.
+//   @block_handle  receives the start offset and size of the block (filled
+//                  in before the append is attempted).
+Status WriteBlock(const Slice& block_contents, WritableFile* file,
+                  uint64_t* offset, BlockHandle* block_handle) {
+  const size_t block_size = block_contents.size();
+  block_handle->set_offset(*offset);
+  block_handle->set_size(block_size);
+
+  Status append_status = file->Append(block_contents);
+  if (!append_status.ok()) {
+    return append_status;
+  }
+
+  *offset += block_size;
+  return append_status;
+}
+
+}  // namespace
+
+// kPlainTableMagicNumber was picked by running
+//    echo rocksdb.plain.table | sha1sum
+// and taking the leading 64 bits.
+extern const uint64_t kPlainTableMagicNumber = 0x4f3418eb7a8f13b8ull;
+
+// Sets up a builder that appends to `file`. `user_key_len` is stored both in
+// the builder and in the table's fixed_key_len property.
+PlainTableBuilder::PlainTableBuilder(const Options& options,
+                                     WritableFile* file,
+                                     uint32_t user_key_len)
+    : options_(options), file_(file), user_key_len_(user_key_len) {
+  // Plain table currently has no persistent index or filter block; say so
+  // explicitly in the properties.
+  properties_.index_size = 0;
+  properties_.filter_size = 0;
+  properties_.format_version = 0;
+
+  // All of the data is written as one big chunk.
+  properties_.num_data_blocks = 1;
+
+  properties_.fixed_key_len = user_key_len;
+}
+
+// Nothing to release here; in particular the output file is left untouched.
+PlainTableBuilder::~PlainTableBuilder() {}
+
+// Appends one key/value entry to the data section of the file.
+//
+// Entry layout:
+//   [varint32 user key length]  only when keys are variable length
+//   key bytes                   the full internal key, or the user key
+//                               followed by the one-byte marker
+//                               PlainTableFactory::kValueTypeSeqId0 when
+//                               sequence == 0 and type == kTypeValue
+//   varint32 value length
+//   value bytes
+//
+// The key is validated before anything is written, so a malformed key does
+// not leave a stray length prefix in the file. The first failure (bad key or
+// failed append) is kept in status_ and later calls become no-ops.
+void PlainTableBuilder::Add(const Slice& key, const Slice& value) {
+  if (!status_.ok()) {
+    return;
+  }
+
+  // An internal key carries an 8-byte trailer; anything shorter would make
+  // the subtraction below wrap around.
+  ParsedInternalKey parsed_key;
+  if (key.size() < 8 || !ParseInternalKey(key, &parsed_key)) {
+    status_ = Status::Corruption(Slice());
+    return;
+  }
+  const size_t user_key_size = key.size() - 8;
+  assert(user_key_len_ == 0 || user_key_size == user_key_len_);
+
+  // Appends `data` to the file. offset_ advances only when the write
+  // succeeded; otherwise the error is recorded in status_.
+  auto append = [this](const Slice& data) {
+    status_ = file_->Append(data);
+    if (status_.ok()) {
+      offset_ += data.size();
+    }
+    return status_.ok();
+  };
+
+  if (!IsFixedLength()) {
+    // Write key length
+    key_size_str_.clear();
+    PutVarint32(&key_size_str_, static_cast<uint32_t>(user_key_size));
+    if (!append(key_size_str_)) {
+      return;
+    }
+  }
+
+  // Write key
+  if (parsed_key.sequence == 0 && parsed_key.type == kTypeValue) {
+    // Compact form: user key plus a single marker byte instead of the
+    // 8-byte trailer.
+    const char seq0_marker = PlainTableFactory::kValueTypeSeqId0;
+    if (!append(Slice(key.data(), user_key_size)) ||
+        !append(Slice(&seq0_marker, 1))) {
+      return;
+    }
+  } else if (!append(key)) {
+    return;
+  }
+
+  // Write value length
+  value_size_str_.clear();
+  PutVarint32(&value_size_str_, static_cast<uint32_t>(value.size()));
+  if (!append(value_size_str_)) {
+    return;
+  }
+
+  // Write value
+  if (!append(value)) {
+    return;
+  }
+
+  properties_.num_entries++;
+  properties_.raw_key_size += key.size();
+  properties_.raw_value_size += value.size();
+
+  // notify property collectors
+  NotifyCollectTableCollectorsOnAdd(
+      key,
+      value,
+      options_.table_properties_collectors,
+      options_.info_log.get()
+  );
+}
+
+// Reports the status currently recorded by the builder.
+Status PlainTableBuilder::status() const {
+  return status_;
+}
+
+// Finalizes the table by appending, after the data section:
+//   1. [meta block: properties]
+//   2. [metaindex block]
+//   3. [footer]
+// Returns the status of the first failing write, or OK. A write failure is
+// also stored in status_ so that status() agrees with the value returned
+// here; an error recorded earlier is never overwritten by a success.
+Status PlainTableBuilder::Finish() {
+  assert(!closed_);
+  closed_ = true;
+
+  // Everything written up to this point is row data.
+  properties_.data_size = offset_;
+
+  MetaIndexBuilder meta_index_builder;
+
+  PropertyBlockBuilder property_block_builder;
+  // -- Add basic properties
+  property_block_builder.AddTableProperty(properties_);
+
+  // -- Add user collected properties
+  NotifyCollectTableCollectorsOnFinish(
+      options_.table_properties_collectors,
+      options_.info_log.get(),
+      &property_block_builder
+  );
+
+  // -- Write property block
+  BlockHandle property_block_handle;
+  auto s = WriteBlock(
+      property_block_builder.Finish(),
+      file_,
+      &offset_,
+      &property_block_handle
+  );
+  if (!s.ok()) {
+    status_ = s;
+    return s;
+  }
+  meta_index_builder.Add(kPropertiesBlock, property_block_handle);
+
+  // -- write metaindex block
+  BlockHandle metaindex_block_handle;
+  s = WriteBlock(
+      meta_index_builder.Finish(),
+      file_,
+      &offset_,
+      &metaindex_block_handle
+  );
+  if (!s.ok()) {
+    status_ = s;
+    return s;
+  }
+
+  // Write Footer. Plain table has no index block, hence the null handle.
+  Footer footer(kPlainTableMagicNumber);
+  footer.set_metaindex_handle(metaindex_block_handle);
+  footer.set_index_handle(BlockHandle::NullBlockHandle());
+  std::string footer_encoding;
+  footer.EncodeTo(&footer_encoding);
+  s = file_->Append(footer_encoding);
+  if (s.ok()) {
+    offset_ += footer_encoding.size();
+  } else {
+    status_ = s;
+  }
+
+  return s;
+}
+
+// Marks the builder as closed without writing anything further.
+void PlainTableBuilder::Abandon() { closed_ = true; }
+
+// Returns the entry count tracked in the table properties.
+uint64_t PlainTableBuilder::NumEntries() const {
+  const uint64_t entry_count = properties_.num_entries;
+  return entry_count;
+}
+
+// Returns the running write offset, i.e. the bytes accounted for so far.
+uint64_t PlainTableBuilder::FileSize() const {
+  const uint64_t bytes_so_far = offset_;
+  return bytes_so_far;
+}
+
+}  // namespace rocksdb
--- a/table/plain_table_builder.h
+++ b/table/plain_table_builder.h
@@ -0,0 +1,85 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// IndexedTable is a simple table format for UNIT TEST ONLY. It is not built
+// as production quality.
+
+#pragma once
+#include <stdint.h>
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "table/table_builder.h"
+#include "rocksdb/table_properties.h"
+
+namespace rocksdb {
+
+class BlockBuilder;
+class BlockHandle;
+class WritableFile;
+class TableBuilder;
+
+class PlainTableBuilder: public TableBuilder {
+public:
+  // Create a builder that will store the contents of the table it is
+  // building in *file.  Does not close the file.  It is up to the
+  // caller to close the file after calling Finish().  user_key_size is
+  // the fixed length of every user key; 0 means that keys are of variable
+  // length, in which case each key is written with a length prefix.
+  PlainTableBuilder(const Options& options, WritableFile* file,
+                    uint32_t user_key_size);
+
+  // REQUIRES: Either Finish() or Abandon() has been called.
+  ~PlainTableBuilder();
+
+  // Add key,value to the table being constructed.
+  // REQUIRES: key is after any previously added key according to comparator.
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Add(const Slice& key, const Slice& value) override;
+
+  // Return non-ok iff some error has been detected.
+  Status status() const override;
+
+  // Finish building the table.  Stops using the file passed to the
+  // constructor after this function returns.
+  // REQUIRES: Finish(), Abandon() have not been called
+  Status Finish() override;
+
+  // Indicate that the contents of this builder should be abandoned.  Stops
+  // using the file passed to the constructor after this function returns.
+  // If the caller is not going to call Finish(), it must call Abandon()
+  // before destroying this builder.
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Abandon() override;
+
+  // Number of calls to Add() so far.
+  uint64_t NumEntries() const override;
+
+  // Size of the file generated so far.  If invoked after a successful
+  // Finish() call, returns the size of the final generated file.
+  uint64_t FileSize() const override;
+
+private:
+  Options options_;  // copy of the options given to the constructor
+  WritableFile* file_;  // output file; never closed by the builder
+  uint64_t offset_ = 0;  // running write position, reported by FileSize()
+  Status status_;  // reported by status()
+  TableProperties properties_;  // written as the properties block by Finish()
+
+  const size_t user_key_len_;  // 0 means variable-length keys
+  bool closed_ = false;  // Either Finish() or Abandon() has been called.
+
+  std::string key_size_str_;  // scratch buffer for the encoded key length
+  std::string value_size_str_;  // scratch buffer for the encoded value length
+
+  bool IsFixedLength() const {  // true when all user keys share one length
+    return user_key_len_ > 0;
+  }
+
+  // No copying allowed
+  PlainTableBuilder(const PlainTableBuilder&) = delete;
+  void operator=(const PlainTableBuilder&) = delete;
+};
+
+}  // namespace rocksdb
+
--- a/table/plain_table_factory.cc
+++ b/table/plain_table_factory.cc
@@ -0,0 +1,40 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/plain_table_factory.h"
+
+#include <memory>
+#include <stdint.h>
+#include "db/dbformat.h"
+#include "table/plain_table_builder.h"
+#include "table/plain_table_reader.h"
+#include "port/port.h"
+
+namespace rocksdb {
+
+// Opens `file` as a plain table, passing the factory's bloom filter and hash
+// table settings on to the reader.
+Status PlainTableFactory::NewTableReader(
+    const Options& options, const EnvOptions& soptions,
+    const InternalKeyComparator& icomp, unique_ptr<RandomAccessFile>&& file,
+    uint64_t file_size, unique_ptr<TableReader>* table) const {
+  Status open_status =
+      PlainTableReader::Open(options, soptions, icomp, std::move(file),
+                             file_size, table, bloom_bits_per_key_,
+                             hash_table_ratio_);
+  return open_status;
+}
+
+// Creates a builder that writes to `file` using the factory's user key
+// length. `internal_comparator` and `compression_type` are not used here.
+TableBuilder* PlainTableFactory::NewTableBuilder(
+    const Options& options, const InternalKeyComparator& internal_comparator,
+    WritableFile* file, CompressionType compression_type) const {
+  TableBuilder* builder = new PlainTableBuilder(options, file, user_key_len_);
+  return builder;
+}
+
+// Allocates a PlainTableFactory with the given settings; the returned
+// object comes from `new`.
+extern TableFactory* NewPlainTableFactory(uint32_t user_key_len,
+                                          int bloom_bits_per_key,
+                                          double hash_table_ratio) {
+  TableFactory* factory = new PlainTableFactory(
+      user_key_len, bloom_bits_per_key, hash_table_ratio);
+  return factory;
+}
+
+}  // namespace rocksdb
--- a/table/plain_table_factory.h
+++ b/table/plain_table_factory.h
@@ -0,0 +1,76 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <memory>
+#include <stdint.h>
+
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+
+namespace rocksdb {
+
+struct Options;
+struct EnvOptions;
+
+using std::unique_ptr;
+class Status;
+class RandomAccessFile;
+class WritableFile;
+class Table;
+class TableBuilder;
+
+// PlainTable keys either all have one fixed length, configured as a
+// constructor parameter of the factory class, or vary. Output file format:
+// +-------------+-----------------+
+// | version     | user_key_length |
+// +------------++------------------------------+  <= key1 offset
+// | [key_size] |  key1       | value_size  |   |
+// +------------+-------------+-------------+   |
+// | value1                                     |
+// |                                            |
+// +----------------------------------------+---+  <= key2 offset
+// | [key_size] |  key2       | value_size  |   |
+// +------------+-------------+-------------+   |
+// | value2                                     |
+// |                                            |
+// |        ......                              |
+// +-----------------+--------------------------+
+// If user_key_length = kPlainTableVariableLength, it means the key is variable
+// length, there will be an extra field for key size encoded before every key.
+class PlainTableFactory : public TableFactory {
+ public:
+  ~PlainTableFactory() {}
+  // user_key_len is the length of the user key. If it is set to be
+  // kPlainTableVariableLength, then it means variable length. Otherwise, all
+  // keys need to have the fixed length of this value. bloom_bits_per_key is
+  // the number of bits used for the bloom filter per key. hash_table_ratio is
+  // the desired utilization of the hash table used for prefix hashing.
+  // hash_table_ratio = number of prefixes / #buckets in the hash table
+  explicit PlainTableFactory(uint32_t user_key_len = kPlainTableVariableLength,
+                             int bloom_bits_per_key = 0,
+                             double hash_table_ratio = 0.75)
+      : user_key_len_(user_key_len),
+        bloom_bits_per_key_(bloom_bits_per_key),
+        hash_table_ratio_(hash_table_ratio) {}
+  const char* Name() const override { return "PlainTable"; }
+  Status NewTableReader(const Options& options, const EnvOptions& soptions,
+                        const InternalKeyComparator& internal_comparator,
+                        unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
+                        unique_ptr<TableReader>* table) const override;
+  TableBuilder* NewTableBuilder(const Options& options,
+                                const InternalKeyComparator& icomparator,
+                                WritableFile* file,
+                                CompressionType compression_type) const
+      override;
+
+  static const char kValueTypeSeqId0 = 0xFF;  // marker byte stored after the user key, in place of the 8-byte trailer, for seq-0 value keys
+
+ private:
+  uint32_t user_key_len_;  // handed to every builder created
+  int bloom_bits_per_key_;  // handed to every reader opened
+  double hash_table_ratio_;  // handed to every reader opened
+};
+
+}  // namespace rocksdb
--- a/table/plain_table_reader.cc
+++ b/table/plain_table_reader.cc
@@ -0,0 +1,695 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/plain_table_reader.h"
+
+#include <string>
+
+#include "db/dbformat.h"
+
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/statistics.h"
+
+#include "table/block.h"
+#include "table/filter_block.h"
+#include "table/format.h"
+#include "table/meta_blocks.h"
+#include "table/two_level_iterator.h"
+#include "table/plain_table_factory.h"
+
+#include "util/coding.h"
+#include "util/dynamic_bloom.h"
+#include "util/hash.h"
+#include "util/histogram.h"
+#include "util/murmurhash.h"
+#include "util/perf_context_imp.h"
+#include "util/stop_watch.h"
+
+
+namespace rocksdb {
+
+namespace {
+
+// Hashes the bytes of `s` using the fixed seed 397.
+inline uint32_t GetSliceHash(Slice const& s) {
+  const char* const bytes = s.data();
+  const size_t length = s.size();
+  return Hash(bytes, length, 397);
+}
+
+// Maps `hash` onto a bucket index in [0, num_buckets).
+inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) {
+  const uint32_t bucket_id = hash % num_buckets;
+  return bucket_id;
+}
+
+}  // namespace
+
+// Iterator over the rows of a plain table, backed by a PlainTableReader.
+class PlainTableIterator : public Iterator {
+ public:
+  explicit PlainTableIterator(PlainTableReader* table);
+  ~PlainTableIterator();
+
+  bool Valid() const override;
+
+  void SeekToFirst() override;
+
+  void SeekToLast() override;
+
+  void Seek(const Slice& target) override;
+
+  void Next() override;
+
+  void Prev() override;
+
+  Slice key() const override;
+
+  Slice value() const override;
+
+  Status status() const override;
+
+ private:
+  PlainTableReader* table_;
+  // File offset of the current row; Valid() checks it against the table's
+  // data range.
+  uint32_t offset_;
+  uint32_t next_offset_;
+  Slice key_;
+  Slice value_;
+  Status status_;
+  std::string tmp_str_;
+  // No copying allowed. The assignment operator takes the iterator's own
+  // type so that it really is the copy assignment that gets deleted.
+  PlainTableIterator(const PlainTableIterator&) = delete;
+  void operator=(const PlainTableIterator&) = delete;
+};
+
+extern const uint64_t kPlainTableMagicNumber;
+// Stores the reader configuration and a copy of `table_properties`. The end
+// of the data section and the fixed user key length are taken from those
+// properties. Both are read from the constructor argument rather than from
+// the table_properties_ member, so the result does not depend on the order
+// in which the members are declared in the class.
+PlainTableReader::PlainTableReader(const EnvOptions& storage_options,
+                                   const InternalKeyComparator& icomparator,
+                                   uint64_t file_size, int bloom_bits_per_key,
+                                   double hash_table_ratio,
+                                   const TableProperties& table_properties)
+    : soptions_(storage_options),
+      internal_comparator_(icomparator),
+      file_size_(file_size),
+      kHashTableRatio(hash_table_ratio),
+      kBloomBitsPerKey(bloom_bits_per_key),
+      table_properties_(table_properties),
+      data_end_offset_(table_properties.data_size),
+      user_key_len_(table_properties.fixed_key_len) {}
+
+// Frees the bloom filter and the two index arrays held by the reader.
+PlainTableReader::~PlainTableReader() {
+  delete bloom_;
+  delete[] sub_index_;
+  delete[] hash_table_;
+}
+
+// Opens `file` as a plain table and builds its in-memory index.
+//   table_reader (out): set to the new reader only on success.
+// Returns NotSupported when mmap reads are disabled or the file exceeds
+// kMaxFileSize, and otherwise the first error hit while reading the table
+// properties or populating the index.
+Status PlainTableReader::Open(const Options& options,
+                              const EnvOptions& soptions,
+                              const InternalKeyComparator& internal_comparator,
+                              unique_ptr<RandomAccessFile>&& file,
+                              uint64_t file_size,
+                              unique_ptr<TableReader>* table_reader,
+                              const int bloom_bits_per_key,
+                              double hash_table_ratio) {
+  assert(options.allow_mmap_reads);
+  // PopulateIndex() reads the whole file with a null scratch buffer and
+  // expects to get mmapped memory back, so fail cleanly in builds where the
+  // assert above is compiled out.
+  if (!options.allow_mmap_reads) {
+    return Status::NotSupported(
+        "PlainTableReader requires allow_mmap_reads to be set");
+  }
+
+  if (file_size > kMaxFileSize) {
+    return Status::NotSupported("File is too large for PlainTableReader!");
+  }
+
+  TableProperties table_properties;
+  auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber,
+                               options.env, options.info_log.get(),
+                               &table_properties);
+  if (!s.ok()) {
+    return s;
+  }
+
+  std::unique_ptr<PlainTableReader> new_reader(new PlainTableReader(
+      soptions, internal_comparator, file_size, bloom_bits_per_key,
+      hash_table_ratio, table_properties));
+  new_reader->file_ = std::move(file);
+  new_reader->options_ = options;
+
+  // -- Populate Index
+  s = new_reader->PopulateIndex();
+  if (!s.ok()) {
+    return s;
+  }
+
+  *table_reader = std::move(new_reader);
+  return s;
+}
+
+// Intentionally a no-op for plain table.
+void PlainTableReader::SetupForCompaction() {}
+
+// Always answers "maybe": `internal_prefix` is not inspected.
+bool PlainTableReader::PrefixMayMatch(const Slice& internal_prefix) {
+  const bool may_match = true;
+  return may_match;
+}
+
+// Returns a newly allocated iterator over this table. `options` is unused.
+Iterator* PlainTableReader::NewIterator(const ReadOptions& options) {
+  Iterator* iter = new PlainTableIterator(this);
+  return iter;
+}
+
+// One indexed row, collected while the file is scanned to build the index.
+struct PlainTableReader::IndexRecord {
+  // hash of the prefix
+  uint32_t hash;
+  // offset of a row
+  uint32_t offset;
+  // link to another record; used to chain the records of one hash bucket
+  IndexRecord* next;
+};
+
+// Helper class to track all the index records. Records live in separately
+// allocated groups (arrays of kNumRecordsPerGroup entries), so adding a
+// record never moves an existing one and pointers returned by At() remain
+// valid for as long as the list exists.
+class PlainTableReader::IndexRecordList {
+ public:
+  // num_records_per_group must be positive. The current-group counter starts
+  // out "full" so that the first AddRecord() allocates the first group.
+  explicit IndexRecordList(size_t num_records_per_group)
+      : kNumRecordsPerGroup(num_records_per_group),
+        current_group_(nullptr),
+        num_records_in_current_group_(num_records_per_group) {
+    assert(num_records_per_group > 0);
+  }
+
+  ~IndexRecordList() {
+    for (IndexRecord* group : groups_) {
+      delete[] group;
+    }
+  }
+
+  // The list owns raw arrays; a copy would free them a second time.
+  IndexRecordList(const IndexRecordList&) = delete;
+  void operator=(const IndexRecordList&) = delete;
+
+  // Appends a record with the given prefix hash and row offset. `hash` has
+  // the same type as IndexRecord::hash, the field it is stored in.
+  void AddRecord(uint32_t hash, uint32_t offset) {
+    if (num_records_in_current_group_ == kNumRecordsPerGroup) {
+      current_group_ = AllocateNewGroup();
+      num_records_in_current_group_ = 0;
+    }
+    auto& new_record = current_group_[num_records_in_current_group_++];
+    new_record.hash = hash;
+    new_record.offset = offset;
+    new_record.next = nullptr;
+  }
+
+  // Number of records added so far.
+  size_t GetNumRecords() const {
+    if (groups_.empty()) {
+      return 0;
+    }
+    // Every group but the last one is full.
+    return (groups_.size() - 1) * kNumRecordsPerGroup +
+           num_records_in_current_group_;
+  }
+
+  // Returns the index-th record added. REQUIRES: index < GetNumRecords().
+  IndexRecord* At(size_t index) {
+    return &(groups_[index / kNumRecordsPerGroup][index % kNumRecordsPerGroup]);
+  }
+
+ private:
+  IndexRecord* AllocateNewGroup() {
+    IndexRecord* result = new IndexRecord[kNumRecordsPerGroup];
+    groups_.push_back(result);
+    return result;
+  }
+
+  const size_t kNumRecordsPerGroup;
+  IndexRecord* current_group_;
+  // List of arrays allocated
+  std::vector<IndexRecord*> groups_;
+  size_t num_records_in_current_group_;
+};
+
+// Scans every row of the data section and appends to `record_list` one
+// (prefix hash, row offset) record for the first row of each run of rows
+// sharing a prefix, and then for every kIndexIntervalForSamePrefixKeys-th
+// row of that run. Returns the number of such runs.
+//
+// If a row cannot be read the scan stops at once: status_ holds the error
+// and the records collected up to that point are kept. Without that check a
+// failed read, which may leave `pos` where it was, would repeat forever.
+int PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list) {
+  Slice prev_key_prefix_slice;
+  uint32_t prev_key_prefix_hash = 0;
+  uint32_t pos = data_start_offset_;
+  int key_index_within_prefix = 0;
+  bool is_first_record = true;
+  HistogramImpl keys_per_prefix_hist;
+
+  int num_prefixes = 0;
+  while (pos < data_end_offset_) {
+    uint32_t key_offset = pos;
+    ParsedInternalKey key;
+    Slice value_slice;
+    status_ = Next(pos, &key, &value_slice, pos);
+    if (!status_.ok()) {
+      // `key` is not usable and `pos` cannot be trusted to have advanced.
+      break;
+    }
+    Slice key_prefix_slice = GetPrefix(key);
+
+    if (is_first_record || prev_key_prefix_slice != key_prefix_slice) {
+      ++num_prefixes;
+      if (!is_first_record) {
+        keys_per_prefix_hist.Add(key_index_within_prefix);
+      }
+      key_index_within_prefix = 0;
+      prev_key_prefix_slice = key_prefix_slice;
+      prev_key_prefix_hash = GetSliceHash(key_prefix_slice);
+    }
+
+    if (key_index_within_prefix++ % kIndexIntervalForSamePrefixKeys == 0) {
+      // Add an index key for every kIndexIntervalForSamePrefixKeys keys
+      record_list->AddRecord(prev_key_prefix_hash, key_offset);
+    }
+    is_first_record = false;
+  }
+
+  keys_per_prefix_hist.Add(key_index_within_prefix);
+  Log(options_.info_log, "Number of Keys per prefix Histogram: %s",
+      keys_per_prefix_hist.ToString().c_str());
+
+  return num_prefixes;
+}
+
+// Sizes and allocates the hash table and, when kBloomBitsPerKey > 0, the
+// bloom filter for `num_prefixes` prefixes. Any table or filter allocated by
+// an earlier call is released first, so calling this again does not leak.
+void PlainTableReader::AllocateIndexAndBloom(int num_prefixes) {
+  delete[] hash_table_;
+  delete bloom_;
+  bloom_ = nullptr;
+
+  if (kBloomBitsPerKey > 0) {
+    bloom_ = new DynamicBloom(num_prefixes * kBloomBitsPerKey);
+  }
+  // A ratio above 1.0 is treated as 1.0, i.e. one bucket per prefix.
+  double hash_table_size_multiplier =
+      (kHashTableRatio > 1.0) ? 1.0 : 1.0 / kHashTableRatio;
+  // The extra bucket keeps the table non-empty when there are no prefixes.
+  hash_table_size_ = num_prefixes * hash_table_size_multiplier + 1;
+  hash_table_ = new uint32_t[hash_table_size_];
+}
+
+size_t PlainTableReader::BucketizeIndexesAndFillBloom(
+    IndexRecordList& record_list, int num_prefixes,  // num_prefixes is not used
+    std::vector<IndexRecord*>* hash_to_offsets,  // out: head of each bucket's list
+    std::vector<uint32_t>* bucket_count) {  // out: number of records per bucket
+  size_t sub_index_size_needed = 0;  // returned: bytes estimated for the sub index
+  bool first = true;
+  uint32_t prev_hash = 0;
+  size_t num_records = record_list.GetNumRecords();
+  for (size_t i = 0; i < num_records; i++) {
+    IndexRecord* index_record = record_list.At(i);
+    uint32_t cur_hash = index_record->hash;
+    if (first || prev_hash != cur_hash) {  // hash differs from the previous record's
+      prev_hash = cur_hash;
+      first = false;
+      if (bloom_) {
+        bloom_->AddHash(cur_hash);
+      }
+    }
+    uint32_t bucket = GetBucketIdFromHash(cur_hash, hash_table_size_);
+    IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket];
+    index_record->next = prev_bucket_head;  // push front: the list ends up reversed
+    (*hash_to_offsets)[bucket] = index_record;
+    auto& item_count = (*bucket_count)[bucket];
+    if (item_count > 0) {  // only buckets with 2+ records take sub index space
+      if (item_count == 1) {
+        sub_index_size_needed += kOffsetLen + 1;  // the first record plus one length byte
+      }
+      if (item_count == 127) {
+        // Need more than one byte for length
+        sub_index_size_needed++;
+      }
+      sub_index_size_needed += kOffsetLen;  // the record being added now
+    }
+    item_count++;
+  }
+  return sub_index_size_needed;
+}
+
+void PlainTableReader::FillIndexes(  // fills hash_table_ and allocates and fills sub_index_
+    size_t sub_index_size_needed,  // size estimate passed in by the caller
+    const std::vector<IndexRecord*>& hash_to_offsets,  // head of each bucket's list
+    const std::vector<uint32_t>& bucket_count) {  // number of records per bucket
+  Log(options_.info_log, "Reserving %zu bytes for sub index",
+      sub_index_size_needed);
+  // Slack of 8 * 8 = 64 bytes on top of the estimate, for variable length sizes
+  size_t buffer_size = 8 * 8;
+  size_t buffer_used = 0;
+  sub_index_size_needed += buffer_size;
+  sub_index_ = new char[sub_index_size_needed];
+  size_t sub_index_offset = 0;  // next free byte in sub_index_
+  char* prev_ptr;
+  char* cur_ptr;
+  uint32_t* sub_index_ptr;
+  for (int i = 0; i < hash_table_size_; i++) {
+    uint32_t num_keys_for_bucket = bucket_count[i];
+    switch (num_keys_for_bucket) {
+    case 0:
+      // No key for bucket
+      hash_table_[i] = data_end_offset_;
+      break;
+    case 1:
+      // point directly to the file offset
+      hash_table_[i] = hash_to_offsets[i]->offset;
+      break;
+    default:
+      // point to second level indexes.
+      hash_table_[i] = sub_index_offset | kSubIndexMask;  // mask flags the value as a sub index offset
+      prev_ptr = sub_index_ + sub_index_offset;
+      cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket);  // record count comes first
+      sub_index_offset += (cur_ptr - prev_ptr);
+      if (cur_ptr - prev_ptr > 2  // NOTE(review): looks like "length took more bytes than estimated" - confirm
+          || (cur_ptr - prev_ptr == 2 && num_keys_for_bucket <= 127)) {
+        // Need to resize sub_index. Exponentially grow buffer.
+        buffer_used += cur_ptr - prev_ptr - 1;
+        if (buffer_used + 4 > buffer_size) {
+          Log(options_.info_log, "Recalculate suffix_map length to %zu",
+              sub_index_size_needed);
+
+          sub_index_size_needed += buffer_size;
+          buffer_size *= 2;
+          char* new_sub_index = new char[sub_index_size_needed];
+          memcpy(new_sub_index, sub_index_, sub_index_offset);
+          delete[] sub_index_;
+          sub_index_ = new_sub_index;
+        }
+      }
+      sub_index_ptr = (uint32_t*) (sub_index_ + sub_index_offset);  // offsets follow the count; NOTE(review): no alignment is enforced here
+      IndexRecord* record = hash_to_offsets[i];
+      int j;
+      for (j = num_keys_for_bucket - 1; j >= 0 && record;  // walk the list, filling slots from the last one down
+           j--, record = record->next) {
+        sub_index_ptr[j] = record->offset;
+      }
+      assert(j == -1 && record == nullptr);  // list length must equal the bucket count
+      sub_index_offset += kOffsetLen * num_keys_for_bucket;
+      break;
+    }
+  }
+
+  Log(options_.info_log, "hash table size: %d, suffix_map length %zu",
+      hash_table_size_, sub_index_size_needed);
+}
+
+// Reads the whole file into file_data_ and builds the in-memory index (hash
+// table, sub index and optional bloom filter) from it.
+// Returns the error from reading the file, or the error left in status_ by
+// the row scan, instead of reporting OK for an index built from rows that
+// could not be read.
+Status PlainTableReader::PopulateIndex() {
+  // Get mmapped memory to file_data_.
+  Status s = file_->Read(0, file_size_, &file_data_, nullptr);
+  if (!s.ok()) {
+    return s;
+  }
+
+  IndexRecordList record_list(kRecordsPerGroup);
+  // First, read the whole file, for every kIndexIntervalForSamePrefixKeys rows
+  // for a prefix (starting from the first one), generate a record of (hash,
+  // offset) and append it to IndexRecordList, which is a data structure created
+  // to store them.
+  int num_prefixes = PopulateIndexRecordList(&record_list);
+  if (!status_.ok()) {
+    // The scan records the outcome of reading rows in status_.
+    return status_;
+  }
+  // Calculated hash table and bloom filter size and allocate memory for indexes
+  // and bloom filter based on the number of prefixes.
+  AllocateIndexAndBloom(num_prefixes);
+
+  // Bucketize all the index records to a temp data structure, in which for
+  // each bucket, we generate a linked list of IndexRecord, in reversed order.
+  std::vector<IndexRecord*> hash_to_offsets(hash_table_size_, nullptr);
+  std::vector<uint32_t> bucket_count(hash_table_size_, 0);
+  size_t sub_index_size_needed = BucketizeIndexesAndFillBloom(
+      record_list, num_prefixes, &hash_to_offsets, &bucket_count);
+  // From the temp data structure, populate indexes.
+  FillIndexes(sub_index_size_needed, hash_to_offsets, bucket_count);
+
+  return Status::OK();
+}
+
+// Finds the file offset at which a lookup of `target` should start scanning.
+//   prefix, prefix_hash: the prefix being looked up and its hash.
+//   prefix_matched (out): true only when a key read here either equals
+//       target or is known to share `prefix`; otherwise the caller still has
+//       to verify the prefix of the row at ret_offset.
+//   ret_offset (out): offset to start from, or data_end_offset_ when the
+//       bucket is empty or holds no candidate.
+// Returns Corruption when `target` or a key read from the file cannot be
+// decoded; the out parameters are not meaningful in that case.
+Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix,
+                                   uint32_t prefix_hash, bool& prefix_matched,
+                                   uint32_t& ret_offset) {
+  prefix_matched = false;
+  int bucket = GetBucketIdFromHash(prefix_hash, hash_table_size_);
+  uint32_t bucket_value = hash_table_[bucket];
+  if (bucket_value == data_end_offset_) {
+    // Empty bucket.
+    ret_offset = data_end_offset_;
+    return Status::OK();
+  } else if ((bucket_value & kSubIndexMask) == 0) {
+    // point directly to the file
+    ret_offset = bucket_value;
+    return Status::OK();
+  }
+
+  // point to sub-index, need to do a binary search
+  uint32_t low = 0;
+  uint64_t prefix_index_offset = bucket_value ^ kSubIndexMask;
+
+  // The sub index entry is a varint32 count followed by that many offsets.
+  const char* index_ptr = sub_index_ + prefix_index_offset;
+  uint32_t upper_bound = 0;
+  const uint32_t* base_ptr = (const uint32_t*) GetVarint32Ptr(index_ptr,
+                                                              index_ptr + 4,
+                                                              &upper_bound);
+  uint32_t high = upper_bound;
+  ParsedInternalKey mid_key;
+  ParsedInternalKey parsed_target;
+  if (!ParseInternalKey(target, &parsed_target)) {
+    return Status::Corruption(Slice());
+  }
+
+  // The key is between [low, high). Do a binary search between it.
+  while (high - low > 1) {
+    uint32_t mid = (high + low) / 2;
+    uint32_t file_offset = base_ptr[mid];
+    size_t tmp;
+    Status s = ReadKey(file_data_.data() + file_offset, &mid_key, tmp);
+    if (!s.ok()) {
+      return s;
+    }
+    int cmp_result = internal_comparator_.Compare(mid_key, parsed_target);
+    if (cmp_result < 0) {
+      low = mid;
+    } else {
+      if (cmp_result == 0) {
+        // Happen to have found the exact key.
+        prefix_matched = true;
+        ret_offset = file_offset;
+        return Status::OK();
+      } else {
+        high = mid;
+      }
+    }
+  }
+  // Both of the key at the position low or low+1 could share the same
+  // prefix as target. We need to rule out one of them to avoid to go
+  // to the wrong prefix.
+  ParsedInternalKey low_key;
+  size_t tmp;
+  uint32_t low_key_offset = base_ptr[low];
+  Status s = ReadKey(file_data_.data() + low_key_offset, &low_key, tmp);
+  if (!s.ok()) {
+    // low_key was not filled in; do not derive a prefix from it.
+    return s;
+  }
+  if (GetPrefix(low_key) == prefix) {
+    prefix_matched = true;
+    ret_offset = low_key_offset;
+  } else if (low + 1 < upper_bound) {
+    // There is possible a next prefix, return it
+    prefix_matched = false;
+    ret_offset = base_ptr[low + 1];
+  } else {
+    // target is larger than a key of the last prefix in this bucket
+    // but with a different prefix. Key does not exist.
+    ret_offset = data_end_offset_;
+  }
+  return Status::OK();
+}
+
+// Without a bloom filter every prefix may be present; with one, the filter
+// is asked about `hash`.
+bool PlainTableReader::MayHavePrefix(uint32_t hash) {
+  if (bloom_ == nullptr) {
+    return true;
+  }
+  return bloom_->MayContainHash(hash);
+}
+
+// Applies the configured prefix extractor to the user key part of `target`.
+Slice PlainTableReader::GetPrefix(const ParsedInternalKey& target) {
+  const Slice& user_key = target.user_key;
+  return options_.prefix_extractor->Transform(user_key);
+}
+
+// Decodes the key of the row starting at `row_ptr`.
+//   key (out): the decoded internal key; its user_key refers to the bytes at
+//       the row itself, no copy is made.
+//   bytes_read (out): number of bytes the key occupies in the row, including
+//       the length prefix of a variable-length key.
+// Returns Corruption when the key would run past the end of the data
+// section or cannot be parsed.
+Status PlainTableReader::ReadKey(const char* row_ptr, ParsedInternalKey* key,
+                                 size_t& bytes_read) {
+  const char* key_ptr = nullptr;
+  bytes_read = 0;
+  size_t user_key_size = 0;
+  if (IsFixedLength()) {
+    user_key_size = user_key_len_;
+    key_ptr = row_ptr;
+  } else {
+    // Variable-length keys are preceded by their varint32 length.
+    uint32_t tmp_size = 0;
+    key_ptr = GetVarint32Ptr(row_ptr, file_data_.data() + data_end_offset_,
+                             &tmp_size);
+    if (key_ptr == nullptr) {
+      return Status::Corruption("Unable to read the next key");
+    }
+    user_key_size = (size_t)tmp_size;
+    bytes_read = key_ptr - row_ptr;
+  }
+  if (key_ptr + user_key_size + 1 >= file_data_.data() + data_end_offset_) {
+    return Status::Corruption("Unable to read the next key");
+  }
+
+  if (*(key_ptr + user_key_size) == PlainTableFactory::kValueTypeSeqId0) {
+    // Special encoding for the row with seqID=0
+    key->user_key = Slice(key_ptr, user_key_size);
+    key->sequence = 0;
+    key->type = kTypeValue;
+    bytes_read += user_key_size + 1;
+  } else {
+    // The 8-byte trailer starts right after the user key, so the bound has
+    // to be measured from key_ptr; measuring from row_ptr would ignore the
+    // length prefix of a variable-length key and accept keys that overrun.
+    if (key_ptr + user_key_size + 8 >= file_data_.data() + data_end_offset_) {
+      return Status::Corruption("Unable to read the next key");
+    }
+    if (!ParseInternalKey(Slice(key_ptr, user_key_size + 8), key)) {
+      return Status::Corruption(Slice());
+    }
+    bytes_read += user_key_size + 8;
+  }
+
+  return Status::OK();
+}
+
+// Decodes the row that starts at `offset`.
+//   key, value (out): the row's key and value; value refers to the file
+//       data, no copy is made. Neither is touched when offset equals
+//       data_end_offset_.
+//   next_offset (out): offset of the following row. It is data_end_offset_
+//       when `offset` is already the end of the data and also on every
+//       error, so that a caller scanning forward always terminates.
+// Returns Corruption when `offset` is out of range or the row cannot be
+// decoded within the data section.
+Status PlainTableReader::Next(uint32_t offset, ParsedInternalKey* key,
+                              Slice* value, uint32_t& next_offset) {
+  // `offset` is a copy, so this is safe even when the caller passes the same
+  // variable for both arguments.
+  next_offset = data_end_offset_;
+  if (offset == data_end_offset_) {
+    return Status::OK();
+  }
+
+  if (offset > data_end_offset_) {
+    return Status::Corruption("Offset is out of file size");
+  }
+
+  const char* row_ptr = file_data_.data() + offset;
+  size_t bytes_for_key;
+  Status s = ReadKey(row_ptr, key, bytes_for_key);
+  if (!s.ok()) {
+    // bytes_for_key is not meaningful after a failed read.
+    return s;
+  }
+  uint32_t value_size;
+  const char* value_ptr = GetVarint32Ptr(row_ptr + bytes_for_key,
+                                         file_data_.data() + data_end_offset_,
+                                         &value_size);
+  if (value_ptr == nullptr) {
+    return Status::Corruption("Error reading value length.");
+  }
+  // Add up in 64 bits: a huge value_size must not wrap around and slip past
+  // the range check below.
+  const uint64_t row_end = static_cast<uint64_t>(offset) +
+                           static_cast<uint64_t>(value_ptr - row_ptr) +
+                           value_size;
+  if (row_end > data_end_offset_) {
+    return Status::Corruption("Reach end of file when reading value");
+  }
+  next_offset = static_cast<uint32_t>(row_end);
+  *value = Slice(value_ptr, value_size);
+
+  return Status::OK();
+}
+
+// Looks up the internal key `target` and feeds matching rows to `saver`.
+//
+// The prefix of `target` is checked against the bloom filter, then GetOffset()
+// supplies the offset at which scanning starts. Rows are decoded one after
+// another; every row whose key is not smaller than `target` is passed to
+// (*saver)(arg, key, value, true) until the callback returns false or the end
+// of the data section is reached. If GetOffset() did not confirm the prefix,
+// the first decoded row is compared against it and the lookup ends as soon as
+// the prefix differs.
+//
+// Returns OK when the scan finishes (including "nothing found"), the error
+// from GetOffset()/Next() if one occurs, or Corruption when `target` is not a
+// parsable internal key. `ro` and `mark_key_may_exist` are not used here.
+Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target,
+                             void* arg,
+                             bool (*saver)(void*, const ParsedInternalKey&,
+                                           const Slice&, bool),
+                             void (*mark_key_may_exist)(void*)) {
+  // A bloom filter miss means no row with this prefix is stored.
+  const Slice target_prefix = GetPrefix(target);
+  const uint32_t target_hash = GetSliceHash(target_prefix);
+  if (!MayHavePrefix(target_hash)) {
+    return Status::OK();
+  }
+
+  uint32_t row_offset;
+  bool prefix_known;
+  Status lookup_status =
+      GetOffset(target, target_prefix, target_hash, prefix_known, row_offset);
+  if (!lookup_status.ok()) {
+    return lookup_status;
+  }
+
+  ParsedInternalKey row_key;
+  ParsedInternalKey lookup_key;
+  if (!ParseInternalKey(target, &lookup_key)) {
+    return Status::Corruption(Slice());
+  }
+
+  Slice row_value;
+  while (row_offset < data_end_offset_) {
+    // Next() advances row_offset to the following row.
+    Status row_status = Next(row_offset, &row_key, &row_value, row_offset);
+    if (!row_status.ok()) {
+      return row_status;
+    }
+
+    if (!prefix_known) {
+      // The prefix of the first row still has to be verified.
+      if (GetPrefix(row_key) != target_prefix) {
+        return Status::OK();
+      }
+      prefix_known = true;
+    }
+
+    if (internal_comparator_.Compare(row_key, lookup_key) < 0) {
+      // Still in front of the requested key.
+      continue;
+    }
+    if (!(*saver)(arg, row_key, row_value, true)) {
+      break;
+    }
+  }
+  return Status::OK();
+}
+
+// Placeholder implementation: `key` is ignored and 0 is always returned, so
+// callers get no real offset estimate from this reader.
+uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& key) {
+  return 0;
+}
+
+// Creates an iterator over `table`. Both offsets start at the end of the data
+// section, so the iterator is not Valid() until one of the Seek calls runs.
+PlainTableIterator::PlainTableIterator(PlainTableReader* table)
+    : table_(table) {
+  offset_ = table_->data_end_offset_;
+  next_offset_ = offset_;
+}
+
+// Nothing to release explicitly.
+PlainTableIterator::~PlainTableIterator() {}
+
+// True while the current offset lies inside the table's data section,
+// i.e. in [data_start_offset_, data_end_offset_).
+bool PlainTableIterator::Valid() const {
+  const bool at_or_after_start = offset_ >= table_->data_start_offset_;
+  const bool before_end = offset_ < table_->data_end_offset_;
+  return before_end && at_or_after_start;
+}
+
+// Positions the iterator on the first row of the data section, or leaves it
+// invalid when the data section is empty.
+void PlainTableIterator::SeekToFirst() {
+  next_offset_ = table_->data_start_offset_;
+  if (next_offset_ < table_->data_end_offset_) {
+    // Decode the first row; Next() moves offset_ onto it.
+    Next();
+    return;
+  }
+
+  // No rows at all: park both offsets at the end.
+  offset_ = table_->data_end_offset_;
+  next_offset_ = offset_;
+}
+
+// Not supported by this iterator: the call trips an assertion. When
+// assertions are compiled out it does nothing and the position is unchanged.
+void PlainTableIterator::SeekToLast() {
+  assert(false);
+}
+
+// Positions the iterator on the first row whose key is >= `target` among the
+// rows reachable from the offset that GetOffset() reports for the target's
+// prefix. The iterator ends up invalid when the bloom filter rejects the
+// prefix, when GetOffset() fails (the error is kept in status_), when the
+// first row turns out to have a different prefix, or when the data section
+// is exhausted.
+void PlainTableIterator::Seek(const Slice& target) {
+  const Slice target_prefix = table_->GetPrefix(target);
+  const uint32_t target_hash = GetSliceHash(target_prefix);
+  // data_end_offset_ is a const member of the reader, so one read suffices.
+  const uint32_t end_offset = table_->data_end_offset_;
+
+  if (!table_->MayHavePrefix(target_hash)) {
+    offset_ = next_offset_ = end_offset;
+    return;
+  }
+
+  bool prefix_known;
+  status_ = table_->GetOffset(target, target_prefix, target_hash,
+                              prefix_known, next_offset_);
+  if (!status_.ok()) {
+    offset_ = next_offset_ = end_offset;
+    return;
+  }
+
+  if (next_offset_ >= end_offset) {
+    // The index points past the last row.
+    offset_ = end_offset;
+    return;
+  }
+
+  Next();
+  while (status_.ok() && Valid()) {
+    if (!prefix_known) {
+      // The first row's prefix has not been verified yet.
+      if (table_->GetPrefix(key()) != target_prefix) {
+        offset_ = next_offset_ = end_offset;
+        break;
+      }
+      prefix_known = true;
+    }
+    if (table_->internal_comparator_.Compare(key(), target) >= 0) {
+      break;
+    }
+    Next();
+  }
+}
+
+// Moves to the row at next_offset_ and decodes it.
+//
+// When the new position is inside the data section, the row is read through
+// PlainTableReader::Next(), which also advances next_offset_. The parsed key
+// is re-encoded into tmp_str_ so that key_ refers to storage owned by this
+// iterator; value_ is filled in by the reader. If decoding fails, the error
+// is kept in status_ and the iterator becomes invalid. When the new position
+// is at or past the end, nothing is decoded.
+void PlainTableIterator::Next() {
+  offset_ = next_offset_;
+  if (offset_ < table_->data_end_offset_) {
+    ParsedInternalKey parsed_key;
+    status_ = table_->Next(next_offset_, &parsed_key, &value_, next_offset_);
+    if (status_.ok()) {
+      // Make a copy in this case. TODO optimize.
+      tmp_str_.clear();
+      AppendInternalKey(&tmp_str_, parsed_key);
+      key_ = Slice(tmp_str_);
+    } else {
+      offset_ = next_offset_ = table_->data_end_offset_;
+    }
+  }
+}
+
+// Not supported by this iterator: the call trips an assertion. When
+// assertions are compiled out it does nothing and the position is unchanged.
+void PlainTableIterator::Prev() {
+  assert(false);
+}
+
+// Returns the encoded internal key of the current row, as stored in key_ by
+// the last successful Next(). REQUIRES: Valid().
+Slice PlainTableIterator::key() const {
+  assert(Valid());
+  return key_;
+}
+
+// Returns the value of the current row, as filled into value_ by the reader
+// during the last successful Next(). REQUIRES: Valid().
+Slice PlainTableIterator::value() const {
+  assert(Valid());
+  return value_;
+}
+
+// Returns the status recorded by the most recent Seek() or Next() that
+// touched the table.
+Status PlainTableIterator::status() const {
+  return status_;
+}
+
+}  // namespace rocksdb
--- a/table/plain_table_reader.h
+++ b/table/plain_table_reader.h
@@ -0,0 +1,220 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <unordered_map>
+#include <memory>
+#include <vector>
+#include <string>
+#include <stdint.h>
+
+#include "db/dbformat.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "table/table_reader.h"
+#include "table/plain_table_factory.h"
+
+namespace rocksdb {
+
+class Block;
+class BlockHandle;
+class Footer;
+struct Options;
+class RandomAccessFile;
+struct ReadOptions;
+class TableCache;
+class TableReader;
+class DynamicBloom;
+class InternalKeyComparator;
+
+using std::unique_ptr;
+using std::unordered_map;
+extern const uint32_t kPlainTableVariableLength;
+
+// Reader for the output file format described in plain_table_factory.h.
+// When the file is opened, PlainTableReader builds a hash table from key
+// prefixes to offsets. A bucket either holds the file offset of the first row
+// with that prefix or, when several prefixes share the bucket or one prefix
+// has too many rows, the offset of a binary-searchable index of row offsets
+// (see the comment on PopulateIndex() for the exact encoding).
+//
+// The implementation of PlainTableReader requires the file to be mmapped.
+class PlainTableReader: public TableReader {
+ public:
+  // Opens the table stored in `file` and, on success, hands the reader back
+  // through `table`.
+  static Status Open(const Options& options, const EnvOptions& soptions,
+                     const InternalKeyComparator& internal_comparator,
+                     unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
+                     unique_ptr<TableReader>* table,
+                     const int bloom_bits_per_key, double hash_table_ratio);
+
+  // TableReader interface.
+  bool PrefixMayMatch(const Slice& internal_prefix);
+
+  Iterator* NewIterator(const ReadOptions&);
+
+  Status Get(const ReadOptions&, const Slice& key, void* arg,
+             bool (*result_handler)(void* arg, const ParsedInternalKey& k,
+                                    const Slice& v, bool),
+             void (*mark_key_may_exist)(void*) = nullptr);
+
+  uint64_t ApproximateOffsetOf(const Slice& key);
+
+  void SetupForCompaction();
+
+  const TableProperties& GetTableProperties() { return table_properties_; }
+
+  PlainTableReader(const EnvOptions& storage_options,
+                   const InternalKeyComparator& internal_comparator,
+                   uint64_t file_size, int bloom_num_bits,
+                   double hash_table_ratio,
+                   const TableProperties& table_properties);
+  ~PlainTableReader();
+
+ private:
+  struct IndexRecord;
+  class IndexRecordList;
+
+  // Prefix hash table and the buffer holding the binary search indexes that
+  // flagged buckets point into; see PopulateIndex().
+  uint32_t* hash_table_ = nullptr;
+  int hash_table_size_ = 0;
+  char* sub_index_ = nullptr;
+
+  Options options_;
+  const EnvOptions& soptions_;
+  const InternalKeyComparator internal_comparator_;
+  Status status_;
+  unique_ptr<RandomAccessFile> file_;
+
+  Slice file_data_;
+  uint32_t version_;
+  uint32_t file_size_;
+
+  const double kHashTableRatio;
+  const int kBloomBitsPerKey;
+  DynamicBloom* bloom_ = nullptr;
+
+  TableProperties table_properties_;
+  // Rows occupy the file range [data_start_offset_, data_end_offset_).
+  const uint32_t data_start_offset_ = 0;
+  const uint32_t data_end_offset_;
+  // Fixed user key length, or kPlainTableVariableLength.
+  const size_t user_key_len_;
+
+  static const size_t kNumInternalBytes = 8;
+  // Flag bit of a hash bucket; see PopulateIndex().
+  static const uint32_t kSubIndexMask = 0x80000000;
+  static const size_t kOffsetLen = sizeof(uint32_t);
+  static const uint64_t kMaxFileSize = 1u << 31;
+  static const size_t kRecordsPerGroup = 256;
+  // To speed up the search for keys with same prefix, we'll add index key for
+  // every N keys, where the "N" is determined by
+  // kIndexIntervalForSamePrefixKeys
+  static const size_t kIndexIntervalForSamePrefixKeys = 16;
+
+  bool IsFixedLength() const {
+    return user_key_len_ != kPlainTableVariableLength;
+  }
+
+  size_t GetFixedInternalKeyLength() const {
+    return user_key_len_ + kNumInternalBytes;
+  }
+
+  friend class TableCache;
+  friend class PlainTableIterator;
+
+  // Internal helper function to generate an IndexRecordList object from all
+  // the rows, which contains index records as a list.
+  int PopulateIndexRecordList(IndexRecordList* record_list);
+
+  // Internal helper function to allocate memory for indexes and bloom filters
+  void AllocateIndexAndBloom(int num_prefixes);
+
+  // Internal helper function to bucket index record list to hash buckets.
+  // hash_to_offsets has hash_table_size_ entries; each one is a linked list
+  // of offsets for the hash, in reversed order.
+  // bucket_count has hash_table_size_ entries. The value is how many index
+  // records are there in hash_to_offsets for the same bucket.
+  size_t BucketizeIndexesAndFillBloom(
+      IndexRecordList& record_list, int num_prefixes,
+      std::vector<IndexRecord*>* hash_to_offsets,
+      std::vector<uint32_t>* bucket_count);
+
+  // Internal helper function to fill the indexes and bloom filters to
+  // internal data structures. hash_to_offsets and bucket_count are bucketized
+  // indexes and counts generated by BucketizeIndexesAndFillBloom().
+  void FillIndexes(size_t sub_index_size_needed,
+                   const std::vector<IndexRecord*>& hash_to_offsets,
+                   const std::vector<uint32_t>& bucket_count);
+
+  // PopulateIndex() builds index of keys. It must be called before any query
+  // to the table.
+  //
+  // hash_table_ contains hash_table_size_ buckets, each of which is a 32-bit
+  // integer. The lower 31 bits contain an offset value (explained below) and
+  // the first bit of the integer indicates type of the offset.
+  //
+  // +--------------+------------------------------------------------------+
+  // | Flag (1 bit) | Offset to binary search buffer or file (31 bits)     +
+  // +--------------+------------------------------------------------------+
+  //
+  // Explanation for the "flag bit":
+  //
+  // 0 indicates that the bucket contains only one prefix (no conflict when
+  //   hashing this prefix), whose first row starts from this offset of the
+  //   file.
+  // 1 indicates that the bucket contains more than one prefix, or there
+  //   are too many rows for one prefix so we need a binary search for it. In
+  //   this case, the offset indicates the offset of sub_index_ holding the
+  //   binary search indexes of keys for those rows. Those binary search indexes
+  //   are organized in this way:
+  //
+  // The first 4 bytes indicate how many indexes (N) are stored after them.
+  // They are followed by N 32-bit integers, each of which is a file offset
+  // pointing to the start of a row. Those offsets need to be guaranteed to be
+  // in ascending order so the keys they are pointing to are also in ascending
+  // order, to make sure we can use them to do binary searches. Below is a
+  // visual presentation of a bucket.
+  //
+  // NOTE(review): the paragraph above says the count takes 4 bytes while the
+  // layout below lists it as a varint32 - confirm against the implementation.
+  //
+  // <begin>
+  //   number_of_records:  varint32
+  //   record 1 file offset:  fixedint32
+  //   record 2 file offset:  fixedint32
+  //    ....
+  //   record N file offset:  fixedint32
+  // <end>
+  Status PopulateIndex();
+
+  // Check bloom filter to see whether it might contain this prefix.
+  // The hash of the prefix is given, since it can be reused for index lookup
+  // too.
+  bool MayHavePrefix(uint32_t hash);
+
+  // Parse the key of the row starting at row_ptr into key; bytes_read
+  // receives the number of bytes the key occupies in the row.
+  Status ReadKey(const char* row_ptr, ParsedInternalKey* key,
+                 size_t& bytes_read);
+  // Read the key and value at offset to key and value.
+  // return next_offset as the offset for the next key.
+  Status Next(uint32_t offset, ParsedInternalKey* key, Slice* value,
+              uint32_t& next_offset);
+  // Get file offset for key target.
+  // return value prefix_matched is set to true if the offset is confirmed
+  // for a key with the same prefix as target.
+  Status GetOffset(const Slice& target, const Slice& prefix,
+                   uint32_t prefix_hash, bool& prefix_matched,
+                   uint32_t& ret_offset);
+
+  // Prefix of an encoded internal key: the trailing 8 internal bytes are
+  // dropped and the rest is handed to the prefix extractor.
+  Slice GetPrefix(const Slice& target) {
+    assert(target.size() >= 8);  // target is internal key
+    return options_.prefix_extractor->Transform(
+        Slice(target.data(), target.size() - 8));
+  }
+
+  Slice GetPrefix(const ParsedInternalKey& target);
+
+  // No copying allowed
+  PlainTableReader(const PlainTableReader&) = delete;
+  void operator=(const PlainTableReader&) = delete;
+};
+}  // namespace rocksdb
--- a/table/table_builder.h
+++ b/table/table_builder.h
@@ -0,0 +1,55 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <stdint.h>
+
+namespace rocksdb {
+
+class Slice;
+class Status;
+
+// TableBuilder provides the interface used to build a Table
+// (an immutable and sorted map from keys to values).
+//
+// Every operation is pure virtual; the concrete table formats supply the
+// implementation.
+//
+// Multiple threads can invoke const methods on a TableBuilder without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same TableBuilder must use
+// external synchronization.
+class TableBuilder {
+ public:
+  // REQUIRES: Either Finish() or Abandon() has been called.
+  virtual ~TableBuilder() {}
+
+  // Add key,value to the table being constructed.
+  // REQUIRES: key is after any previously added key according to comparator.
+  // REQUIRES: Finish(), Abandon() have not been called
+  virtual void Add(const Slice& key, const Slice& value) = 0;
+
+  // Return non-ok iff some error has been detected.
+  virtual Status status() const = 0;
+
+  // Finish building the table.
+  // REQUIRES: Finish(), Abandon() have not been called
+  virtual Status Finish() = 0;
+
+  // Indicate that the contents of this builder should be abandoned.
+  // If the caller is not going to call Finish(), it must call Abandon()
+  // before destroying this builder.
+  // REQUIRES: Finish(), Abandon() have not been called
+  virtual void Abandon() = 0;
+
+  // Number of calls to Add() so far.
+  virtual uint64_t NumEntries() const = 0;
+
+  // Size of the file generated so far.  If invoked after a successful
+  // Finish() call, returns the size of the final generated file.
+  virtual uint64_t FileSize() const = 0;
+};
+
+}  // namespace rocksdb
--- a/table/table_properties.cc
+++ b/table/table_properties.cc
@@ -0,0 +1,114 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "rocksdb/table_properties.h"
+
+namespace rocksdb {
+
+namespace {
+  // Appends one property to props in the form
+  //   <key><kv_delim><value><prop_delim>
+  void AppendProperty(
+      std::string& props,
+      const std::string& key,
+      const std::string& value,
+      const std::string& prop_delim,
+      const std::string& kv_delim) {
+    props.append(key).append(kv_delim).append(value).append(prop_delim);
+  }
+
+  // Same as above for values that are not strings: the value is rendered
+  // with std::to_string() and then forwarded to the string overload.
+  template <class TValue>
+  void AppendProperty(
+      std::string& props,
+      const std::string& key,
+      const TValue& value,
+      const std::string& prop_delim,
+      const std::string& kv_delim) {
+    const std::string value_text = std::to_string(value);
+    AppendProperty(props, key, value_text, prop_delim, kv_delim);
+  }
+}  // namespace
+
+// Renders the properties as text. Each property is written as
+// <name><kv_delim><value><prop_delim>, in a fixed order: block and entry
+// counts, raw key/value sizes with their per-entry averages, block sizes, the
+// estimated table size and finally the filter policy name.
+std::string TableProperties::ToString(
+    const std::string& prop_delim,
+    const std::string& kv_delim) const {
+  // Derived values. The averages are reported as 0 for a table without
+  // entries so that no division by zero takes place.
+  const double avg_key_size =
+      num_entries != 0 ?  1.0 * raw_key_size / num_entries : 0.0;
+  const double avg_value_size =
+      num_entries != 0 ?  1.0 * raw_value_size / num_entries : 0.0;
+  const auto estimated_table_size = data_size + index_size + filter_size;
+  const std::string policy_name =
+      filter_policy_name.empty() ? std::string("N/A") : filter_policy_name;
+
+  std::string text;
+  text.reserve(1024);
+
+  // Basic Info
+  AppendProperty(text, "# data blocks", num_data_blocks, prop_delim,
+                 kv_delim);
+  AppendProperty(text, "# entries", num_entries, prop_delim, kv_delim);
+
+  // Raw key / value sizes
+  AppendProperty(text, "raw key size", raw_key_size, prop_delim, kv_delim);
+  AppendProperty(text, "raw average key size", avg_key_size, prop_delim,
+                 kv_delim);
+  AppendProperty(text, "raw value size", raw_value_size, prop_delim,
+                 kv_delim);
+  AppendProperty(text, "raw average value size", avg_value_size, prop_delim,
+                 kv_delim);
+
+  // Block sizes
+  AppendProperty(text, "data block size", data_size, prop_delim, kv_delim);
+  AppendProperty(text, "index block size", index_size, prop_delim, kv_delim);
+  AppendProperty(text, "filter block size", filter_size, prop_delim,
+                 kv_delim);
+  AppendProperty(text, "(estimated) table size", estimated_table_size,
+                 prop_delim, kv_delim);
+
+  AppendProperty(text, "filter policy name", policy_name, prop_delim,
+                 kv_delim);
+
+  return text;
+}
+
+// Out-of-class definitions of the TablePropertiesNames string constants.
+// NOTE(review): presumably these are the keys under which the individual
+// properties are stored in a table's properties block - confirm against the
+// table builders/readers.
+const std::string TablePropertiesNames::kDataSize  =
+    "rocksdb.data.size";
+const std::string TablePropertiesNames::kIndexSize =
+    "rocksdb.index.size";
+const std::string TablePropertiesNames::kFilterSize =
+    "rocksdb.filter.size";
+const std::string TablePropertiesNames::kRawKeySize =
+    "rocksdb.raw.key.size";
+const std::string TablePropertiesNames::kRawValueSize =
+    "rocksdb.raw.value.size";
+const std::string TablePropertiesNames::kNumDataBlocks =
+    "rocksdb.num.data.blocks";
+const std::string TablePropertiesNames::kNumEntries =
+    "rocksdb.num.entries";
+const std::string TablePropertiesNames::kFilterPolicy =
+    "rocksdb.filter.policy";
+const std::string TablePropertiesNames::kFormatVersion =
+    "rocksdb.format.version";
+const std::string TablePropertiesNames::kFixedKeyLen =
+    "rocksdb.fixed.key.length";
+
+// `extern` is needed on this definition: a namespace-scope const object has
+// internal linkage by default, and this one has to be visible to other
+// translation units.
+extern const std::string kPropertiesBlock = "rocksdb.properties";
+
+}  // namespace rocksdb
--- a/table/table_reader.h
+++ b/table/table_reader.h
@@ -0,0 +1,71 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+namespace rocksdb {
+
+class Iterator;
+struct ParsedInternalKey;
+class Slice;
+struct ReadOptions;
+struct TableProperties;
+
+// A Table is a sorted map from strings to strings.  Tables are
+// immutable and persistent.  A Table may be safely accessed from
+// multiple threads without external synchronization.
+//
+// TableReader is the read-side interface of a table: every operation is pure
+// virtual and is implemented by the concrete table formats.
+class TableReader {
+ public:
+  virtual ~TableReader() {}
+
+  // Determine whether there is a chance that the current table file
+  // contains a key starting with internal_prefix. The specific
+  // table implementation can use bloom filter and/or other heuristic
+  // to filter out this table as a whole.
+  virtual bool PrefixMayMatch(const Slice& internal_prefix) = 0;
+
+  // Returns a new iterator over the table contents.
+  // The result of NewIterator() is initially invalid (caller must
+  // call one of the Seek methods on the iterator before using it).
+  virtual Iterator* NewIterator(const ReadOptions&) = 0;
+
+  // Given a key, return an approximate byte offset in the file where
+  // the data for that key begins (or would begin if the key were
+  // present in the file).  The returned value is in terms of file
+  // bytes, and so includes effects like compression of the underlying data.
+  // E.g., the approximate offset of the last key in the table will
+  // be close to the file length.
+  virtual uint64_t ApproximateOffsetOf(const Slice& key) = 0;
+
+  // Set up the table for Compaction. Might change some parameters with
+  // posix_fadvise
+  virtual void SetupForCompaction() = 0;
+
+  // Returns the properties of this table.
+  virtual const TableProperties& GetTableProperties() = 0;
+
+  // Calls (*result_handler)(handle_context, ...) repeatedly, starting with
+  // the entry found after a call to Seek(key), until result_handler returns
+  // false, where k is the actual internal key for a row found and v is the
+  // value of the key. didIO is true if I/O is involved in the operation. May
+  // not make such a call if filter policy says that key is not present.
+  //
+  // mark_key_may_exist_handler needs to be called when it is configured to be
+  // memory only and the key is not found in the block cache, with
+  // the parameter to be handle_context.
+  //
+  // readOptions is the options for the read
+  // key is the key to search for
+  virtual Status Get(
+      const ReadOptions& readOptions, const Slice& key, void* handle_context,
+      bool (*result_handler)(void* arg, const ParsedInternalKey& k,
+                             const Slice& v, bool didIO),
+      void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) = 0;
+};
+
+}  // namespace rocksdb
--- a/table/table_reader_bench.cc
+++ b/table/table_reader_bench.cc
@@ -6,12 +6,13 @@
 #include <gflags/gflags.h>

 #include "rocksdb/db.h"
-#include "rocksdb/table.h"
 #include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
 #include "db/db_impl.h"
 #include "db/dbformat.h"
 #include "port/atomic_pointer.h"
 #include "table/block_based_table_factory.h"
+#include "table/plain_table_factory.h"
 #include "util/histogram.h"
 #include "util/testharness.h"
 #include "util/testutil.h"
@@ -33,8 +34,8 @@ static std::string MakeKey(int i, int j, bool through_db) {
  return key.Encode().ToString();
 }

-static bool DummySaveValue(void* arg, const Slice& ikey, const Slice& v,
-                           bool didIO) {
+static bool DummySaveValue(void* arg, const ParsedInternalKey& ikey,
+                           const Slice& v, bool didIO) {
  return false;
 }

@@ -70,7 +71,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options,
  Status s;
  if (!through_db) {
    env->NewWritableFile(file_name, &file, env_options);
-    tb = opts.table_factory->GetTableBuilder(opts, file.get(),
+    tb = opts.table_factory->NewTableBuilder(opts, file.get(),
                                             CompressionType::kNoCompression);
  } else {
    s = DB::Open(opts, dbname, &db);
@@ -101,7 +102,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options,
    Status s = env->NewRandomAccessFile(file_name, &raf, env_options);
    uint64_t file_size;
    env->GetFileSize(file_name, &file_size);
-    s = opts.table_factory->GetTableReader(opts, env_options, std::move(raf),
+    s = opts.table_factory->NewTableReader(opts, env_options, std::move(raf),
                                           file_size, &table_reader);
  }

@@ -218,6 +219,8 @@ DEFINE_bool(iterator, false, "For test iterator");
 DEFINE_bool(through_db, false, "If enable, a DB instance will be created and "
            "the query will be against DB. Otherwise, will be directly against "
            "a table reader.");
+DEFINE_bool(plain_table, false, "Use PlainTable");
+

 int main(int argc, char** argv) {
  google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
@@ -230,10 +233,23 @@ int main(int argc, char** argv) {
    options.prefix_extractor = rocksdb::NewFixedPrefixTransform(
        FLAGS_prefix_len);
  }
-  options.SetUpDefaultFlushBlockPolicyFactory();
  rocksdb::ReadOptions ro;
  rocksdb::EnvOptions env_options;
  options.create_if_missing = true;
+  options.compression = rocksdb::CompressionType::kNoCompression;
+  options.internal_comparator =
+      new rocksdb::InternalKeyComparator(options.comparator);
+
+  if (FLAGS_plain_table) {
+    options.allow_mmap_reads = true;
+    env_options.use_mmap_reads = true;
+    tf = new rocksdb::PlainTableFactory(16, (FLAGS_prefix_len == 16) ? 0 : 8,
+                                        0.75);
+    options.prefix_extractor = rocksdb::NewFixedPrefixTransform(
+        FLAGS_prefix_len);
+  } else {
+    tf = new rocksdb::BlockBasedTableFactory();
+  }
  options.table_factory =
      std::shared_ptr<rocksdb::TableFactory>(tf);
  TableReaderBenchmark(options, env_options, ro, FLAGS_num_keys1,
--- a/table/table_test.cc
+++ b/table/table_test.cc
--- a/table/two_level_iterator.cc
+++ b/table/two_level_iterator.cc
@@ -20,18 +20,17 @@ namespace rocksdb {
 namespace {

 typedef Iterator* (*BlockFunction)(void*, const ReadOptions&,
-                                   const EnvOptions& soptions, const Slice&,
-                                   bool for_compaction);
+                                   const EnvOptions& soptions,
+                                   const InternalKeyComparator& icomparator,
+                                   const Slice&, bool for_compaction);

 class TwoLevelIterator: public Iterator {
 public:
-  TwoLevelIterator(
-    Iterator* index_iter,
-    BlockFunction block_function,
-    void* arg,
-    const ReadOptions& options,
-    const EnvOptions& soptions,
-    bool for_compaction);
+  TwoLevelIterator(Iterator* index_iter, BlockFunction block_function,
+                   void* arg, const ReadOptions& options,
+                   const EnvOptions& soptions,
+                   const InternalKeyComparator& internal_comparator,
+                   bool for_compaction);

  virtual ~TwoLevelIterator();

@@ -76,6 +75,7 @@ class TwoLevelIterator: public Iterator {
  void* arg_;
  const ReadOptions options_;
  const EnvOptions& soptions_;
+  const InternalKeyComparator& internal_comparator_;
  Status status_;
  IteratorWrapper index_iter_;
  IteratorWrapper data_iter_; // May be nullptr
@@ -86,20 +86,17 @@ class TwoLevelIterator: public Iterator {
 };

 TwoLevelIterator::TwoLevelIterator(
-    Iterator* index_iter,
-    BlockFunction block_function,
-    void* arg,
-    const ReadOptions& options,
-    const EnvOptions& soptions,
-    bool for_compaction)
+    Iterator* index_iter, BlockFunction block_function, void* arg,
+    const ReadOptions& options, const EnvOptions& soptions,
+    const InternalKeyComparator& internal_comparator, bool for_compaction)
    : block_function_(block_function),
      arg_(arg),
      options_(options),
      soptions_(soptions),
+      internal_comparator_(internal_comparator),
      index_iter_(index_iter),
      data_iter_(nullptr),
-      for_compaction_(for_compaction) {
-}
+      for_compaction_(for_compaction) {}

 TwoLevelIterator::~TwoLevelIterator() {
 }
@@ -181,8 +178,9 @@ void TwoLevelIterator::InitDataBlock() {
      // data_iter_ is already constructed with this iterator, so
      // no need to change anything
    } else {
-      Iterator* iter = (*block_function_)(arg_, options_, soptions_, handle,
-                                          for_compaction_);
+      Iterator* iter =
+          (*block_function_)(arg_, options_, soptions_, internal_comparator_,
+                             handle, for_compaction_);
      data_block_handle_.assign(handle.data(), handle.size());
      SetDataIterator(iter);
    }
@@ -191,15 +189,14 @@ void TwoLevelIterator::InitDataBlock() {

 }  // namespace

-Iterator* NewTwoLevelIterator(
-    Iterator* index_iter,
-    BlockFunction block_function,
-    void* arg,
-    const ReadOptions& options,
-    const EnvOptions& soptions,
-    bool for_compaction) {
-  return new TwoLevelIterator(index_iter, block_function, arg,
-                              options, soptions, for_compaction);
+Iterator* NewTwoLevelIterator(Iterator* index_iter,
+                              BlockFunction block_function, void* arg,
+                              const ReadOptions& options,
+                              const EnvOptions& soptions,
+                              const InternalKeyComparator& internal_comparator,
+                              bool for_compaction) {
+  return new TwoLevelIterator(index_iter, block_function, arg, options,
+                              soptions, internal_comparator, for_compaction);
 }

 }  // namespace rocksdb
--- a/table/two_level_iterator.h
+++ b/table/two_level_iterator.h
@@ -14,6 +14,7 @@
 namespace rocksdb {

 struct ReadOptions;
+class InternalKeyComparator;

 // Return a new two level iterator.  A two-level iterator contains an
 // index iterator whose values point to a sequence of blocks where
@@ -27,14 +28,11 @@ struct ReadOptions;
 extern Iterator* NewTwoLevelIterator(
    Iterator* index_iter,
    Iterator* (*block_function)(
-        void* arg,
-        const ReadOptions& options,
-        const EnvOptions& soptions,
-        const Slice& index_value,
-        bool for_compaction),
-    void* arg,
-    const ReadOptions& options,
-    const EnvOptions& soptions,
+        void* arg, const ReadOptions& options, const EnvOptions& soptions,
+        const InternalKeyComparator& internal_comparator,
+        const Slice& index_value, bool for_compaction),
+    void* arg, const ReadOptions& options, const EnvOptions& soptions,
+    const InternalKeyComparator& internal_comparator,
    bool for_compaction = false);

 }  // namespace rocksdb