Squashed 'src/rocksdb/' content from commit 457bae6

git-subtree-dir: src/rocksdb git-subtree-split: 457bae6911
2025-11-29 15:35:50 +00:00 · 2014-06-04 16:10:38 -07:00
commit 8514b88974
379 changed files with 108179 additions and 0 deletions
--- a/.arcconfig
+++ b/.arcconfig
@@ -0,0 +1,10 @@
+{
+  "project_id" : "rocksdb",
+  "conduit_uri" : "https://reviews.facebook.net/",
+  "copyright_holder" : "Facebook",
+  "load" : [
+    "linters"
+  ],
+  "lint.engine" : "FacebookFbcodeLintEngine",
+  "lint.engine.single.linter" : "FbcodeCppLinter"
+}
--- a/.clang-format
+++ b/.clang-format
@@ -0,0 +1,5 @@
+# Complete list of style options can be found at: 
+# http://clang.llvm.org/docs/ClangFormatStyleOptions.html
+---
+BasedOnStyle: Google
+...
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,34 @@
+TARGETS
+build_config.mk
+
+*.a
+*.arc
+*.d
+*.dylib*
+*.gcda
+*.gcno
+*.o
+*.so
+*.so.*
+*_test
+*_bench
+*_stress
+*.out
+*.class
+*.jar
+*.*jnilib*
+*.d-e
+*.o-*
+*.swp
+
+ldb
+manifest_dump
+sst_dump
+util/build_version.cc
+build_tools/VALGRIND_LOGS/
+coverage/COVERAGE_REPORT
+.gdbhistory
+.phutil_module_cache
+tags
+java/*.log
+java/include/org_rocksdb_*.h
--- a/.travis.yml
+++ b/.travis.yml
@@ -0,0 +1,20 @@
+language: cpp
+compiler: gcc
+before_install:
+# As of this writing (10 May 2014) the Travis build environment is Ubuntu 12.04,
+# which needs the following ugly dependency incantations to build RocksDB:
+ - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
+ - sudo apt-get update -qq
+ - sudo apt-get install -y -qq gcc-4.8 g++-4.8 zlib1g-dev libbz2-dev libsnappy-dev libjemalloc-dev
+ - sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.8 50
+ - sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.8 50
+ - wget https://gflags.googlecode.com/files/libgflags0_2.0-1_amd64.deb
+ - sudo dpkg -i libgflags0_2.0-1_amd64.deb
+ - wget https://gflags.googlecode.com/files/libgflags-dev_2.0-1_amd64.deb
+ - sudo dpkg -i libgflags-dev_2.0-1_amd64.deb
+# Lousy hack to disable use and testing of fallocate, which doesn't behave quite
+# as EnvPosixTest::AllocateTest expects within the Travis OpenVZ environment.
+ - sed -i "s/fallocate(/HACK_NO_fallocate(/" build_tools/build_detect_platform
+script: make check -j8
+notifications:
+    email: false
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -0,0 +1,20 @@
+# Contributing to RocksDB
+
+## Contributor License Agreement ("CLA")
+
+In order to accept your pull request, we need you to submit a CLA. You
+only need to do this once, so if you've done this for another Facebook
+open source project, you're good to go. If you are submitting a pull
+request for the first time, just let us know that you have completed
+the CLA and we can cross-check with your GitHub username.
+
+Complete your CLA here: <https://code.facebook.com/cla>
+
+If you don't have a Facebook account, we can send you a PDF that you can
+sign offline. Send us an e-mail or create a new github issue to
+request the CLA in PDF format.
+
+## License
+
+By contributing to RocksDB, you agree that your contributions will be
+licensed under the [BSD License](LICENSE).
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -0,0 +1,89 @@
+# Rocksdb Change Log
+
+## 3.1.0 (05/21/2014)
+
+### Public API changes
+* Replaced ColumnFamilyOptions::table_properties_collectors with ColumnFamilyOptions::table_properties_collector_factories
+
+### New Features
+* Hash index for block-based table will be materialized and reconstructed more efficiently. Previously hash index is constructed by scanning the whole table during every table open.
+* FIFO compaction style
+
+## 3.0.0 (05/05/2014)
+
+### Public API changes
+* Added _LEVEL to all InfoLogLevel enums
+* Deprecated ReadOptions.prefix and ReadOptions.prefix_seek. Seek() defaults to prefix-based seek when Options.prefix_extractor is supplied. More detail is documented in https://github.com/facebook/rocksdb/wiki/Prefix-Seek-API-Changes
+* MemTableRepFactory::CreateMemTableRep() takes info logger as an extra parameter.
+
+### New Features
+* Column family support
+* Added an option to use different checksum functions in BlockBasedTableOptions
+* Added ApplyToAllCacheEntries() function to Cache
+
+## 2.8.0 (04/04/2014)
+
+* Removed arena.h from public header files.
+* By default, checksums are verified on every read from database
+* Change default value of several options, including: paranoid_checks=true, max_open_files=5000, level0_slowdown_writes_trigger=20, level0_stop_writes_trigger=24, disable_seek_compaction=true, max_background_flushes=1 and allow_mmap_writes=false
+* Added is_manual_compaction to CompactionFilter::Context
+* Added "virtual void WaitForJoin()" in class Env. Default operation is no-op.
+* Removed BackupEngine::DeleteBackupsNewerThan() function
+* Added new option -- verify_checksums_in_compaction
+* Changed Options.prefix_extractor from raw pointer to shared_ptr (take ownership)
+  Changed HashSkipListRepFactory and HashLinkListRepFactory constructor to not take SliceTransform object (use Options.prefix_extractor implicitly)
+* Added Env::GetThreadPoolQueueLen(), which returns the waiting queue length of thread pools
+* Added a command "checkconsistency" in ldb tool, which checks
+  if file system state matches DB state (file existence and file sizes)
+* Separate options related to block based table to a new struct BlockBasedTableOptions.
+* WriteBatch has a new function Count() to return total size in the batch, and Data() now returns a reference instead of a copy
+* Add more counters to perf context.
+* Supports several more DB properties: compaction-pending, background-errors and cur-size-active-mem-table.
+
+### New Features
+* If we find one truncated record at the end of the MANIFEST or WAL files,
+  we will ignore it. We assume that writers of these records were interrupted
+  and that we can safely ignore it.
+* A new SST format "PlainTable" is added, which is optimized for memory-only workloads. It can be created through NewPlainTableFactory() or NewTotalOrderPlainTableFactory().
+* A new mem table implementation hash linked list optimizing for the case that there are only few keys for each prefix, which can be created through NewHashLinkListRepFactory().
+* Merge operator supports a new function PartialMergeMulti() to allow users to do partial merges against multiple operands.
+* Now compaction filter has a V2 interface. It buffers the kv-pairs sharing the same key prefix, process them in batches, and return the batched results back to DB. The new interface uses a new structure CompactionFilterContext for the same purpose as CompactionFilter::Context in V1.
+* Geo-spatial support for locations and radial-search.
+
+## 2.7.0 (01/28/2014)
+
+### Public API changes
+
+* Renamed `StackableDB::GetRawDB()` to `StackableDB::GetBaseDB()`.
+* Renamed `WriteBatch::Data()` `const std::string& Data() const`.
+* Renamed class `TableStats` to `TableProperties`.
+* Deleted class `PrefixHashRepFactory`. Please use `NewHashSkipListRepFactory()` instead.
+* Supported multi-threaded `EnableFileDeletions()` and `DisableFileDeletions()`.
+* Added `DB::GetOptions()`.
+* Added `DB::GetDbIdentity()`.
+
+### New Features
+
+* Added [BackupableDB](https://github.com/facebook/rocksdb/wiki/How-to-backup-RocksDB%3F)
+* Implemented [TailingIterator](https://github.com/facebook/rocksdb/wiki/Tailing-Iterator), a special type of iterator that
+  doesn't create a snapshot (can be used to read newly inserted data)
+  and is optimized for doing sequential reads.
+* Added property block for table, which allows (1) a table to store
+  its metadata and (2) end user to collect and store properties they
+  are interested in.
+* Enabled caching index and filter block in block cache (turned off by default).
+* Supported error report when doing manual compaction.
+* Supported additional Linux platform flavors and Mac OS.
+* Put with `SliceParts` - Variant of `Put()` that gathers output like `writev(2)`
+* Bug fixes and code refactor for compatibility with upcoming Column
+  Family feature.
+
+### Performance Improvements
+
+* Huge benchmark performance improvements by multiple efforts. For example, increase in readonly QPS from about 530k in 2.6 release to 1.1 million in 2.7 [1]
+* Speeding up a way RocksDB deleted obsolete files - no longer listing the whole directory under a lock -- decrease in p99
+* Use raw pointer instead of shared pointer for statistics: [5b825d](https://github.com/facebook/rocksdb/commit/5b825d6964e26ec3b4bb6faa708ebb1787f1d7bd) -- huge increase in performance -- shared pointers are slow
+* Optimized locking for `Get()` -- [1fdb3f](https://github.com/facebook/rocksdb/commit/1fdb3f7dc60e96394e3e5b69a46ede5d67fb976c) -- 1.5x QPS increase for some workloads
+* Cache speedup - [e8d40c3](https://github.com/facebook/rocksdb/commit/e8d40c31b3cca0c3e1ae9abe9b9003b1288026a9)
+* Implemented autovector, which allocates first N elements on stack. Most of vectors in RocksDB are small. Also, we never want to allocate heap objects while holding a mutex. -- [c01676e4](https://github.com/facebook/rocksdb/commit/c01676e46d3be08c3c140361ef1f5884f47d3b3c)
+* Lots of efforts to move malloc, memcpy and IO outside of locks
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -0,0 +1,84 @@
+## Compilation
+
+RocksDB's library should be able to compile without any dependency installed,
+although we recommend installing some compression libraries (see below).
+We do depend on newer gcc with C++11 support.
+
+There are few options when compiling RocksDB:
+
+* [recommended] `make static_lib` will compile librocksdb.a, RocksDB static library.
+
+* `make shared_lib` will compile librocksdb.so, RocksDB shared library.
+
+* `make check` will compile and run all the unit tests
+
+* `make all` will compile our static library, and all our tools and unit tests. Our tools
+depend on gflags. You will need to have gflags installed to run `make all`.
+
+## Dependencies
+
+* You can link RocksDB with following compression libraries:
+  - [zlib](http://www.zlib.net/) - a library for data compression.
+  - [bzip2](http://www.bzip.org/) - a library for data compression.
+  - [snappy](https://code.google.com/p/snappy/) - a library for fast
+      data compression.
+
+* All our tools depend on:
+  - [gflags](https://code.google.com/p/gflags/) - a library that handles
+      command line flags processing. You can compile rocksdb library even
+      if you don't have gflags installed.
+
+## Supported platforms
+
+* **Linux - Ubuntu**
+    * Upgrade your gcc to version at least 4.7 to get C++11 support.
+    * Install gflags. First, try: `sudo apt-get install libgflags-dev`
+      If this doesn't work and you're using Ubuntu, here's a nice tutorial:
+      (http://askubuntu.com/questions/312173/installing-gflags-12-04)
+    * Install snappy. This is usually as easy as:
+      `sudo apt-get install libsnappy-dev`.
+    * Install zlib. Try: `sudo apt-get install zlib1g-dev`.
+    * Install bzip2: `sudo apt-get install libbz2-dev`.
+* **Linux - CentOS**
+    * Upgrade your gcc to version at least 4.7 to get C++11 support:
+      `yum install gcc47-c++`
+    * Install gflags:
+
+              wget https://gflags.googlecode.com/files/gflags-2.0-no-svn-files.tar.gz
+              tar -xzvf gflags-2.0-no-svn-files.tar.gz
+              cd gflags-2.0
+              ./configure && make && sudo make install
+
+    * Install snappy:
+
+              wget https://snappy.googlecode.com/files/snappy-1.1.1.tar.gz
+              tar -xzvf snappy-1.1.1.tar.gz
+              cd snappy-1.1.1
+              ./configure && make && sudo make install
+
+    * Install zlib:
+
+              sudo yum install zlib
+              sudo yum install zlib-devel
+
+    * Install bzip2:
+
+              sudo yum install bzip2
+              sudo yum install bzip2-devel
+
+* **OS X**:
+    * Install latest C++ compiler that supports C++ 11:
+        * Update XCode:  run `xcode-select --install` (or install it from XCode App's settting).
+        * Install via [homebrew](http://brew.sh/).
+            * If you're first time developer in MacOS, you still need to run: `xcode-select --install` in your command line.
+            * run `brew tap homebrew/dupes; brew install gcc47 --use-llvm` to install gcc 4.7 (or higher).
+    * Install zlib, bzip2 and snappy libraries for compression.
+    * Install gflags. We have included a script
+    `build_tools/mac-install-gflags.sh`, which should automatically install it.
+    If you installed gflags by other means (for example, `brew install gflags`),
+    please set `LIBRARY_PATH` and `CPATH` accordingly.
+    * Please note that some of the optimizations/features are disabled in OSX.
+    We did not run any production workloads on it.
+
+* **iOS**:
+  * Run: `TARGET_OS=IOS make static_lib`
--- a/BIN
+++ b/BIN
--- a/523
+++ b/523
@@ -0,0 +1,523 @@
+# Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+# Inherit some settings from environment variables, if available
+INSTALL_PATH ?= $(CURDIR)
+
+#-----------------------------------------------
+
+ifneq ($(MAKECMDGOALS),dbg)
+OPT += -O2 -fno-omit-frame-pointer -momit-leaf-frame-pointer
+else
+# intentionally left blank
+endif
+
+ifeq ($(MAKECMDGOALS),shared_lib)
+OPT += -DNDEBUG
+endif
+
+ifeq ($(MAKECMDGOALS),static_lib)
+OPT += -DNDEBUG
+endif
+
+#-----------------------------------------------
+
+# detect what platform we're building on
+$(shell (export ROCKSDB_ROOT="$(CURDIR)"; "$(CURDIR)/build_tools/build_detect_platform" "$(CURDIR)/build_config.mk"))
+# this file is generated by the previous line to set build flags and sources
+include build_config.mk
+
+ifneq ($(PLATFORM), IOS)
+CFLAGS += -g
+CXXFLAGS += -g
+else
+# no debug info for IOS, that will make our library big
+OPT += -DNDEBUG
+endif
+
+# ASAN doesn't work well with jemalloc. If we're compiling with ASAN, we should use regular malloc.
+ifdef COMPILE_WITH_ASAN
+	# ASAN compile flags
+	EXEC_LDFLAGS += -fsanitize=address
+	PLATFORM_CCFLAGS += -fsanitize=address
+	PLATFORM_CXXFLAGS += -fsanitize=address
+else
+	# if we're not compiling with ASAN, use jemalloc
+	EXEC_LDFLAGS := $(JEMALLOC_LIB) $(EXEC_LDFLAGS)
+	PLATFORM_CXXFLAGS += $(JEMALLOC_INCLUDE) -DHAVE_JEMALLOC
+	PLATFORM_CCFLAGS += $(JEMALLOC_INCLUDE) -DHAVE_JEMALLOC
+endif
+
+WARNING_FLAGS = -Wall -Werror
+CFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT)
+CXXFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual
+
+LDFLAGS += $(PLATFORM_LDFLAGS)
+
+LIBOBJECTS = $(SOURCES:.cc=.o)
+LIBOBJECTS += $(SOURCESCPP:.cpp=.o)
+MEMENVOBJECTS = $(MEMENV_SOURCES:.cc=.o)
+
+TESTUTIL = ./util/testutil.o
+TESTHARNESS = ./util/testharness.o $(TESTUTIL)
+BENCHHARNESS = ./util/benchharness.o
+VALGRIND_ERROR = 2
+VALGRIND_DIR = build_tools/VALGRIND_LOGS
+VALGRIND_VER := $(join $(VALGRIND_VER),valgrind)
+VALGRIND_OPTS = --error-exitcode=$(VALGRIND_ERROR) --leak-check=full
+
+TESTS = \
+	db_test \
+	block_hash_index_test \
+	autovector_test \
+	column_family_test \
+	table_properties_collector_test \
+	arena_test \
+	auto_roll_logger_test \
+	benchharness_test \
+	block_test \
+	bloom_test \
+	dynamic_bloom_test \
+	c_test \
+	cache_test \
+	coding_test \
+	corruption_test \
+	crc32c_test \
+	dbformat_test \
+	env_test \
+	blob_store_test \
+	filelock_test \
+	filename_test \
+	filter_block_test \
+	histogram_test \
+	log_test \
+	manual_compaction_test \
+	memenv_test \
+	merge_test \
+	redis_test \
+	reduce_levels_test \
+	plain_table_db_test \
+	prefix_test \
+	simple_table_db_test \
+	skiplist_test \
+	stringappend_test \
+	ttl_test \
+	backupable_db_test \
+	version_edit_test \
+	version_set_test \
+	file_indexer_test \
+	write_batch_test\
+	deletefile_test \
+	table_test \
+	thread_local_test \
+        geodb_test
+
+TOOLS = \
+        sst_dump \
+	db_sanity_test \
+        db_stress \
+        ldb \
+	db_repl_stress \
+	blob_store_bench
+
+PROGRAMS = db_bench signal_test table_reader_bench log_and_apply_bench $(TOOLS)
+
+# The library name is configurable since we are maintaining libraries of both
+# debug/release mode.
+ifeq ($(LIBNAME),)
+        LIBNAME=librocksdb
+endif
+LIBRARY = ${LIBNAME}.a
+MEMENVLIBRARY = libmemenv.a
+
+default: all
+
+#-----------------------------------------------
+# Create platform independent shared libraries.
+#-----------------------------------------------
+ifneq ($(PLATFORM_SHARED_EXT),)
+
+ifneq ($(PLATFORM_SHARED_VERSIONED),true)
+SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT)
+SHARED2 = $(SHARED1)
+SHARED3 = $(SHARED1)
+SHARED = $(SHARED1)
+else
+# Update db.h if you change these.
+SHARED_MAJOR = 3
+SHARED_MINOR = 2
+SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT)
+SHARED2 = $(SHARED1).$(SHARED_MAJOR)
+SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR)
+SHARED = $(SHARED1) $(SHARED2) $(SHARED3)
+$(SHARED1): $(SHARED3)
+	ln -fs $(SHARED3) $(SHARED1)
+$(SHARED2): $(SHARED3)
+	ln -fs $(SHARED3) $(SHARED2)
+endif
+
+$(SHARED3):
+	$(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(SOURCES) $(LDFLAGS) -o $@
+
+endif  # PLATFORM_SHARED_EXT
+
+.PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \
+	release tags valgrind_check whitebox_crash_test format static_lib shared_lib all \
+	dbg
+
+all: $(LIBRARY) $(PROGRAMS) $(TESTS)
+
+static_lib: $(LIBRARY)
+
+shared_lib: $(SHARED)
+
+dbg: $(LIBRARY) $(PROGRAMS) $(TESTS)
+
+# creates static library and programs
+release:
+	$(MAKE) clean
+	OPT="-DNDEBUG -O2" $(MAKE) static_lib $(PROGRAMS) -j32
+
+coverage:
+	$(MAKE) clean
+	COVERAGEFLAGS="-fprofile-arcs -ftest-coverage" LDFLAGS+="-lgcov" $(MAKE) all check -j32
+	(cd coverage; ./coverage_test.sh)
+	# Delete intermediate files
+	find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \;
+
+check: $(TESTS) ldb
+	for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done
+	python tools/ldb_test.py
+
+ldb_tests: ldb
+	python tools/ldb_test.py
+
+crash_test: whitebox_crash_test blackbox_crash_test
+
+blackbox_crash_test: db_stress
+	python -u tools/db_crashtest.py
+
+whitebox_crash_test: db_stress
+	python -u tools/db_crashtest2.py
+
+asan_check:
+	$(MAKE) clean
+	COMPILE_WITH_ASAN=1 $(MAKE) check -j32
+	$(MAKE) clean
+
+asan_crash_test:
+	$(MAKE) clean
+	COMPILE_WITH_ASAN=1 $(MAKE) crash_test
+	$(MAKE) clean
+
+valgrind_check: all $(PROGRAMS) $(TESTS)
+	mkdir -p $(VALGRIND_DIR)
+	echo TESTS THAT HAVE VALGRIND ERRORS > $(VALGRIND_DIR)/valgrind_failed_tests; \
+	echo TIMES in seconds TAKEN BY TESTS ON VALGRIND > $(VALGRIND_DIR)/valgrind_tests_times; \
+	for t in $(filter-out skiplist_test,$(TESTS)); do \
+		stime=`date '+%s'`; \
+		$(VALGRIND_VER) $(VALGRIND_OPTS) ./$$t; \
+		if [ $$? -eq $(VALGRIND_ERROR) ] ; then \
+			echo $$t >> $(VALGRIND_DIR)/valgrind_failed_tests; \
+		fi; \
+		etime=`date '+%s'`; \
+		echo $$t $$((etime - stime)) >> $(VALGRIND_DIR)/valgrind_tests_times; \
+	done
+
+clean:
+	-rm -f $(PROGRAMS) $(TESTS) $(LIBRARY) $(SHARED) $(MEMENVLIBRARY) build_config.mk
+	-rm -rf ios-x86/* ios-arm/*
+	-find . -name "*.[od]" -exec rm {} \;
+	-find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \;
+tags:
+	ctags * -R
+	cscope -b `find . -name '*.cc'` `find . -name '*.h'`
+
+format:
+	build_tools/format-diff.sh
+
+# ---------------------------------------------------------------------------
+# 	Unit tests and tools
+# ---------------------------------------------------------------------------
+$(LIBRARY): $(LIBOBJECTS)
+	rm -f $@
+	$(AR) -rs $@ $(LIBOBJECTS)
+
+db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL)
+	$(CXX) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)
+
+block_hash_index_test: table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	 $(CXX) table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+db_stress: tools/db_stress.o $(LIBOBJECTS) $(TESTUTIL)
+	$(CXX) tools/db_stress.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)
+
+db_sanity_test: tools/db_sanity_test.o $(LIBOBJECTS) $(TESTUTIL)
+	$(CXX) tools/db_sanity_test.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)
+
+db_repl_stress: tools/db_repl_stress.o $(LIBOBJECTS) $(TESTUTIL)
+	$(CXX) tools/db_repl_stress.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)
+
+blob_store_bench: tools/blob_store_bench.o $(LIBOBJECTS) $(TESTUTIL)
+	$(CXX) tools/blob_store_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)
+
+signal_test: util/signal_test.o $(LIBOBJECTS)
+	$(CXX) util/signal_test.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+autovector_test: util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+column_family_test: db/column_family_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/column_family_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+table_properties_collector_test: db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+bloom_test: util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+dynamic_bloom_test: util/dynamic_bloom_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/dynamic_bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+c_test: db/c_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+cache_test: util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+blob_store_test: util/blob_store_test.o $(LIBOBJECTS) $(TESTHARNESS) $(TESTUTIL)
+	$(CXX) util/blob_store_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o$@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+stringappend_test: utilities/merge_operators/string_append/stringappend_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) utilities/merge_operators/string_append/stringappend_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+redis_test: utilities/redis/redis_lists_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) utilities/redis/redis_lists_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+benchharness_test: util/benchharness_test.o $(LIBOBJECTS) $(TESTHARNESS) $(BENCHHARNESS)
+	$(CXX) util/benchharness_test.o $(LIBOBJECTS) $(TESTHARNESS) $(BENCHHARNESS) $(EXEC_LDFLAGS) -o$@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+histogram_test: util/histogram_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/histogram_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o$@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+thread_local_test: util/thread_local_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/thread_local_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+corruption_test: db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+db_test: db/db_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+log_write_bench: util/log_write_bench.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/log_write_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -pg
+
+plain_table_db_test: db/plain_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/plain_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+simple_table_db_test: db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+table_reader_bench: table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -pg
+
+log_and_apply_bench: db/log_and_apply_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(BENCHHARNESS)
+	$(CXX) db/log_and_apply_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(BENCHHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -pg
+
+perf_context_test: db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS)
+
+prefix_test: db/prefix_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/prefix_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS)
+
+backupable_db_test: utilities/backupable/backupable_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) utilities/backupable/backupable_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)
+
+ttl_test: utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)
+
+dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+env_test: util/env_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+filter_block_test: table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+block_test: table/block_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) table/block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+skiplist_test: db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+version_set_test: db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+file_indexer_test : db/file_indexer_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/file_indexer_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+reduce_levels_test: tools/reduce_levels_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) tools/reduce_levels_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+merge_test: db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+deletefile_test: db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS)
+
+geodb_test: utilities/geodb/geodb_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) utilities/geodb/geodb_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+$(MEMENVLIBRARY) : $(MEMENVOBJECTS)
+	rm -f $@
+	$(AR) -rs $@ $(MEMENVOBJECTS)
+
+memenv_test : helpers/memenv/memenv_test.o $(MEMENVOBJECTS) $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) helpers/memenv/memenv_test.o $(MEMENVOBJECTS) $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+manual_compaction_test: util/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+rocksdb_shell: tools/shell/ShellContext.o tools/shell/ShellState.o tools/shell/LeveldbShell.o tools/shell/DBClientProxy.o tools/shell/ShellContext.h tools/shell/ShellState.h tools/shell/DBClientProxy.h $(LIBOBJECTS)
+	$(CXX) tools/shell/ShellContext.o tools/shell/ShellState.o tools/shell/LeveldbShell.o tools/shell/DBClientProxy.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+DBClientProxy_test: tools/shell/test/DBClientProxyTest.o tools/shell/DBClientProxy.o $(LIBRARY)
+	$(CXX) tools/shell/test/DBClientProxyTest.o tools/shell/DBClientProxy.o $(LIBRARY) $(EXEC_LDFLAGS) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)
+
+filelock_test: util/filelock_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/filelock_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+auto_roll_logger_test: util/auto_roll_logger_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/auto_roll_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+sst_dump: tools/sst_dump.o $(LIBOBJECTS)
+	$(CXX) tools/sst_dump.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+ldb: tools/ldb.o $(LIBOBJECTS)
+	$(CXX) tools/ldb.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+# ---------------------------------------------------------------------------
+# Jni stuff
+# ---------------------------------------------------------------------------
+
+JNI_NATIVE_SOURCES = ./java/rocksjni/*.cc
+JAVA_INCLUDE = -I$(JAVA_HOME)/include/ -I$(JAVA_HOME)/include/linux
+ROCKSDBJNILIB = ./java/librocksdbjni.so
+
+ifeq ($(PLATFORM), OS_MACOSX)
+ROCKSDBJNILIB = ./java/librocksdbjni.jnilib
+JAVA_INCLUDE = -I/System/Library/Frameworks/JavaVM.framework/Headers/
+endif
+
+rocksdbjava: clean
+	OPT="-fPIC -DNDEBUG -O2" $(MAKE) $(LIBRARY) -j32
+	cd java;$(MAKE) java;
+	rm -f $(ROCKSDBJNILIB)
+	$(CXX) $(CXXFLAGS) -I./java/. $(JAVA_INCLUDE) -shared -fPIC -o $(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) $(LIBOBJECTS) $(LDFLAGS) $(COVERAGEFLAGS)
+
+jclean:
+	cd java;$(MAKE) clean;
+	rm -f $(ROCKSDBJNILIB)
+
+jtest:
+	cd java;$(MAKE) sample;$(MAKE) test;
+
+jdb_bench:
+	cd java;$(MAKE) db_bench;
+
+# ---------------------------------------------------------------------------
+#  	Platform-specific compilation
+# ---------------------------------------------------------------------------
+
+ifeq ($(PLATFORM), IOS)
+# For iOS, create universal object files to be used on both the simulator and
+# a device.
+PLATFORMSROOT=/Applications/Xcode.app/Contents/Developer/Platforms
+SIMULATORROOT=$(PLATFORMSROOT)/iPhoneSimulator.platform/Developer
+DEVICEROOT=$(PLATFORMSROOT)/iPhoneOS.platform/Developer
+IOSVERSION=$(shell defaults read $(PLATFORMSROOT)/iPhoneOS.platform/version CFBundleShortVersionString)
+
+.cc.o:
+	mkdir -p ios-x86/$(dir $@)
+	$(CXX) $(CXXFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -arch x86_64 -c $< -o ios-x86/$@
+	mkdir -p ios-arm/$(dir $@)
+	xcrun -sdk iphoneos $(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -arch armv7s -arch arm64 -c $< -o ios-arm/$@
+	lipo ios-x86/$@ ios-arm/$@ -create -output $@
+
+.c.o:
+	mkdir -p ios-x86/$(dir $@)
+	$(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -arch x86_64 -c $< -o ios-x86/$@
+	mkdir -p ios-arm/$(dir $@)
+	xcrun -sdk iphoneos $(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -arch armv7s -arch arm64 -c $< -o ios-arm/$@
+	lipo ios-x86/$@ ios-arm/$@ -create -output $@
+
+else
+.cc.o:
+	$(CXX) $(CXXFLAGS) -c $< -o $@ $(COVERAGEFLAGS)
+
+.c.o:
+	$(CC) $(CFLAGS) -c $< -o $@
+endif
+
+# ---------------------------------------------------------------------------
+#  	Source files dependencies detection
+# ---------------------------------------------------------------------------
+
+# Add proper dependency support so changing a .h file forces a .cc file to
+# rebuild.
+
+# The .d file indicates .cc file's dependencies on .h files. We generate such
+# dependency by g++'s -MM option, whose output is a make dependency rule.
+# The sed command makes sure the "target" file in the generated .d file has
+# the correct path prefix.
+%.d: %.cc
+	$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) -MM $< -o $@
+ifeq ($(PLATFORM), OS_MACOSX)
+	@sed -i '' -e 's,.*:,$*.o:,' $@
+else
+	@sed -i -e 's,.*:,$*.o:,' $@
+endif
+
+DEPFILES = $(filter-out util/build_version.d,$(SOURCES:.cc=.d))
+
+depend: $(DEPFILES)
+
+# if the make goal is either "clean" or "format", we shouldn't
+# try to import the *.d files.
+# TODO(kailiu) The unfamiliarity of Make's conditions leads to the ugly
+# working solution.
+ifneq ($(MAKECMDGOALS),clean)
+ifneq ($(MAKECMDGOALS),format)
+ifneq ($(MAKECMDGOALS),jclean)
+ifneq ($(MAKECMDGOALS),jtest)
+-include $(DEPFILES)
+endif
+endif
+endif
+endif
--- a/23
+++ b/23
@@ -0,0 +1,23 @@
+Additional Grant of Patent Rights
+
+“Software” means the rocksdb software distributed by Facebook, Inc.
+
+Facebook hereby grants you a perpetual, worldwide, royalty-free,
+non-exclusive, irrevocable (subject to the termination provision below)
+license under any rights in any patent claims owned by Facebook, to make,
+have made, use, sell, offer to sell, import, and otherwise transfer the
+Software. For avoidance of doubt, no license is granted under Facebook’s
+rights in any patent claims that are infringed by (i) modifications to the
+Software made by you or a third party, or (ii) the Software in combination
+with any software or other technology provided by you or a third party.
+
+The license granted hereunder will terminate, automatically and without
+notice, for anyone that makes any claim (including by filing any lawsuit,
+assertion or other action) alleging (a) direct, indirect, or contributory
+infringement or inducement to infringe any patent: (i) by Facebook or any
+of its subsidiaries or affiliates, whether or not such claim is related
+to the Software, (ii) by any party if such claim arises in whole or in
+part from any software, product or service of Facebook or any of its
+subsidiaries or affiliates, whether or not such claim is related to the
+Software, or (iii) by any party relating to the Software; or (b) that
+any right in any patent claim of Facebook is invalid or unenforceable.
--- a/README.md
+++ b/README.md
@@ -0,0 +1,26 @@
+## RocksDB: A Persistent Key-Value Store for Flash and RAM Storage
+
+[![Build Status](https://travis-ci.org/facebook/rocksdb.svg?branch=master)](https://travis-ci.org/facebook/rocksdb)
+
+RocksDB is developed and maintained by Facebook Database Engineering Team.
+It is built on on earlier work on LevelDB by Sanjay Ghemawat (sanjay@google.com)
+and Jeff Dean (jeff@google.com)
+
+This code is a library that forms the core building block for a fast
+key value server, especially suited for storing data on flash drives.
+It has an Log-Structured-Merge-Database (LSM) design with flexible tradeoffs
+between Write-Amplification-Factor (WAF), Read-Amplification-Factor (RAF)
+and Space-Amplification-Factor (SAF). It has multi-threaded compactions,
+making it specially suitable for storing multiple terabytes of data in a
+single database.
+
+Start with example usage here: https://github.com/facebook/rocksdb/tree/master/examples
+
+See [doc/index.html](https://github.com/facebook/rocksdb/blob/master/doc/index.html) and
+[github wiki](https://github.com/facebook/rocksdb/wiki) for more explanation.
+
+The public interface is in `include/`.  Callers should not include or
+rely on the details of any other header files in this package.  Those
+internal APIs may be changed without warning.
+
+Design discussions are conducted in https://www.facebook.com/groups/rocksdb.dev/
--- a/ROCKSDB_LITE.md
+++ b/ROCKSDB_LITE.md
@@ -0,0 +1,20 @@
+# RocksDBLite
+
+RocksDBLite is a project focused on mobile use cases, which don't need a lot of fancy things we've built for server workloads and they are very sensitive to binary size. For that reason, we added a compile flag ROCKSDB_LITE that comments out a lot of the nonessential code and keeps the binary lean.
+
+Some examples of the features disabled by ROCKSDB_LITE:
+* compiled-in support for LDB tool
+* No backupable DB
+* No support for replication (which we provide in form of TrasactionalIterator)
+* No advanced monitoring tools
+* No special-purpose memtables that are highly optimized for specific use cases
+
+When adding a new big feature to RocksDB, please add ROCKSDB_LITE compile guard if:
+* Nobody from mobile really needs your feature,
+* Your feature is adding a lot of weight to the binary.
+
+Don't add ROCKSDB_LITE compile guard if:
+* It would introduce a lot of code complexity. Compile guards make code harder to read. It's a trade-off.
+* Your feature is not adding a lot of weight.
+
+If unsure, ask. :)
--- a/build_tools/build_detect_platform
+++ b/build_tools/build_detect_platform
@@ -0,0 +1,320 @@
+#!/bin/sh
+#
+# Detects OS we're compiling on and outputs a file specified by the first
+# argument, which in turn gets read while processing Makefile.
+#
+# The output will set the following variables:
+#   CC                          C Compiler path
+#   CXX                         C++ Compiler path
+#   PLATFORM_LDFLAGS            Linker flags
+#   PLATFORM_SHARED_EXT         Extension for shared libraries
+#   PLATFORM_SHARED_LDFLAGS     Flags for building shared library
+#   PLATFORM_SHARED_CFLAGS      Flags for compiling objects for shared library
+#   PLATFORM_CCFLAGS            C compiler flags
+#   PLATFORM_CXXFLAGS           C++ compiler flags.  Will contain:
+#   PLATFORM_SHARED_VERSIONED   Set to 'true' if platform supports versioned
+#                               shared libraries, empty otherwise.
+#
+# The PLATFORM_CCFLAGS and PLATFORM_CXXFLAGS might include the following:
+#
+#       -DLEVELDB_PLATFORM_POSIX if cstdatomic is present
+#       -DLEVELDB_PLATFORM_NOATOMIC if it is not
+#       -DSNAPPY                    if the Snappy library is present
+#       -DLZ4                       if the LZ4 library is present
+#
+# Using gflags in rocksdb:
+# Our project depends on gflags, which requires users to take some extra steps
+# before they can compile the whole repository:
+#   1. Install gflags. You may download it from here:
+#      https://code.google.com/p/gflags/
+#   2. Once install, add the include path/lib path for gflags to CPATH and
+#      LIBRARY_PATH respectively. If installed with default mode, the
+#      lib and include path will be /usr/local/lib and /usr/local/include
+# Mac user can do this by running build_tools/mac-install-gflags.sh
+
+OUTPUT=$1
+if test -z "$OUTPUT"; then
+  echo "usage: $0 <output-filename>" >&2
+  exit 1
+fi
+
+# we depend on C++11
+PLATFORM_CXXFLAGS="-std=c++11"
+# we currently depend on POSIX platform
+COMMON_FLAGS="-DROCKSDB_PLATFORM_POSIX"
+
+# Default to fbcode gcc on internal fb machines
+if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then
+    FBCODE_BUILD="true"
+    if [ -z "$USE_CLANG" ]; then
+        CENTOS_VERSION=`rpm -q --qf "%{VERSION}" \
+          $(rpm -q --whatprovides redhat-release)`
+        if [ "$CENTOS_VERSION" = "6" ]; then
+          source "$PWD/build_tools/fbcode.gcc481.sh"
+        else
+          source "$PWD/build_tools/fbcode.gcc471.sh"
+        fi
+    else
+        source "$PWD/build_tools/fbcode.clang31.sh"
+    fi
+fi
+
+# Delete existing output, if it exists
+rm -f "$OUTPUT"
+touch "$OUTPUT"
+
+if test -z "$CC"; then
+   CC=cc
+fi
+
+if test -z "$CXX"; then
+    CXX=g++
+fi
+
+# Detect OS
+if test -z "$TARGET_OS"; then
+    TARGET_OS=`uname -s`
+fi
+
+COMMON_FLAGS="$COMMON_FLAGS ${CFLAGS}"
+CROSS_COMPILE=
+PLATFORM_CCFLAGS=
+PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS ${CXXFLAGS}"
+PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS"
+PLATFORM_SHARED_EXT="so"
+PLATFORM_SHARED_LDFLAGS="-shared -Wl,-soname -Wl,"
+PLATFORM_SHARED_CFLAGS="-fPIC"
+PLATFORM_SHARED_VERSIONED=false
+
+# generic port files (working on all platform by #ifdef) go directly in /port
+GENERIC_PORT_FILES=`cd "$ROCKSDB_ROOT"; find port -name '*.cc' | tr "\n" " "`
+
+# On GCC, we pick libc's memcmp over GCC's memcmp via -fno-builtin-memcmp
+case "$TARGET_OS" in
+    Darwin)
+        PLATFORM=OS_MACOSX
+        COMMON_FLAGS="$COMMON_FLAGS -DOS_MACOSX"
+        PLATFORM_SHARED_EXT=dylib
+        PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name "
+        # PORT_FILES=port/darwin/darwin_specific.cc
+        ;;
+    IOS)
+        PLATFORM=IOS
+        COMMON_FLAGS="$COMMON_FLAGS -DOS_MACOSX -DIOS_CROSS_COMPILE -DROCKSDB_LITE"
+        PLATFORM_SHARED_EXT=dylib
+        PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name "
+        CROSS_COMPILE=true
+        ;;
+    Linux)
+        PLATFORM=OS_LINUX
+        COMMON_FLAGS="$COMMON_FLAGS -DOS_LINUX"
+        if [ -z "$USE_CLANG" ]; then
+            COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp"
+        fi
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt"
+        # PORT_FILES=port/linux/linux_specific.cc
+        ;;
+    SunOS)
+        PLATFORM=OS_SOLARIS
+        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_SOLARIS"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt"
+        # PORT_FILES=port/sunos/sunos_specific.cc
+        ;;
+    FreeBSD)
+        PLATFORM=OS_FREEBSD
+        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_FREEBSD"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread"
+        # PORT_FILES=port/freebsd/freebsd_specific.cc
+        ;;
+    NetBSD)
+        PLATFORM=OS_NETBSD
+        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_NETBSD"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lgcc_s"
+        # PORT_FILES=port/netbsd/netbsd_specific.cc
+        ;;
+    OpenBSD)
+        PLATFORM=OS_OPENBSD
+        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_OPENBSD"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -pthread"
+        # PORT_FILES=port/openbsd/openbsd_specific.cc
+        ;;
+    DragonFly)
+        PLATFORM=OS_DRAGONFLYBSD
+        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_DRAGONFLYBSD"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread"
+        # PORT_FILES=port/dragonfly/dragonfly_specific.cc
+        ;;
+    OS_ANDROID_CROSSCOMPILE)
+        PLATFORM=OS_ANDROID
+	COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_ANDROID -DLEVELDB_PLATFORM_POSIX"
+	PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS "  # All pthread features are in the Android C library
+        # PORT_FILES=port/android/android.cc
+        CROSS_COMPILE=true
+        ;;
+    *)
+        echo "Unknown platform!" >&2
+        exit 1
+esac
+
+if test -z "$DO_NOT_RUN_BUILD_DETECT_VERSION"; then
+  "$PWD/build_tools/build_detect_version"
+fi
+
+# We want to make a list of all cc files within util, db, table, and helpers
+# except for the test and benchmark files. By default, find will output a list
+# of all files matching either rule, so we need to append -print to make the
+# prune take effect.
+DIRS="util db table utilities"
+
+set -f # temporarily disable globbing so that our patterns arent expanded
+PRUNE_TEST="-name *test*.cc -prune"
+PRUNE_BENCH="-name *bench*.cc -prune"
+PORTABLE_FILES=`cd "$ROCKSDB_ROOT"; find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cc' -print | sort | tr "\n" " "`
+PORTABLE_CPP=`cd "$ROCKSDB_ROOT"; find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cpp' -print | sort | tr "\n" " "`
+set +f # re-enable globbing
+
+# The sources consist of the portable files, plus the platform-specific port
+# file.
+echo "SOURCES=$PORTABLE_FILES $GENERIC_PORT_FILES $PORT_FILES" >> "$OUTPUT"
+echo "SOURCESCPP=$PORTABLE_CPP" >> "$OUTPUT"
+echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> "$OUTPUT"
+
+if [ "$CROSS_COMPILE" = "true" -o "$FBCODE_BUILD" = "true" ]; then
+    # Cross-compiling; do not try any compilation tests.
+    # Also don't need any compilation tests if compiling on fbcode
+    true
+else
+    # If -std=c++0x works, use <atomic>.  Otherwise use port_posix.h.
+    $CXX $CFLAGS -std=c++0x -x c++ - -o /dev/null 2>/dev/null  <<EOF
+      #include <atomic>
+      int main() {}
+EOF
+    if [ "$?" = 0 ]; then
+        COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_ATOMIC_PRESENT"
+    fi
+
+    # Test whether fallocate is available
+    $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null  <<EOF
+      #include <fcntl.h>
+      int main() {
+	int fd = open("/dev/null", 0);
+	fallocate(fd, 0, 0, 1024);
+      }
+EOF
+    if [ "$?" = 0 ]; then
+        COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_FALLOCATE_PRESENT"
+    fi
+
+    # Test whether Snappy library is installed
+    # http://code.google.com/p/snappy/
+    $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null  <<EOF
+      #include <snappy.h>
+      int main() {}
+EOF
+    if [ "$?" = 0 ]; then
+        COMMON_FLAGS="$COMMON_FLAGS -DSNAPPY"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lsnappy"
+    fi
+
+
+    # Test whether gflags library is installed
+    # http://code.google.com/p/gflags/
+    # check if the namespace is gflags
+    $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null  <<EOF
+      #include <gflags/gflags.h>
+      using namespace gflags;
+      int main() {}
+EOF
+    if [ "$?" = 0 ]; then
+        COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=gflags"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags"
+    fi
+
+    # check if namespace is google
+    $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null  <<EOF
+      #include <gflags/gflags.h>
+      using namespace google;
+      int main() {}
+EOF
+    if [ "$?" = 0 ]; then
+        COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=google"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags"
+    fi
+
+    # Test whether zlib library is installed
+    $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null  <<EOF
+      #include <zlib.h>
+      int main() {}
+EOF
+    if [ "$?" = 0 ]; then
+        COMMON_FLAGS="$COMMON_FLAGS -DZLIB"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lz"
+    fi
+
+    # Test whether bzip library is installed
+    $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null  <<EOF
+      #include <bzlib.h>
+      int main() {}
+EOF
+    if [ "$?" = 0 ]; then
+        COMMON_FLAGS="$COMMON_FLAGS -DBZIP2"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lbz2"
+    fi
+
+    # Test whether lz4 library is installed
+    $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null  <<EOF
+      #include <lz4.h>
+      #include <lz4hc.h>
+      int main() {}
+EOF
+    if [ "$?" = 0 ]; then
+        COMMON_FLAGS="$COMMON_FLAGS -DLZ4"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -llz4"
+    fi
+
+    # Test whether tcmalloc is available
+    $CXX $CFLAGS -x c++ - -o /dev/null -ltcmalloc 2>/dev/null  <<EOF
+      int main() {}
+EOF
+    if [ "$?" = 0 ]; then
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -ltcmalloc"
+    fi
+fi
+
+# shall we use HDFS?
+
+if test "$USE_HDFS"; then
+  if test -z "$JAVA_HOME"; then
+    echo "JAVA_HOME has to be set for HDFS usage."
+    exit 1
+  fi
+  HDFS_CCFLAGS="$HDFS_CCFLAGS -I$JAVA_HOME/include -I$JAVA_HOME/include/linux -DUSE_HDFS"
+  HDFS_LDFLAGS="$HDFS_LDFLAGS -Wl,--no-whole-archive -lhdfs -L$JAVA_HOME/jre/lib/amd64"
+  HDFS_LDFLAGS="$HDFS_LDFLAGS -L$JAVA_HOME/jre/lib/amd64/server -L$GLIBC_RUNTIME_PATH/lib"
+  HDFS_LDFLAGS="$HDFS_LDFLAGS -ldl -lverify -ljava -ljvm"
+  COMMON_FLAGS="$COMMON_FLAGS $HDFS_CCFLAGS"
+  PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS $HDFS_LDFLAGS"
+fi
+
+# if Intel SSE instruction set is supported, set USE_SSE=" -msse -msse4.2 "
+COMMON_FLAGS="$COMMON_FLAGS $USE_SSE"
+
+PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS"
+PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS"
+
+VALGRIND_VER="$VALGRIND_VER"
+
+echo "CC=$CC" >> "$OUTPUT"
+echo "CXX=$CXX" >> "$OUTPUT"
+echo "PLATFORM=$PLATFORM" >> "$OUTPUT"
+echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> "$OUTPUT"
+echo "VALGRIND_VER=$VALGRIND_VER" >> "$OUTPUT"
+echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> "$OUTPUT"
+echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> "$OUTPUT"
+echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> "$OUTPUT"
+echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> "$OUTPUT"
+echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> "$OUTPUT"
+echo "PLATFORM_SHARED_VERSIONED=$PLATFORM_SHARED_VERSIONED" >> "$OUTPUT"
+echo "EXEC_LDFLAGS=$EXEC_LDFLAGS" >> "$OUTPUT"
+echo "JEMALLOC_INCLUDE=$JEMALLOC_INCLUDE" >> "$OUTPUT"
+echo "JEMALLOC_LIB=$JEMALLOC_LIB" >> "$OUTPUT"
--- a/build_tools/build_detect_version
+++ b/build_tools/build_detect_version
@@ -0,0 +1,22 @@
+#!/bin/sh
+#
+# Record the version of the source that we are compiling.
+# We keep a record of the git revision in util/version.cc. This source file
+# is then built as a regular source file as part of the compilation process.
+# One can run "strings executable_filename | grep _build_" to find the version of
+# the source that we used to build the executable file.
+
+OUTFILE="$PWD/util/build_version.cc"
+
+GIT_SHA=""
+if command -v git >/dev/null 2>&1; then
+    GIT_SHA=$(git rev-parse HEAD 2>/dev/null)
+fi
+
+cat > "${OUTFILE}" <<EOF
+#include "build_version.h"
+const char* rocksdb_build_git_sha = "rocksdb_build_git_sha:${GIT_SHA}";
+const char* rocksdb_build_git_datetime = "rocksdb_build_git_datetime:$(date)";
+const char* rocksdb_build_compile_date = __DATE__;
+const char* rocksdb_build_compile_time = __TIME__;
+EOF
--- a/build_tools/fbcode.clang31.sh
+++ b/build_tools/fbcode.clang31.sh
@@ -0,0 +1,74 @@
+#!/bin/sh
+#
+# Set environment variables so that we can compile leveldb using
+# fbcode settings.  It uses the latest g++ compiler and also
+# uses jemalloc
+
+TOOLCHAIN_REV=fbe3b095a4cc4a3713730050d182b7b4a80c342f
+TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos5.2-native"
+TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.7.1-glibc-2.14.1"
+TOOL_JEMALLOC=jemalloc-3.3.1/9202ce3
+GLIBC_RUNTIME_PATH=/usr/local/fbcode/gcc-4.7.1-glibc-2.14.1
+
+# location of libgcc
+LIBGCC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include"
+LIBGCC_LIBS=" -L $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/libs"
+
+# location of glibc
+GLIBC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/include"
+GLIBC_LIBS=" -L $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/lib"
+
+# location of snappy headers and libraries
+SNAPPY_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/include"
+SNAPPY_LIBS=" $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/lib/libsnappy.a"
+
+# location of zlib headers and libraries
+ZLIB_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/include"
+ZLIB_LIBS=" $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/lib/libz.a"
+
+# location of gflags headers and libraries
+GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/include"
+GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/lib/libgflags.a"
+
+# location of bzip headers and libraries
+BZIP_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/include"
+BZIP_LIBS=" $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/lib/libbz2.a"
+
+# location of gflags headers and libraries
+GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/include"
+GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/lib/libgflags.a"
+
+# use Intel SSE support for checksum calculations
+export USE_SSE=" -msse -msse4.2 "
+
+CC="$TOOLCHAIN_EXECUTABLES/clang/clang-3.2/0b7c69d/bin/clang $CLANG_INCLUDES"
+CXX="$TOOLCHAIN_EXECUTABLES/clang/clang-3.2/0b7c69d/bin/clang++ $CLANG_INCLUDES $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $GFLAGS_INCLUDE"
+AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar
+RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib
+
+CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin -nostdlib "
+CFLAGS+=" -nostdinc -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include/c++/4.7.1 "
+CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include/c++/4.7.1/x86_64-facebook-linux "
+CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include/c++/4.7.1/backward "
+CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/include "
+CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include "
+CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/clang/clang-3.2/0b7c69d/lib/clang/3.2/include "
+CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/kernel-headers/kernel-headers-3.2.18_70_fbk11_00129_gc8882d0/da39a3e/include/linux "
+CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/kernel-headers/kernel-headers-3.2.18_70_fbk11_00129_gc8882d0/da39a3e/include "
+CFLAGS+=" -Wall -Wno-sign-compare -Wno-unused-variable -Winvalid-pch -Wno-deprecated -Woverloaded-virtual"
+CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE"
+CXXFLAGS="$CFLAGS -nostdinc++"
+
+CFLAGS+=" -I $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/include -DHAVE_JEMALLOC"
+
+EXEC_LDFLAGS=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/lib/libjemalloc.a"
+EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/350336c/lib/libunwind.a"
+EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $GFLAGS_LIBS"
+EXEC_LDFLAGS+=" -Wl,--dynamic-linker,$GLIBC_RUNTIME_PATH/lib/ld-linux-x86-64.so.2"
+EXEC_LDFLAGS+=" -B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin"
+
+PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS "
+
+EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $GFLAGS_LIBS"
+
+export CC CXX AR RANLIB CFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED 
--- a/build_tools/fbcode.gcc471.sh
+++ b/build_tools/fbcode.gcc471.sh
@@ -0,0 +1,70 @@
+#!/bin/sh
+#
+# Set environment variables so that we can compile leveldb using
+# fbcode settings.  It uses the latest g++ compiler and also
+# uses jemalloc
+
+TOOLCHAIN_REV=fbe3b095a4cc4a3713730050d182b7b4a80c342f
+TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos5.2-native"
+TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.7.1-glibc-2.14.1"
+TOOL_JEMALLOC=jemalloc-3.3.1/9202ce3
+
+# location of libhdfs libraries
+if test "$USE_HDFS"; then
+  JAVA_HOME="/usr/local/jdk-6u22-64"
+  JINCLUDE="-I$JAVA_HOME/include -I$JAVA_HOME/include/linux"
+  GLIBC_RUNTIME_PATH="/usr/local/fbcode/gcc-4.7.1-glibc-2.14.1"
+  HDFSLIB=" -Wl,--no-whole-archive hdfs/libhdfs.a -L$JAVA_HOME/jre/lib/amd64 "
+  HDFSLIB+=" -L$JAVA_HOME/jre/lib/amd64/server -L$GLIBC_RUNTIME_PATH/lib "
+  HDFSLIB+=" -ldl -lverify -ljava -ljvm "
+fi
+
+# location of libgcc
+LIBGCC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include"
+LIBGCC_LIBS=" -L $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/libs"
+
+# location of glibc
+GLIBC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/include"
+GLIBC_LIBS=" -L $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/lib"
+
+# location of snappy headers and libraries
+SNAPPY_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/include"
+SNAPPY_LIBS=" $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/lib/libsnappy.a"
+
+# location of zlib headers and libraries
+ZLIB_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/include"
+ZLIB_LIBS=" $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/lib/libz.a"
+
+# location of bzip headers and libraries
+BZIP_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/include"
+BZIP_LIBS=" $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/lib/libbz2.a"
+
+# location of gflags headers and libraries
+GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/include"
+GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/lib/libgflags.a"
+
+# use Intel SSE support for checksum calculations
+export USE_SSE=" -msse -msse4.2 "
+
+CC="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1-glibc-2.14.1/bin/gcc"
+CXX="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1-glibc-2.14.1/bin/g++ $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $GFLAGS_INCLUDE"
+AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar
+RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib
+
+CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold -m64 -mtune=generic"
+CFLAGS+=" -I $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/include -DHAVE_JEMALLOC"
+CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE"
+CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_ATOMIC_PRESENT -DROCKSDB_FALLOCATE_PRESENT"
+CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2"
+
+EXEC_LDFLAGS=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/lib/libjemalloc.a"
+EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/350336c/lib/libunwind.a"
+EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $GFLAGS_LIBS"
+
+PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS "
+
+EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $GFLAGS_LIBS"
+
+VALGRIND_VER="$TOOLCHAIN_LIB_BASE/valgrind/valgrind-3.8.1/91ddd43/bin/"
+
+export CC CXX AR RANLIB CFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER
--- a/build_tools/fbcode.gcc481.sh
+++ b/build_tools/fbcode.gcc481.sh
@@ -0,0 +1,81 @@
+#!/bin/sh
+#
+# Set environment variables so that we can compile rocksdb using
+# fbcode settings.  It uses the latest g++ compiler and also
+# uses jemalloc
+
+TOOLCHAIN_REV=53dc1fe83f84e9145b9ffb81b81aa7f6a49c87cc
+CENTOS_VERSION=`rpm -q --qf "%{VERSION}" $(rpm -q --whatprovides redhat-release)`
+if [ "$CENTOS_VERSION" = "6" ]; then
+  TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos6-native"
+else
+  TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos5.2-native"
+fi
+TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.8.1-glibc-2.17"
+
+# location of libhdfs libraries
+if test "$USE_HDFS"; then
+  JAVA_HOME="/usr/local/jdk-6u22-64"
+  JINCLUDE="-I$JAVA_HOME/include -I$JAVA_HOME/include/linux"
+  GLIBC_RUNTIME_PATH="/usr/local/fbcode/gcc-4.8.1-glibc-2.17"
+  HDFSLIB=" -Wl,--no-whole-archive hdfs/libhdfs.a -L$JAVA_HOME/jre/lib/amd64 "
+  HDFSLIB+=" -L$JAVA_HOME/jre/lib/amd64/server -L$GLIBC_RUNTIME_PATH/lib "
+  HDFSLIB+=" -ldl -lverify -ljava -ljvm "
+fi
+
+# location of libgcc
+LIBGCC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.8.1/8aac7fc/include"
+LIBGCC_LIBS=" -L $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.8.1/8aac7fc/libs"
+
+# location of glibc
+GLIBC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/glibc/glibc-2.17/99df8fc/include"
+GLIBC_LIBS=" -L $TOOLCHAIN_LIB_BASE/glibc/glibc-2.17/99df8fc/lib"
+
+# location of snappy headers and libraries
+SNAPPY_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/43d84e2/include"
+SNAPPY_LIBS=" $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/43d84e2/lib/libsnappy.a"
+
+# location of zlib headers and libraries
+ZLIB_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/c3f970a/include"
+ZLIB_LIBS=" $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/c3f970a/lib/libz.a"
+
+# location of bzip headers and libraries
+BZIP_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/c3f970a/include"
+BZIP_LIBS=" $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/c3f970a/lib/libbz2.a"
+
+LZ4_REV=065ec7e38fe83329031f6668c43bef83eff5808b
+LZ4_INCLUDE=" -I /mnt/gvfs/third-party2/lz4/$LZ4_REV/r108/gcc-4.8.1-glibc-2.17/c3f970a/include"
+LZ4_LIBS=" /mnt/gvfs/third-party2/lz4/$LZ4_REV/r108/gcc-4.8.1-glibc-2.17/c3f970a/lib/liblz4.a"
+
+# location of gflags headers and libraries
+GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/c3f970a/include"
+GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/c3f970a/lib/libgflags.a"
+
+# location of jemalloc
+JEMALLOC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/jemalloc/jemalloc-3.4.1/4d53c6f/include/"
+JEMALLOC_LIB=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/jemalloc-3.4.1/4d53c6f/lib/libjemalloc.a"
+
+# use Intel SSE support for checksum calculations
+export USE_SSE=" -msse -msse4.2 "
+
+CC="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.8.1/cc6c9dc/bin/gcc"
+CXX="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.8.1/cc6c9dc/bin/g++ $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $GFLAGS_INCLUDE"
+AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar
+RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib
+
+CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold -m64 -mtune=generic"
+CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE"
+CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_ATOMIC_PRESENT -DROCKSDB_FALLOCATE_PRESENT"
+CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2 -DLZ4"
+
+EXEC_LDFLAGS="-Wl,--dynamic-linker,/usr/local/fbcode/gcc-4.8.1-glibc-2.17/lib/ld.so"
+EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/675d945/lib/libunwind.a"
+EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS"
+
+PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS "
+
+EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS"
+
+VALGRIND_VER="$TOOLCHAIN_LIB_BASE/valgrind/valgrind-3.8.1/c3f970a/bin/"
+
+export CC CXX AR RANLIB CFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE
--- a/build_tools/format-diff.sh
+++ b/build_tools/format-diff.sh
@@ -0,0 +1,107 @@
+#!/bin/bash
+# If clang_format_diff.py command is not specfied, we assume we are able to
+# access directly without any path.
+if [ -z $CLANG_FORMAT_DIFF ]
+then
+CLANG_FORMAT_DIFF="clang-format-diff.py"
+fi
+
+# Check clang-format-diff.py
+if ! which $CLANG_FORMAT_DIFF &> /dev/null
+then
+  echo "You didn't have clang-format-diff.py available in your computer!"
+  echo "You can download it by running: "
+  echo "    curl http://goo.gl/iUW1u2"
+  exit 128
+fi
+
+# Check argparse, a library that clang-format-diff.py requires.
+python 2>/dev/null << EOF
+import argparse
+EOF
+
+if [ "$?" != 0 ]
+then
+  echo "To run clang-format-diff.py, we'll need the library "argparse" to be"
+  echo "installed. You can try either of the follow ways to install it:"
+  echo "  1. Manually download argparse: https://pypi.python.org/pypi/argparse"
+  echo "  2. easy_install argparse (if you have easy_install)"
+  echo "  3. pip install argparse (if you have pip)"
+  exit 129
+fi
+
+# TODO(kailiu) following work is not complete since we still need to figure
+# out how to add the modified files done pre-commit hook to git's commit index.
+#
+# Check if this script has already been added to pre-commit hook.
+# Will suggest user to add this script to pre-commit hook if their pre-commit
+# is empty.
+# PRE_COMMIT_SCRIPT_PATH="`git rev-parse --show-toplevel`/.git/hooks/pre-commit"
+# if ! ls $PRE_COMMIT_SCRIPT_PATH &> /dev/null
+# then
+#   echo "Would you like to add this script to pre-commit hook, which will do "
+#   echo -n "the format check for all the affected lines before you check in (y/n):"
+#   read add_to_hook
+#   if [ "$add_to_hook" == "y" ]
+#   then
+#     ln -s `git rev-parse --show-toplevel`/build_tools/format-diff.sh $PRE_COMMIT_SCRIPT_PATH
+#   fi
+# fi
+set -e
+
+uncommitted_code=`git diff HEAD`
+
+# If there's no uncommitted changes, we assume user are doing post-commit
+# format check, in which case we'll check the modified lines from latest commit.
+# Otherwise, we'll check format of the uncommitted code only.
+if [ -z "$uncommitted_code" ]
+then
+  # Check the format of last commit
+  diffs=$(git diff -U0 HEAD^ | $CLANG_FORMAT_DIFF -p 1)
+else
+  # Check the format of uncommitted lines,
+  diffs=$(git diff -U0 HEAD | $CLANG_FORMAT_DIFF -p 1)
+fi
+
+if [ -z "$diffs" ]
+then
+  echo "Nothing needs to be reformatted!"
+  exit 0
+fi
+
+# Highlight the insertion/deletion from the clang-format-diff.py's output
+COLOR_END="\033[0m"
+COLOR_RED="\033[0;31m" 
+COLOR_GREEN="\033[0;32m" 
+
+echo -e "Detect lines that doesn't follow the format rules:\r"
+# Add the color to the diff. lines added will be green; lines removed will be red.
+echo "$diffs" | 
+  sed -e "s/\(^-.*$\)/`echo -e \"$COLOR_RED\1$COLOR_END\"`/" |
+  sed -e "s/\(^+.*$\)/`echo -e \"$COLOR_GREEN\1$COLOR_END\"`/"
+echo -e "Would you like to fix the format automatically (y/n): \c"
+
+# Make sure under any mode, we can read user input.
+exec < /dev/tty
+read to_fix
+
+if [ "$to_fix" != "y" ]
+then
+  exit 1
+fi
+
+# Do in-place format adjustment.
+git diff -U0 HEAD^ | $CLANG_FORMAT_DIFF -i -p 1
+echo "Files reformatted!"
+
+# Amend to last commit if user do the post-commit format check
+if [ -z "$uncommitted_code" ]; then
+  echo -e "Would you like to amend the changes to last commit (`git log HEAD --oneline | head -1`)? (y/n): \c"
+  read to_amend
+
+  if [ "$to_amend" == "y" ]
+  then
+    git commit -a --amend --reuse-message HEAD
+    echo "Amended to last commit"
+  fi
+fi
--- a/build_tools/mac-install-gflags.sh
+++ b/build_tools/mac-install-gflags.sh
@@ -0,0 +1,25 @@
+#!/bin/sh
+# Install gflags for mac developers.
+
+set -e
+
+DIR=`mktemp -d /tmp/rocksdb_gflags_XXXX`
+
+cd $DIR
+wget https://gflags.googlecode.com/files/gflags-2.0.tar.gz
+tar xvfz gflags-2.0.tar.gz
+cd gflags-2.0
+
+./configure
+make
+make install
+
+# Add include/lib path for g++
+echo 'export LIBRARY_PATH+=":/usr/local/lib"' >> ~/.bash_profile
+echo 'export CPATH+=":/usr/local/include"' >> ~/.bash_profile
+
+echo ""
+echo "-----------------------------------------------------------------------------"
+echo "|                         Installation Completed                            |"
+echo "-----------------------------------------------------------------------------"
+echo "Please run `. ~/bash_profile` to be able to compile with gflags"
--- a/build_tools/make_new_version.sh
+++ b/build_tools/make_new_version.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+#  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+#  This source code is licensed under the BSD-style license found in the
+#  LICENSE file in the root directory of this source tree. An additional grant
+#  of patent rights can be found in the PATENTS file in the same directory.
+
+set -e
+if [ -z "$GIT" ]
+then
+  GIT="git"
+fi
+
+# Print out the colored progress info so that it can be brainlessly 
+# distinguished by users.
+function title() {
+  echo -e "\033[1;32m$*\033[0m"
+}
+
+usage="Create new RocksDB version and prepare it for the release process\n"
+usage+="USAGE: ./make_new_version.sh <version>"
+
+# -- Pre-check
+if [[ $# < 1 ]]; then
+  echo -e $usage
+  exit 1
+fi
+
+ROCKSDB_VERSION=$1
+
+GIT_BRANCH=`git rev-parse --abbrev-ref HEAD`
+echo $GIT_BRANCH
+
+if [ $GIT_BRANCH != "master" ]; then
+  echo "Error: Current branch is '$GIT_BRANCH', Please switch to master branch."
+  exit 1
+fi
+
+title "Adding new tag for this release ..."
+BRANCH="$ROCKSDB_VERSION.fb"
+$GIT co -b $BRANCH
+
+# Setting up the proxy for remote repo access
+title "Pushing new branch to remote repo ..."
+git push origin --set-upstream $BRANCH
+
+title "Branch $BRANCH is pushed to github;"
--- a/build_tools/regression_build_test.sh
+++ b/build_tools/regression_build_test.sh
@@ -0,0 +1,330 @@
+#!/bin/bash
+
+set -e
+
+NUM=10000000
+
+if [ $# -eq 1 ];then
+  DATA_DIR=$1
+elif [ $# -eq 2 ];then
+  DATA_DIR=$1
+  STAT_FILE=$2
+fi
+
+# On the production build servers, set data and stat
+# files/directories not in /tmp or else the tempdir cleaning
+# scripts will make you very unhappy.
+DATA_DIR=${DATA_DIR:-$(mktemp -t -d rocksdb_XXXX)}
+STAT_FILE=${STAT_FILE:-$(mktemp -t -u rocksdb_test_stats_XXXX)}
+
+function cleanup {
+  rm -rf $DATA_DIR
+  rm -f $STAT_FILE.fillseq
+  rm -f $STAT_FILE.readrandom
+  rm -f $STAT_FILE.overwrite
+  rm -f $STAT_FILE.memtablefillreadrandom
+}
+
+trap cleanup EXIT
+
+if [ -z $GIT_BRANCH ]; then
+  git_br=`git rev-parse --abbrev-ref HEAD`
+else
+  git_br=$(basename $GIT_BRANCH)
+fi
+
+if [ $git_br == "master" ]; then
+  git_br=""
+else
+  git_br="."$git_br
+fi
+
+make release
+
+# measure fillseq + fill up the DB for overwrite benchmark
+./db_bench \
+    --benchmarks=fillseq \
+    --db=$DATA_DIR \
+    --use_existing_db=0 \
+    --bloom_bits=10 \
+    --num=$NUM \
+    --writes=$NUM \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0  > ${STAT_FILE}.fillseq
+
+# measure overwrite performance
+./db_bench \
+    --benchmarks=overwrite \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$NUM \
+    --writes=$((NUM / 10)) \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6  \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=8 > ${STAT_FILE}.overwrite
+
+# fill up the db for readrandom benchmark (1GB total size)
+./db_bench \
+    --benchmarks=fillseq \
+    --db=$DATA_DIR \
+    --use_existing_db=0 \
+    --bloom_bits=10 \
+    --num=$NUM \
+    --writes=$NUM \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=1 > /dev/null
+
+# measure readrandom with 6GB block cache
+./db_bench \
+    --benchmarks=readrandom \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$NUM \
+    --reads=$((NUM / 5)) \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --disable_seek_compaction=1 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=16 > ${STAT_FILE}.readrandom
+
+# measure readrandom with 6GB block cache and tailing iterator
+./db_bench \
+    --benchmarks=readrandom \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$NUM \
+    --reads=$((NUM / 5)) \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --disable_seek_compaction=1 \
+    --use_tailing_iterator=1 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=16 > ${STAT_FILE}.readrandomtailing
+
+# measure readrandom with 100MB block cache
+./db_bench \
+    --benchmarks=readrandom \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$NUM \
+    --reads=$((NUM / 5)) \
+    --cache_size=104857600 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --disable_seek_compaction=1 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=16 > ${STAT_FILE}.readrandomsmallblockcache
+
+# measure readrandom with 8k data in memtable
+./db_bench \
+    --benchmarks=overwrite,readrandom \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$NUM \
+    --reads=$((NUM / 5)) \
+    --writes=512 \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --write_buffer_size=1000000000 \
+    --open_files=55000 \
+    --disable_seek_compaction=1 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=16 > ${STAT_FILE}.readrandom_mem_sst
+
+
+# fill up the db for readrandom benchmark with filluniquerandom (1GB total size)
+./db_bench \
+    --benchmarks=filluniquerandom \
+    --db=$DATA_DIR \
+    --use_existing_db=0 \
+    --bloom_bits=10 \
+    --num=$((NUM / 4)) \
+    --writes=$((NUM / 4)) \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=1 > /dev/null
+
+# dummy test just to compact the data
+./db_bench \
+    --benchmarks=readrandom \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$((NUM / 1000)) \
+    --reads=$((NUM / 1000)) \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=16 > /dev/null
+
+# measure readrandom after load with filluniquerandom with 6GB block cache
+./db_bench \
+    --benchmarks=readrandom \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$((NUM / 4)) \
+    --reads=$((NUM / 4)) \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --disable_seek_compaction=1 \
+    --disable_auto_compactions=1 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=16 > ${STAT_FILE}.readrandom_filluniquerandom
+
+# measure readwhilewriting after load with filluniquerandom with 6GB block cache
+./db_bench \
+    --benchmarks=readwhilewriting \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$((NUM / 4)) \
+    --reads=$((NUM / 4)) \
+    --writes_per_second=1000 \
+    --write_buffer_size=100000000 \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --disable_seek_compaction=1 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=16 > ${STAT_FILE}.readwhilewriting
+
+# measure memtable performance -- none of the data gets flushed to disk
+./db_bench \
+    --benchmarks=fillrandom,readrandom, \
+    --db=$DATA_DIR \
+    --use_existing_db=0 \
+    --num=$((NUM / 10)) \
+    --reads=$NUM \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --write_buffer_size=1000000000 \
+    --open_files=55000 \
+    --disable_seek_compaction=1 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --value_size=10 \
+    --threads=16 > ${STAT_FILE}.memtablefillreadrandom
+
+# send data to ods
+function send_to_ods {
+  key="$1"
+  value="$2"
+
+  if [ -z $JENKINS_HOME ]; then
+    # running on devbox, just print out the values
+    echo $1 $2
+    return
+  fi
+
+  if [ -z "$value" ];then
+    echo >&2 "ERROR: Key $key doesn't have a value."
+    return
+  fi
+  curl -s "https://www.intern.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build$git_br&key=$key&value=$value" \
+    --connect-timeout 60
+}
+
+function send_benchmark_to_ods {
+  bench="$1"
+  bench_key="$2"
+  file="$3"
+
+  QPS=$(grep $bench $file | awk '{print $5}')
+  P50_MICROS=$(grep $bench $file -A 6 | grep "Percentiles" | awk '{print $3}' )
+  P75_MICROS=$(grep $bench $file -A 6 | grep "Percentiles" | awk '{print $5}' )
+  P99_MICROS=$(grep $bench $file -A 6 | grep "Percentiles" | awk '{print $7}' )
+
+  send_to_ods rocksdb.build.$bench_key.qps $QPS
+  send_to_ods rocksdb.build.$bench_key.p50_micros $P50_MICROS
+  send_to_ods rocksdb.build.$bench_key.p75_micros $P75_MICROS
+  send_to_ods rocksdb.build.$bench_key.p99_micros $P99_MICROS
+}
+
+send_benchmark_to_ods overwrite overwrite $STAT_FILE.overwrite
+send_benchmark_to_ods fillseq fillseq $STAT_FILE.fillseq
+send_benchmark_to_ods readrandom readrandom $STAT_FILE.readrandom
+send_benchmark_to_ods readrandom readrandom_tailing $STAT_FILE.readrandomtailing
+send_benchmark_to_ods readrandom readrandom_smallblockcache $STAT_FILE.readrandomsmallblockcache
+send_benchmark_to_ods readrandom readrandom_memtable_sst $STAT_FILE.readrandom_mem_sst
+send_benchmark_to_ods readrandom readrandom_fillunique_random $STAT_FILE.readrandom_filluniquerandom
+send_benchmark_to_ods fillrandom memtablefillrandom $STAT_FILE.memtablefillreadrandom
+send_benchmark_to_ods readrandom memtablereadrandom $STAT_FILE.memtablefillreadrandom
+send_benchmark_to_ods readwhilewriting readwhilewriting $STAT_FILE.readwhilewriting
--- a/build_tools/valgrind_test.sh
+++ b/build_tools/valgrind_test.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+#A shell script for Jenknis to run valgrind on rocksdb tests
+#Returns 0 on success when there are no failed tests 
+
+VALGRIND_DIR=build_tools/VALGRIND_LOGS
+make clean
+make -j$(nproc) valgrind_check
+NUM_FAILED_TESTS=$((`wc -l $VALGRIND_DIR/valgrind_failed_tests | awk '{print $1}'` - 1))
+if [ $NUM_FAILED_TESTS -lt 1 ]; then
+  echo No tests have valgrind errors
+  exit 0
+else
+  cat $VALGRIND_DIR/valgrind_failed_tests
+  exit 1
+fi
--- a/coverage/coverage_test.sh
+++ b/coverage/coverage_test.sh
@@ -0,0 +1,78 @@
+#!/bin/bash
+
+# Exit on error.
+set -e
+
+if [ -n "$USE_CLANG" ]; then
+  echo "Error: Coverage test is supported only for gcc."
+  exit 1
+fi
+
+ROOT=".."
+# Fetch right version of gcov
+if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then
+  source $ROOT/build_tools/fbcode.gcc471.sh
+  GCOV=$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1/cc6c9dc/bin/gcov
+else
+  GCOV=$(which gcov)
+fi
+
+COVERAGE_DIR="$PWD/COVERAGE_REPORT"
+mkdir -p $COVERAGE_DIR
+
+# Find all gcno files to generate the coverage report
+
+GCNO_FILES=`find $ROOT -name "*.gcno"`
+$GCOV --preserve-paths --relative-only --no-output $GCNO_FILES 2>/dev/null |
+  # Parse the raw gcov report to more human readable form.
+  python $ROOT/coverage/parse_gcov_output.py |
+  # Write the output to both stdout and report file.
+  tee $COVERAGE_DIR/coverage_report_all.txt &&
+echo -e "Generated coverage report for all files: $COVERAGE_DIR/coverage_report_all.txt\n"
+
+# TODO: we also need to get the files of the latest commits.
+# Get the most recently committed files.
+LATEST_FILES=`
+  git show --pretty="format:" --name-only HEAD |
+  grep -v "^$" |
+  paste -s -d,`
+RECENT_REPORT=$COVERAGE_DIR/coverage_report_recent.txt
+
+echo -e "Recently updated files: $LATEST_FILES\n" > $RECENT_REPORT
+$GCOV --preserve-paths --relative-only --no-output $GCNO_FILES 2>/dev/null |
+  python $ROOT/coverage/parse_gcov_output.py -interested-files $LATEST_FILES |
+  tee -a $RECENT_REPORT &&
+echo -e "Generated coverage report for recently updated files: $RECENT_REPORT\n"
+
+# Unless otherwise specified, we'll not generate html report by default
+if [ -z "$HTML" ]; then
+  exit 0
+fi
+
+# Generate the html report. If we cannot find lcov in this machine, we'll simply
+# skip this step.
+echo "Generating the html coverage report..."
+
+LCOV=$(which lcov || true 2>/dev/null)
+if [ -z $LCOV ]
+then
+  echo "Skip: Cannot find lcov to generate the html report."
+  exit 0
+fi
+
+LCOV_VERSION=$(lcov -v | grep 1.1 || true)
+if [ $LCOV_VERSION ]
+then
+  echo "Not supported lcov version. Expect lcov 1.1."
+  exit 0
+fi
+
+(cd $ROOT; lcov --no-external \
+     --capture  \
+     --directory $PWD \
+     --gcov-tool $GCOV \
+     --output-file $COVERAGE_DIR/coverage.info)
+
+genhtml $COVERAGE_DIR/coverage.info -o $COVERAGE_DIR
+
+echo "HTML Coverage report is generated in $COVERAGE_DIR"
--- a/coverage/parse_gcov_output.py
+++ b/coverage/parse_gcov_output.py
@@ -0,0 +1,118 @@
+import optparse
+import re
+import sys
+
+from optparse import OptionParser
+
+# the gcov report follows certain pattern. Each file will have two lines
+# of report, from which we can extract the file name, total lines and coverage
+# percentage.
+def parse_gcov_report(gcov_input):
+    per_file_coverage = {}
+    total_coverage = None
+
+    for line in sys.stdin:
+        line = line.strip()
+
+        # --First line of the coverage report (with file name in it)?
+        match_obj = re.match("^File '(.*)'$", line)
+        if match_obj:
+            # fetch the file name from the first line of the report.
+            current_file = match_obj.group(1)
+            continue
+
+        # -- Second line of the file report (with coverage percentage)
+        match_obj = re.match("^Lines executed:(.*)% of (.*)", line)
+
+        if match_obj:
+            coverage = float(match_obj.group(1))
+            lines = int(match_obj.group(2))
+
+            if current_file is not None:
+                per_file_coverage[current_file] = (coverage, lines)
+                current_file = None
+            else:
+                # If current_file is not set, we reach the last line of report,
+                # which contains the summarized coverage percentage.
+                total_coverage = (coverage, lines)
+            continue
+
+        # If the line's pattern doesn't fall into the above categories. We
+        # can simply ignore them since they're either empty line or doesn't
+        # find executable lines of the given file.
+        current_file = None
+
+    return per_file_coverage, total_coverage
+
+def get_option_parser():
+    usage = "Parse the gcov output and generate more human-readable code " +\
+            "coverage report."
+    parser = OptionParser(usage)
+
+    parser.add_option(
+        "--interested-files", "-i",
+        dest="filenames",
+        help="Comma separated files names. if specified, we will display " +
+             "the coverage report only for interested source files. " +
+             "Otherwise we will display the coverage report for all " +
+             "source files."
+    )
+    return parser
+
+def display_file_coverage(per_file_coverage, total_coverage):
+    # To print out auto-adjustable column, we need to know the longest
+    # length of file names.
+    max_file_name_length = max(
+        len(fname) for fname in per_file_coverage.keys()
+    )
+
+    # -- Print header
+    # size of separator is determined by 3 column sizes:
+    # file name, coverage percentage and lines.
+    header_template = \
+        "%" + str(max_file_name_length) + "s\t%s\t%s"
+    separator = "-" * (max_file_name_length + 10 + 20)
+    print header_template % ("Filename", "Coverage", "Lines")
+    print separator
+
+    # -- Print body
+    # template for printing coverage report for each file.
+    record_template = "%" + str(max_file_name_length) + "s\t%5.2f%%\t%10d"
+
+    for fname, coverage_info in per_file_coverage.items():
+        coverage, lines = coverage_info
+        print record_template % (fname, coverage, lines)
+
+    # -- Print footer
+    if total_coverage:
+        print separator
+        print record_template % ("Total", total_coverage[0], total_coverage[1])
+
+def report_coverage():
+    parser = get_option_parser()
+    (options, args) = parser.parse_args()
+
+    interested_files = set()
+    if options.filenames is not None:
+        interested_files = set(f.strip() for f in options.filenames.split(','))
+
+    # To make things simple, right now we only read gcov report from the input
+    per_file_coverage, total_coverage = parse_gcov_report(sys.stdin)
+
+    # Check if we need to display coverage info for interested files.
+    if len(interested_files):
+        per_file_coverage = dict(
+            (fname, per_file_coverage[fname]) for fname in interested_files
+            if fname in per_file_coverage
+        )
+        # If we only interested in several files, it makes no sense to report
+        # the total_coverage
+        total_coverage = None
+
+    if not len(per_file_coverage):
+        print >> sys.stderr, "Cannot find coverage info for the given files."
+        return
+    display_file_coverage(per_file_coverage, total_coverage)
+
+if __name__ == "__main__":
+    report_coverage()
--- a/db/builder.cc
+++ b/db/builder.cc
@@ -0,0 +1,224 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/builder.h"
+
+#include "db/dbformat.h"
+#include "db/filename.h"
+#include "db/merge_helper.h"
+#include "db/table_cache.h"
+#include "db/version_edit.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "table/block_based_table_builder.h"
+#include "util/stop_watch.h"
+
+namespace rocksdb {
+
+class TableFactory;
+
+TableBuilder* NewTableBuilder(const Options& options,
+                              const InternalKeyComparator& internal_comparator,
+                              WritableFile* file,
+                              CompressionType compression_type) {
+  return options.table_factory->NewTableBuilder(options, internal_comparator,
+                                                file, compression_type);
+}
+
+Status BuildTable(const std::string& dbname, Env* env, const Options& options,
+                  const EnvOptions& soptions, TableCache* table_cache,
+                  Iterator* iter, FileMetaData* meta,
+                  const InternalKeyComparator& internal_comparator,
+                  const SequenceNumber newest_snapshot,
+                  const SequenceNumber earliest_seqno_in_memtable,
+                  const CompressionType compression) {
+  Status s;
+  meta->file_size = 0;
+  meta->smallest_seqno = meta->largest_seqno = 0;
+  iter->SeekToFirst();
+
+  // If the sequence number of the smallest entry in the memtable is
+  // smaller than the most recent snapshot, then we do not trigger
+  // removal of duplicate/deleted keys as part of this builder.
+  bool purge = options.purge_redundant_kvs_while_flush;
+  if (earliest_seqno_in_memtable <= newest_snapshot) {
+    purge = false;
+  }
+
+  std::string fname = TableFileName(dbname, meta->number);
+  if (iter->Valid()) {
+    unique_ptr<WritableFile> file;
+    s = env->NewWritableFile(fname, &file, soptions);
+    if (!s.ok()) {
+      return s;
+    }
+
+    TableBuilder* builder =
+        NewTableBuilder(options, internal_comparator, file.get(), compression);
+
+    // the first key is the smallest key
+    Slice key = iter->key();
+    meta->smallest.DecodeFrom(key);
+    meta->smallest_seqno = GetInternalKeySeqno(key);
+    meta->largest_seqno = meta->smallest_seqno;
+
+    MergeHelper merge(internal_comparator.user_comparator(),
+                      options.merge_operator.get(), options.info_log.get(),
+                      options.min_partial_merge_operands,
+                      true /* internal key corruption is not ok */);
+
+    if (purge) {
+      // Ugly walkaround to avoid compiler error for release build
+      bool ok __attribute__((unused)) = true;
+
+      // Will write to builder if current key != prev key
+      ParsedInternalKey prev_ikey;
+      std::string prev_key;
+      bool is_first_key = true;    // Also write if this is the very first key
+
+      while (iter->Valid()) {
+        bool iterator_at_next = false;
+
+        // Get current key
+        ParsedInternalKey this_ikey;
+        Slice key = iter->key();
+        Slice value = iter->value();
+
+        // In-memory key corruption is not ok;
+        // TODO: find a clean way to treat in memory key corruption
+        ok = ParseInternalKey(key, &this_ikey);
+        assert(ok);
+        assert(this_ikey.sequence >= earliest_seqno_in_memtable);
+
+        // If the key is the same as the previous key (and it is not the
+        // first key), then we skip it, since it is an older version.
+        // Otherwise we output the key and mark it as the "new" previous key.
+        if (!is_first_key && !internal_comparator.user_comparator()->Compare(
+                                  prev_ikey.user_key, this_ikey.user_key)) {
+          // seqno within the same key are in decreasing order
+          assert(this_ikey.sequence < prev_ikey.sequence);
+        } else {
+          is_first_key = false;
+
+          if (this_ikey.type == kTypeMerge) {
+            // Handle merge-type keys using the MergeHelper
+            // TODO: pass statistics to MergeUntil
+            merge.MergeUntil(iter, 0 /* don't worry about snapshot */);
+            iterator_at_next = true;
+            if (merge.IsSuccess()) {
+              // Merge completed correctly.
+              // Add the resulting merge key/value and continue to next
+              builder->Add(merge.key(), merge.value());
+              prev_key.assign(merge.key().data(), merge.key().size());
+              ok = ParseInternalKey(Slice(prev_key), &prev_ikey);
+              assert(ok);
+            } else {
+              // Merge did not find a Put/Delete.
+              // Can not compact these merges into a kValueType.
+              // Write them out one-by-one. (Proceed back() to front())
+              const std::deque<std::string>& keys = merge.keys();
+              const std::deque<std::string>& values = merge.values();
+              assert(keys.size() == values.size() && keys.size() >= 1);
+              std::deque<std::string>::const_reverse_iterator key_iter;
+              std::deque<std::string>::const_reverse_iterator value_iter;
+              for (key_iter=keys.rbegin(), value_iter = values.rbegin();
+                   key_iter != keys.rend() && value_iter != values.rend();
+                   ++key_iter, ++value_iter) {
+
+                builder->Add(Slice(*key_iter), Slice(*value_iter));
+              }
+
+              // Sanity check. Both iterators should end at the same time
+              assert(key_iter == keys.rend() && value_iter == values.rend());
+
+              prev_key.assign(keys.front());
+              ok = ParseInternalKey(Slice(prev_key), &prev_ikey);
+              assert(ok);
+            }
+          } else {
+            // Handle Put/Delete-type keys by simply writing them
+            builder->Add(key, value);
+            prev_key.assign(key.data(), key.size());
+            ok = ParseInternalKey(Slice(prev_key), &prev_ikey);
+            assert(ok);
+          }
+        }
+
+        if (!iterator_at_next) iter->Next();
+      }
+
+      // The last key is the largest key
+      meta->largest.DecodeFrom(Slice(prev_key));
+      SequenceNumber seqno = GetInternalKeySeqno(Slice(prev_key));
+      meta->smallest_seqno = std::min(meta->smallest_seqno, seqno);
+      meta->largest_seqno = std::max(meta->largest_seqno, seqno);
+
+    } else {
+      for (; iter->Valid(); iter->Next()) {
+        Slice key = iter->key();
+        meta->largest.DecodeFrom(key);
+        builder->Add(key, iter->value());
+        SequenceNumber seqno = GetInternalKeySeqno(key);
+        meta->smallest_seqno = std::min(meta->smallest_seqno, seqno);
+        meta->largest_seqno = std::max(meta->largest_seqno, seqno);
+      }
+    }
+
+    // Finish and check for builder errors
+    if (s.ok()) {
+      s = builder->Finish();
+      if (s.ok()) {
+        meta->file_size = builder->FileSize();
+        assert(meta->file_size > 0);
+      }
+    } else {
+      builder->Abandon();
+    }
+    delete builder;
+
+    // Finish and check for file errors
+    if (s.ok() && !options.disableDataSync) {
+      if (options.use_fsync) {
+        StopWatch sw(env, options.statistics.get(), TABLE_SYNC_MICROS);
+        s = file->Fsync();
+      } else {
+        StopWatch sw(env, options.statistics.get(), TABLE_SYNC_MICROS);
+        s = file->Sync();
+      }
+    }
+    if (s.ok()) {
+      s = file->Close();
+    }
+
+    if (s.ok()) {
+      // Verify that the table is usable
+      Iterator* it = table_cache->NewIterator(ReadOptions(), soptions,
+                                              internal_comparator, *meta);
+      s = it->status();
+      delete it;
+    }
+  }
+
+  // Check for input iterator errors
+  if (!iter->status().ok()) {
+    s = iter->status();
+  }
+
+  if (s.ok() && meta->file_size > 0) {
+    // Keep it
+  } else {
+    env->DeleteFile(fname);
+  }
+  return s;
+}
+
+}  // namespace rocksdb
--- a/db/builder.h
+++ b/db/builder.h
@@ -0,0 +1,45 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+#include "rocksdb/comparator.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/options.h"
+
+namespace rocksdb {
+
+struct Options;
+struct FileMetaData;
+
+class Env;
+struct EnvOptions;
+class Iterator;
+class TableCache;
+class VersionEdit;
+class TableBuilder;
+class WritableFile;
+
+extern TableBuilder* NewTableBuilder(
+    const Options& options, const InternalKeyComparator& internal_comparator,
+    WritableFile* file, CompressionType compression_type);
+
+// Build a Table file from the contents of *iter.  The generated file
+// will be named according to meta->number.  On success, the rest of
+// *meta will be filled with metadata about the generated table.
+// If no data is present in *iter, meta->file_size will be set to
+// zero, and no Table file will be produced.
+extern Status BuildTable(const std::string& dbname, Env* env,
+                         const Options& options, const EnvOptions& soptions,
+                         TableCache* table_cache, Iterator* iter,
+                         FileMetaData* meta,
+                         const InternalKeyComparator& internal_comparator,
+                         const SequenceNumber newest_snapshot,
+                         const SequenceNumber earliest_seqno_in_memtable,
+                         const CompressionType compression);
+
+}  // namespace rocksdb
--- a/db/c.cc
+++ b/db/c.cc
--- a/db/c_test.c
+++ b/db/c_test.c
@@ -0,0 +1,494 @@
+/* Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+   Use of this source code is governed by a BSD-style license that can be
+   found in the LICENSE file. See the AUTHORS file for names of contributors. */
+
+#include "rocksdb/c.h"
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+const char* phase = "";
+static char dbname[200];
+
+static void StartPhase(const char* name) {
+  fprintf(stderr, "=== Test %s\n", name);
+  phase = name;
+}
+
+static const char* GetTempDir(void) {
+    const char* ret = getenv("TEST_TMPDIR");
+    if (ret == NULL || ret[0] == '\0')
+        ret = "/tmp";
+    return ret;
+}
+
+#define CheckNoError(err)                                               \
+  if ((err) != NULL) {                                                  \
+    fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, (err)); \
+    abort();                                                            \
+  }
+
+#define CheckCondition(cond)                                            \
+  if (!(cond)) {                                                        \
+    fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, #cond); \
+    abort();                                                            \
+  }
+
+static void CheckEqual(const char* expected, const char* v, size_t n) {
+  if (expected == NULL && v == NULL) {
+    // ok
+  } else if (expected != NULL && v != NULL && n == strlen(expected) &&
+             memcmp(expected, v, n) == 0) {
+    // ok
+    return;
+  } else {
+    fprintf(stderr, "%s: expected '%s', got '%s'\n",
+            phase,
+            (expected ? expected : "(null)"),
+            (v ? v : "(null"));
+    abort();
+  }
+}
+
+static void Free(char** ptr) {
+  if (*ptr) {
+    free(*ptr);
+    *ptr = NULL;
+  }
+}
+
+static void CheckGet(
+    rocksdb_t* db,
+    const rocksdb_readoptions_t* options,
+    const char* key,
+    const char* expected) {
+  char* err = NULL;
+  size_t val_len;
+  char* val;
+  val = rocksdb_get(db, options, key, strlen(key), &val_len, &err);
+  CheckNoError(err);
+  CheckEqual(expected, val, val_len);
+  Free(&val);
+}
+
+static void CheckIter(rocksdb_iterator_t* iter,
+                      const char* key, const char* val) {
+  size_t len;
+  const char* str;
+  str = rocksdb_iter_key(iter, &len);
+  CheckEqual(key, str, len);
+  str = rocksdb_iter_value(iter, &len);
+  CheckEqual(val, str, len);
+}
+
+// Callback from rocksdb_writebatch_iterate()
+static void CheckPut(void* ptr,
+                     const char* k, size_t klen,
+                     const char* v, size_t vlen) {
+  int* state = (int*) ptr;
+  CheckCondition(*state < 2);
+  switch (*state) {
+    case 0:
+      CheckEqual("bar", k, klen);
+      CheckEqual("b", v, vlen);
+      break;
+    case 1:
+      CheckEqual("box", k, klen);
+      CheckEqual("c", v, vlen);
+      break;
+  }
+  (*state)++;
+}
+
+// Callback from rocksdb_writebatch_iterate()
+static void CheckDel(void* ptr, const char* k, size_t klen) {
+  int* state = (int*) ptr;
+  CheckCondition(*state == 2);
+  CheckEqual("bar", k, klen);
+  (*state)++;
+}
+
+static void CmpDestroy(void* arg) { }
+
+static int CmpCompare(void* arg, const char* a, size_t alen,
+                      const char* b, size_t blen) {
+  int n = (alen < blen) ? alen : blen;
+  int r = memcmp(a, b, n);
+  if (r == 0) {
+    if (alen < blen) r = -1;
+    else if (alen > blen) r = +1;
+  }
+  return r;
+}
+
+static const char* CmpName(void* arg) {
+  return "foo";
+}
+
+// Custom filter policy
+static unsigned char fake_filter_result = 1;
+static void FilterDestroy(void* arg) { }
+static const char* FilterName(void* arg) {
+  return "TestFilter";
+}
+static char* FilterCreate(
+    void* arg,
+    const char* const* key_array, const size_t* key_length_array,
+    int num_keys,
+    size_t* filter_length) {
+  *filter_length = 4;
+  char* result = malloc(4);
+  memcpy(result, "fake", 4);
+  return result;
+}
+static unsigned char FilterKeyMatch(
+    void* arg,
+    const char* key, size_t length,
+    const char* filter, size_t filter_length) {
+  CheckCondition(filter_length == 4);
+  CheckCondition(memcmp(filter, "fake", 4) == 0);
+  return fake_filter_result;
+}
+
+// Custom merge operator
+static void MergeOperatorDestroy(void* arg) { }
+static const char* MergeOperatorName(void* arg) {
+  return "TestMergeOperator";
+}
+static char* MergeOperatorFullMerge(
+    void* arg,
+    const char* key, size_t key_length,
+    const char* existing_value, size_t existing_value_length,
+    const char* const* operands_list, const size_t* operands_list_length,
+    int num_operands,
+    unsigned char* success, size_t* new_value_length) {
+  *new_value_length = 4;
+  *success = 1;
+  char* result = malloc(4);
+  memcpy(result, "fake", 4);
+  return result;
+}
+static char* MergeOperatorPartialMerge(
+    void* arg,
+    const char* key, size_t key_length,
+    const char* const* operands_list, const size_t* operands_list_length,
+    int num_operands,
+    unsigned char* success, size_t* new_value_length) {
+  *new_value_length = 4;
+  *success = 1;
+  char* result = malloc(4);
+  memcpy(result, "fake", 4);
+  return result;
+}
+
+int main(int argc, char** argv) {
+  rocksdb_t* db;
+  rocksdb_comparator_t* cmp;
+  rocksdb_cache_t* cache;
+  rocksdb_env_t* env;
+  rocksdb_options_t* options;
+  rocksdb_readoptions_t* roptions;
+  rocksdb_writeoptions_t* woptions;
+  char* err = NULL;
+  int run = -1;
+
+  snprintf(dbname, sizeof(dbname),
+           "%s/rocksdb_c_test-%d",
+           GetTempDir(),
+           ((int) geteuid()));
+
+  StartPhase("create_objects");
+  cmp = rocksdb_comparator_create(NULL, CmpDestroy, CmpCompare, CmpName);
+  env = rocksdb_create_default_env();
+  cache = rocksdb_cache_create_lru(100000);
+
+  options = rocksdb_options_create();
+  rocksdb_options_set_comparator(options, cmp);
+  rocksdb_options_set_error_if_exists(options, 1);
+  rocksdb_options_set_cache(options, cache);
+  rocksdb_options_set_env(options, env);
+  rocksdb_options_set_info_log(options, NULL);
+  rocksdb_options_set_write_buffer_size(options, 100000);
+  rocksdb_options_set_paranoid_checks(options, 1);
+  rocksdb_options_set_max_open_files(options, 10);
+  rocksdb_options_set_block_size(options, 1024);
+  rocksdb_options_set_block_restart_interval(options, 8);
+  rocksdb_options_set_compression(options, rocksdb_no_compression);
+  rocksdb_options_set_compression_options(options, -14, -1, 0);
+  int compression_levels[] = {rocksdb_no_compression, rocksdb_no_compression,
+                              rocksdb_no_compression, rocksdb_no_compression};
+  rocksdb_options_set_compression_per_level(options, compression_levels, 4);
+
+  roptions = rocksdb_readoptions_create();
+  rocksdb_readoptions_set_verify_checksums(roptions, 1);
+  rocksdb_readoptions_set_fill_cache(roptions, 0);
+
+  woptions = rocksdb_writeoptions_create();
+  rocksdb_writeoptions_set_sync(woptions, 1);
+
+  StartPhase("destroy");
+  rocksdb_destroy_db(options, dbname, &err);
+  Free(&err);
+
+  StartPhase("open_error");
+  db = rocksdb_open(options, dbname, &err);
+  CheckCondition(err != NULL);
+  Free(&err);
+
+  StartPhase("open");
+  rocksdb_options_set_create_if_missing(options, 1);
+  db = rocksdb_open(options, dbname, &err);
+  CheckNoError(err);
+  CheckGet(db, roptions, "foo", NULL);
+
+  StartPhase("put");
+  rocksdb_put(db, woptions, "foo", 3, "hello", 5, &err);
+  CheckNoError(err);
+  CheckGet(db, roptions, "foo", "hello");
+
+  StartPhase("compactall");
+  rocksdb_compact_range(db, NULL, 0, NULL, 0);
+  CheckGet(db, roptions, "foo", "hello");
+
+  StartPhase("compactrange");
+  rocksdb_compact_range(db, "a", 1, "z", 1);
+  CheckGet(db, roptions, "foo", "hello");
+
+  StartPhase("writebatch");
+  {
+    rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+    rocksdb_writebatch_put(wb, "foo", 3, "a", 1);
+    rocksdb_writebatch_clear(wb);
+    rocksdb_writebatch_put(wb, "bar", 3, "b", 1);
+    rocksdb_writebatch_put(wb, "box", 3, "c", 1);
+    rocksdb_writebatch_delete(wb, "bar", 3);
+    rocksdb_write(db, woptions, wb, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "foo", "hello");
+    CheckGet(db, roptions, "bar", NULL);
+    CheckGet(db, roptions, "box", "c");
+    int pos = 0;
+    rocksdb_writebatch_iterate(wb, &pos, CheckPut, CheckDel);
+    CheckCondition(pos == 3);
+    rocksdb_writebatch_destroy(wb);
+  }
+
+  StartPhase("iter");
+  {
+    rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
+    CheckCondition(!rocksdb_iter_valid(iter));
+    rocksdb_iter_seek_to_first(iter);
+    CheckCondition(rocksdb_iter_valid(iter));
+    CheckIter(iter, "box", "c");
+    rocksdb_iter_next(iter);
+    CheckIter(iter, "foo", "hello");
+    rocksdb_iter_prev(iter);
+    CheckIter(iter, "box", "c");
+    rocksdb_iter_prev(iter);
+    CheckCondition(!rocksdb_iter_valid(iter));
+    rocksdb_iter_seek_to_last(iter);
+    CheckIter(iter, "foo", "hello");
+    rocksdb_iter_seek(iter, "b", 1);
+    CheckIter(iter, "box", "c");
+    rocksdb_iter_get_error(iter, &err);
+    CheckNoError(err);
+    rocksdb_iter_destroy(iter);
+  }
+
+  StartPhase("approximate_sizes");
+  {
+    int i;
+    int n = 20000;
+    char keybuf[100];
+    char valbuf[100];
+    uint64_t sizes[2];
+    const char* start[2] = { "a", "k00000000000000010000" };
+    size_t start_len[2] = { 1, 21 };
+    const char* limit[2] = { "k00000000000000010000", "z" };
+    size_t limit_len[2] = { 21, 1 };
+    rocksdb_writeoptions_set_sync(woptions, 0);
+    for (i = 0; i < n; i++) {
+      snprintf(keybuf, sizeof(keybuf), "k%020d", i);
+      snprintf(valbuf, sizeof(valbuf), "v%020d", i);
+      rocksdb_put(db, woptions, keybuf, strlen(keybuf), valbuf, strlen(valbuf),
+                  &err);
+      CheckNoError(err);
+    }
+    rocksdb_approximate_sizes(db, 2, start, start_len, limit, limit_len, sizes);
+    CheckCondition(sizes[0] > 0);
+    CheckCondition(sizes[1] > 0);
+  }
+
+  StartPhase("property");
+  {
+    char* prop = rocksdb_property_value(db, "nosuchprop");
+    CheckCondition(prop == NULL);
+    prop = rocksdb_property_value(db, "rocksdb.stats");
+    CheckCondition(prop != NULL);
+    Free(&prop);
+  }
+
+  StartPhase("snapshot");
+  {
+    const rocksdb_snapshot_t* snap;
+    snap = rocksdb_create_snapshot(db);
+    rocksdb_delete(db, woptions, "foo", 3, &err);
+    CheckNoError(err);
+    rocksdb_readoptions_set_snapshot(roptions, snap);
+    CheckGet(db, roptions, "foo", "hello");
+    rocksdb_readoptions_set_snapshot(roptions, NULL);
+    CheckGet(db, roptions, "foo", NULL);
+    rocksdb_release_snapshot(db, snap);
+  }
+
+  StartPhase("repair");
+  {
+    // If we do not compact here, then the lazy deletion of
+    // files (https://reviews.facebook.net/D6123) would leave
+    // around deleted files and the repair process will find
+    // those files and put them back into the database.
+    rocksdb_compact_range(db, NULL, 0, NULL, 0);
+    rocksdb_close(db);
+    rocksdb_options_set_create_if_missing(options, 0);
+    rocksdb_options_set_error_if_exists(options, 0);
+    rocksdb_repair_db(options, dbname, &err);
+    CheckNoError(err);
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "foo", NULL);
+    CheckGet(db, roptions, "bar", NULL);
+    CheckGet(db, roptions, "box", "c");
+    rocksdb_options_set_create_if_missing(options, 1);
+    rocksdb_options_set_error_if_exists(options, 1);
+  }
+
+  StartPhase("filter");
+  for (run = 0; run < 2; run++) {
+    // First run uses custom filter, second run uses bloom filter
+    CheckNoError(err);
+    rocksdb_filterpolicy_t* policy;
+    if (run == 0) {
+      policy = rocksdb_filterpolicy_create(
+          NULL, FilterDestroy, FilterCreate, FilterKeyMatch, NULL, FilterName);
+    } else {
+      policy = rocksdb_filterpolicy_create_bloom(10);
+    }
+
+    // Create new database
+    rocksdb_close(db);
+    rocksdb_destroy_db(options, dbname, &err);
+    rocksdb_options_set_filter_policy(options, policy);
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "bar", 3, "barvalue", 8, &err);
+    CheckNoError(err);
+    rocksdb_compact_range(db, NULL, 0, NULL, 0);
+
+    fake_filter_result = 1;
+    CheckGet(db, roptions, "foo", "foovalue");
+    CheckGet(db, roptions, "bar", "barvalue");
+    if (phase == 0) {
+      // Must not find value when custom filter returns false
+      fake_filter_result = 0;
+      CheckGet(db, roptions, "foo", NULL);
+      CheckGet(db, roptions, "bar", NULL);
+      fake_filter_result = 1;
+
+      CheckGet(db, roptions, "foo", "foovalue");
+      CheckGet(db, roptions, "bar", "barvalue");
+    }
+    rocksdb_options_set_filter_policy(options, NULL);
+    rocksdb_filterpolicy_destroy(policy);
+  }
+
+  StartPhase("merge_operator");
+  {
+    rocksdb_mergeoperator_t* merge_operator;
+    merge_operator = rocksdb_mergeoperator_create(
+        NULL, MergeOperatorDestroy, MergeOperatorFullMerge,
+        MergeOperatorPartialMerge, NULL, MergeOperatorName);
+    // Create new database
+    rocksdb_close(db);
+    rocksdb_destroy_db(options, dbname, &err);
+    rocksdb_options_set_merge_operator(options, merge_operator);
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "foo", "foovalue");
+    rocksdb_merge(db, woptions, "foo", 3, "barvalue", 8, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "foo", "fake");
+
+    // Merge of a non-existing value
+    rocksdb_merge(db, woptions, "bar", 3, "barvalue", 8, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "bar", "fake");
+
+  }
+
+  StartPhase("prefix");
+  {
+    // Create new database
+    rocksdb_close(db);
+    rocksdb_destroy_db(options, dbname, &err);
+
+    rocksdb_filterpolicy_t* policy = rocksdb_filterpolicy_create_bloom(10);
+    rocksdb_options_set_filter_policy(options, policy);
+    rocksdb_options_set_prefix_extractor(options, rocksdb_slicetransform_create_fixed_prefix(3));
+    rocksdb_options_set_hash_skip_list_rep(options, 50000, 4, 4);
+    rocksdb_options_set_plain_table_factory(options, 4, 10, 0.75, 16);
+
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+
+    rocksdb_put(db, woptions, "foo1", 4, "foo", 3, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "foo2", 4, "foo", 3, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "foo3", 4, "foo", 3, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "bar1", 4, "bar", 3, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "bar2", 4, "bar", 3, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "bar3", 4, "bar", 3, &err);
+    CheckNoError(err);
+
+    rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
+    CheckCondition(!rocksdb_iter_valid(iter));
+
+    rocksdb_iter_seek(iter, "bar", 3);
+    rocksdb_iter_get_error(iter, &err);
+    CheckNoError(err);
+    CheckCondition(rocksdb_iter_valid(iter));
+
+    CheckIter(iter, "bar1", "bar");
+    rocksdb_iter_next(iter);
+    CheckIter(iter, "bar2", "bar");
+    rocksdb_iter_next(iter);
+    CheckIter(iter, "bar3", "bar");
+    rocksdb_iter_get_error(iter, &err);
+    CheckNoError(err);
+    rocksdb_iter_destroy(iter);
+    rocksdb_filterpolicy_destroy(policy);
+  }
+
+  StartPhase("cleanup");
+  rocksdb_close(db);
+  rocksdb_options_destroy(options);
+  rocksdb_readoptions_destroy(roptions);
+  rocksdb_writeoptions_destroy(woptions);
+  rocksdb_cache_destroy(cache);
+  rocksdb_comparator_destroy(cmp);
+  rocksdb_env_destroy(env);
+
+  fprintf(stderr, "PASS\n");
+  return 0;
+}
--- a/db/column_family.cc
+++ b/db/column_family.cc
@@ -0,0 +1,604 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/column_family.h"
+
+#include <vector>
+#include <string>
+#include <algorithm>
+#include <limits>
+
+#include "db/db_impl.h"
+#include "db/version_set.h"
+#include "db/internal_stats.h"
+#include "db/compaction_picker.h"
+#include "db/table_properties_collector.h"
+#include "util/autovector.h"
+#include "util/hash_skiplist_rep.h"
+
+namespace rocksdb {
+
+ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(ColumnFamilyData* cfd,
+                                               DBImpl* db, port::Mutex* mutex)
+    : cfd_(cfd), db_(db), mutex_(mutex) {
+  if (cfd_ != nullptr) {
+    cfd_->Ref();
+  }
+}
+
+ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() {
+  if (cfd_ != nullptr) {
+    DBImpl::DeletionState deletion_state;
+    mutex_->Lock();
+    if (cfd_->Unref()) {
+      delete cfd_;
+    }
+    db_->FindObsoleteFiles(deletion_state, false, true);
+    mutex_->Unlock();
+    if (deletion_state.HaveSomethingToDelete()) {
+      db_->PurgeObsoleteFiles(deletion_state);
+    }
+  }
+}
+
+uint32_t ColumnFamilyHandleImpl::GetID() const { return cfd()->GetID(); }
+
+namespace {
+// Fix user-supplied options to be reasonable
+template <class T, class V>
+static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
+  if (static_cast<V>(*ptr) > maxvalue) *ptr = maxvalue;
+  if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue;
+}
+}  // anonymous namespace
+
+ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp,
+                                    const InternalFilterPolicy* ipolicy,
+                                    const ColumnFamilyOptions& src) {
+  ColumnFamilyOptions result = src;
+  result.comparator = icmp;
+  result.filter_policy = (src.filter_policy != nullptr) ? ipolicy : nullptr;
+#ifdef OS_MACOSX
+  // TODO(icanadi) make write_buffer_size uint64_t instead of size_t
+  ClipToRange(&result.write_buffer_size, ((size_t)64) << 10, ((size_t)1) << 30);
+#else
+  ClipToRange(&result.write_buffer_size,
+              ((size_t)64) << 10, ((size_t)64) << 30);
+#endif
+  // if user sets arena_block_size, we trust user to use this value. Otherwise,
+  // calculate a proper value from writer_buffer_size;
+  if (result.arena_block_size <= 0) {
+    result.arena_block_size = result.write_buffer_size / 10;
+  }
+  result.min_write_buffer_number_to_merge =
+      std::min(result.min_write_buffer_number_to_merge,
+               result.max_write_buffer_number - 1);
+  if (result.block_cache == nullptr && !result.no_block_cache) {
+    result.block_cache = NewLRUCache(8 << 20);
+  }
+  result.compression_per_level = src.compression_per_level;
+  if (result.block_size_deviation < 0 || result.block_size_deviation > 100) {
+    result.block_size_deviation = 0;
+  }
+  if (result.max_mem_compaction_level >= result.num_levels) {
+    result.max_mem_compaction_level = result.num_levels - 1;
+  }
+  if (result.soft_rate_limit > result.hard_rate_limit) {
+    result.soft_rate_limit = result.hard_rate_limit;
+  }
+  if (!result.prefix_extractor) {
+    assert(result.memtable_factory);
+    Slice name = result.memtable_factory->Name();
+    if (name.compare("HashSkipListRepFactory") == 0 ||
+        name.compare("HashLinkListRepFactory") == 0) {
+      result.memtable_factory = std::make_shared<SkipListFactory>();
+    }
+  }
+
+  // -- Sanitize the table properties collector
+  // All user defined properties collectors will be wrapped by
+  // UserKeyTablePropertiesCollector since for them they only have the
+  // knowledge of the user keys; internal keys are invisible to them.
+  auto& collector_factories = result.table_properties_collector_factories;
+  for (size_t i = 0; i < result.table_properties_collector_factories.size();
+       ++i) {
+    assert(collector_factories[i]);
+    collector_factories[i] =
+        std::make_shared<UserKeyTablePropertiesCollectorFactory>(
+            collector_factories[i]);
+  }
+  // Add collector to collect internal key statistics
+  collector_factories.push_back(
+      std::make_shared<InternalKeyPropertiesCollectorFactory>());
+
+  if (result.compaction_style == kCompactionStyleFIFO) {
+    result.num_levels = 1;
+    // since we delete level0 files in FIFO compaction when there are too many
+    // of them, these options don't really mean anything
+    result.level0_file_num_compaction_trigger = std::numeric_limits<int>::max();
+    result.level0_slowdown_writes_trigger = std::numeric_limits<int>::max();
+    result.level0_stop_writes_trigger = std::numeric_limits<int>::max();
+  }
+
+  return result;
+}
+
+int SuperVersion::dummy = 0;
+void* const SuperVersion::kSVInUse = &SuperVersion::dummy;
+void* const SuperVersion::kSVObsolete = nullptr;
+
+SuperVersion::~SuperVersion() {
+  for (auto td : to_delete) {
+    delete td;
+  }
+}
+
+SuperVersion* SuperVersion::Ref() {
+  refs.fetch_add(1, std::memory_order_relaxed);
+  return this;
+}
+
+bool SuperVersion::Unref() {
+  // fetch_sub returns the previous value of ref
+  uint32_t previous_refs = refs.fetch_sub(1, std::memory_order_relaxed);
+  assert(previous_refs > 0);
+  return previous_refs == 1;
+}
+
+void SuperVersion::Cleanup() {
+  assert(refs.load(std::memory_order_relaxed) == 0);
+  imm->Unref(&to_delete);
+  MemTable* m = mem->Unref();
+  if (m != nullptr) {
+    to_delete.push_back(m);
+  }
+  current->Unref();
+}
+
+void SuperVersion::Init(MemTable* new_mem, MemTableListVersion* new_imm,
+                        Version* new_current) {
+  mem = new_mem;
+  imm = new_imm;
+  current = new_current;
+  mem->Ref();
+  imm->Ref();
+  current->Ref();
+  refs.store(1, std::memory_order_relaxed);
+}
+
+namespace {
+void SuperVersionUnrefHandle(void* ptr) {
+  // UnrefHandle is called when a thread exists or a ThreadLocalPtr gets
+  // destroyed. When former happens, the thread shouldn't see kSVInUse.
+  // When latter happens, we are in ~ColumnFamilyData(), no get should happen as
+  // well.
+  SuperVersion* sv = static_cast<SuperVersion*>(ptr);
+  if (sv->Unref()) {
+    sv->db_mutex->Lock();
+    sv->Cleanup();
+    sv->db_mutex->Unlock();
+    delete sv;
+  }
+}
+}  // anonymous namespace
+
+ColumnFamilyData::ColumnFamilyData(const std::string& dbname, uint32_t id,
+                                   const std::string& name,
+                                   Version* dummy_versions, Cache* table_cache,
+                                   const ColumnFamilyOptions& options,
+                                   const DBOptions* db_options,
+                                   const EnvOptions& storage_options,
+                                   ColumnFamilySet* column_family_set)
+    : id_(id),
+      name_(name),
+      dummy_versions_(dummy_versions),
+      current_(nullptr),
+      refs_(0),
+      dropped_(false),
+      internal_comparator_(options.comparator),
+      internal_filter_policy_(options.filter_policy),
+      options_(*db_options, SanitizeOptions(&internal_comparator_,
+                                            &internal_filter_policy_, options)),
+      mem_(nullptr),
+      imm_(options_.min_write_buffer_number_to_merge),
+      super_version_(nullptr),
+      super_version_number_(0),
+      local_sv_(new ThreadLocalPtr(&SuperVersionUnrefHandle)),
+      next_(nullptr),
+      prev_(nullptr),
+      log_number_(0),
+      need_slowdown_for_num_level0_files_(false),
+      column_family_set_(column_family_set) {
+  Ref();
+
+  // if dummy_versions is nullptr, then this is a dummy column family.
+  if (dummy_versions != nullptr) {
+    internal_stats_.reset(new InternalStats(
+        options_.num_levels, db_options->env, db_options->statistics.get()));
+    table_cache_.reset(
+        new TableCache(dbname, &options_, storage_options, table_cache));
+    if (options_.compaction_style == kCompactionStyleUniversal) {
+      compaction_picker_.reset(
+          new UniversalCompactionPicker(&options_, &internal_comparator_));
+    } else if (options_.compaction_style == kCompactionStyleLevel) {
+      compaction_picker_.reset(
+          new LevelCompactionPicker(&options_, &internal_comparator_));
+    } else {
+      assert(options_.compaction_style == kCompactionStyleFIFO);
+      compaction_picker_.reset(
+          new FIFOCompactionPicker(&options_, &internal_comparator_));
+    }
+
+    Log(options_.info_log, "Options for column family \"%s\":\n",
+        name.c_str());
+    const ColumnFamilyOptions* cf_options = &options_;
+    cf_options->Dump(options_.info_log.get());
+  }
+}
+
+// DB mutex held
+ColumnFamilyData::~ColumnFamilyData() {
+  assert(refs_ == 0);
+  // remove from linked list
+  auto prev = prev_;
+  auto next = next_;
+  prev->next_ = next;
+  next->prev_ = prev;
+
+  // it's nullptr for dummy CFD
+  if (column_family_set_ != nullptr) {
+    // remove from column_family_set
+    column_family_set_->RemoveColumnFamily(this);
+  }
+
+  if (current_ != nullptr) {
+    current_->Unref();
+  }
+
+  if (super_version_ != nullptr) {
+    // Release SuperVersion reference kept in ThreadLocalPtr.
+    // This must be done outside of mutex_ since unref handler can lock mutex.
+    super_version_->db_mutex->Unlock();
+    local_sv_.reset();
+    super_version_->db_mutex->Lock();
+
+    bool is_last_reference __attribute__((unused));
+    is_last_reference = super_version_->Unref();
+    assert(is_last_reference);
+    super_version_->Cleanup();
+    delete super_version_;
+    super_version_ = nullptr;
+  }
+
+  if (dummy_versions_ != nullptr) {
+    // List must be empty
+    assert(dummy_versions_->next_ == dummy_versions_);
+    delete dummy_versions_;
+  }
+
+  if (mem_ != nullptr) {
+    delete mem_->Unref();
+  }
+  autovector<MemTable*> to_delete;
+  imm_.current()->Unref(&to_delete);
+  for (MemTable* m : to_delete) {
+    delete m;
+  }
+}
+
+const EnvOptions* ColumnFamilyData::soptions() const {
+  return &(column_family_set_->storage_options_);
+}
+
+void ColumnFamilyData::SetCurrent(Version* current) {
+  current_ = current;
+  need_slowdown_for_num_level0_files_ =
+      (options_.level0_slowdown_writes_trigger >= 0 &&
+       current_->NumLevelFiles(0) >= options_.level0_slowdown_writes_trigger);
+}
+
+void ColumnFamilyData::CreateNewMemtable() {
+  assert(current_ != nullptr);
+  if (mem_ != nullptr) {
+    delete mem_->Unref();
+  }
+  mem_ = new MemTable(internal_comparator_, options_);
+  mem_->Ref();
+}
+
+Compaction* ColumnFamilyData::PickCompaction(LogBuffer* log_buffer) {
+  return compaction_picker_->PickCompaction(current_, log_buffer);
+}
+
+Compaction* ColumnFamilyData::CompactRange(int input_level, int output_level,
+                                           const InternalKey* begin,
+                                           const InternalKey* end,
+                                           InternalKey** compaction_end) {
+  return compaction_picker_->CompactRange(current_, input_level, output_level,
+                                          begin, end, compaction_end);
+}
+
+SuperVersion* ColumnFamilyData::GetReferencedSuperVersion(
+    port::Mutex* db_mutex) {
+  SuperVersion* sv = nullptr;
+  if (LIKELY(column_family_set_->db_options_->allow_thread_local)) {
+    sv = GetThreadLocalSuperVersion(db_mutex);
+    sv->Ref();
+    if (!ReturnThreadLocalSuperVersion(sv)) {
+      sv->Unref();
+    }
+  } else {
+    db_mutex->Lock();
+    sv = super_version_->Ref();
+    db_mutex->Unlock();
+  }
+  return sv;
+}
+
+SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion(
+    port::Mutex* db_mutex) {
+  SuperVersion* sv = nullptr;
+  // The SuperVersion is cached in thread local storage to avoid acquiring
+  // mutex when SuperVersion does not change since the last use. When a new
+  // SuperVersion is installed, the compaction or flush thread cleans up
+  // cached SuperVersion in all existing thread local storage. To avoid
+  // acquiring mutex for this operation, we use atomic Swap() on the thread
+  // local pointer to guarantee exclusive access. If the thread local pointer
+  // is being used while a new SuperVersion is installed, the cached
+  // SuperVersion can become stale. In that case, the background thread would
+  // have swapped in kSVObsolete. We re-check the value at when returning
+  // SuperVersion back to thread local, with an atomic compare and swap.
+  // The superversion will need to be released if detected to be stale.
+  void* ptr = local_sv_->Swap(SuperVersion::kSVInUse);
+  // Invariant:
+  // (1) Scrape (always) installs kSVObsolete in ThreadLocal storage
+  // (2) the Swap above (always) installs kSVInUse, ThreadLocal storage
+  // should only keep kSVInUse before ReturnThreadLocalSuperVersion call
+  // (if no Scrape happens).
+  assert(ptr != SuperVersion::kSVInUse);
+  sv = static_cast<SuperVersion*>(ptr);
+  if (sv == SuperVersion::kSVObsolete ||
+      sv->version_number != super_version_number_.load()) {
+    RecordTick(options_.statistics.get(), NUMBER_SUPERVERSION_ACQUIRES);
+    SuperVersion* sv_to_delete = nullptr;
+
+    if (sv && sv->Unref()) {
+      RecordTick(options_.statistics.get(), NUMBER_SUPERVERSION_CLEANUPS);
+      db_mutex->Lock();
+      // NOTE: underlying resources held by superversion (sst files) might
+      // not be released until the next background job.
+      sv->Cleanup();
+      sv_to_delete = sv;
+    } else {
+      db_mutex->Lock();
+    }
+    sv = super_version_->Ref();
+    db_mutex->Unlock();
+
+    delete sv_to_delete;
+  }
+  assert(sv != nullptr);
+  return sv;
+}
+
+bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) {
+  assert(sv != nullptr);
+  // Put the SuperVersion back
+  void* expected = SuperVersion::kSVInUse;
+  if (local_sv_->CompareAndSwap(static_cast<void*>(sv), expected)) {
+    // When we see kSVInUse in the ThreadLocal, we are sure ThreadLocal
+    // storage has not been altered and no Scrape has happend. The
+    // SuperVersion is still current.
+    return true;
+  } else {
+    // ThreadLocal scrape happened in the process of this GetImpl call (after
+    // thread local Swap() at the beginning and before CompareAndSwap()).
+    // This means the SuperVersion it holds is obsolete.
+    assert(expected == SuperVersion::kSVObsolete);
+  }
+  return false;
+}
+
+SuperVersion* ColumnFamilyData::InstallSuperVersion(
+    SuperVersion* new_superversion, port::Mutex* db_mutex) {
+  new_superversion->db_mutex = db_mutex;
+  new_superversion->Init(mem_, imm_.current(), current_);
+  SuperVersion* old_superversion = super_version_;
+  super_version_ = new_superversion;
+  ++super_version_number_;
+  super_version_->version_number = super_version_number_;
+  // Reset SuperVersions cached in thread local storage
+  if (column_family_set_->db_options_->allow_thread_local) {
+    ResetThreadLocalSuperVersions();
+  }
+  if (old_superversion != nullptr && old_superversion->Unref()) {
+    old_superversion->Cleanup();
+    return old_superversion;  // will let caller delete outside of mutex
+  }
+  return nullptr;
+}
+
+void ColumnFamilyData::ResetThreadLocalSuperVersions() {
+  autovector<void*> sv_ptrs;
+  local_sv_->Scrape(&sv_ptrs, SuperVersion::kSVObsolete);
+  for (auto ptr : sv_ptrs) {
+    assert(ptr);
+    if (ptr == SuperVersion::kSVInUse) {
+      continue;
+    }
+    auto sv = static_cast<SuperVersion*>(ptr);
+    if (sv->Unref()) {
+      sv->Cleanup();
+      delete sv;
+    }
+  }
+}
+
+ColumnFamilySet::ColumnFamilySet(const std::string& dbname,
+                                 const DBOptions* db_options,
+                                 const EnvOptions& storage_options,
+                                 Cache* table_cache)
+    : max_column_family_(0),
+      dummy_cfd_(new ColumnFamilyData(dbname, 0, "", nullptr, nullptr,
+                                      ColumnFamilyOptions(), db_options,
+                                      storage_options_, nullptr)),
+      default_cfd_cache_(nullptr),
+      db_name_(dbname),
+      db_options_(db_options),
+      storage_options_(storage_options),
+      table_cache_(table_cache),
+      spin_lock_(ATOMIC_FLAG_INIT) {
+  // initialize linked list
+  dummy_cfd_->prev_ = dummy_cfd_;
+  dummy_cfd_->next_ = dummy_cfd_;
+}
+
+ColumnFamilySet::~ColumnFamilySet() {
+  while (column_family_data_.size() > 0) {
+    // cfd destructor will delete itself from column_family_data_
+    auto cfd = column_family_data_.begin()->second;
+    cfd->Unref();
+    delete cfd;
+  }
+  dummy_cfd_->Unref();
+  delete dummy_cfd_;
+}
+
+ColumnFamilyData* ColumnFamilySet::GetDefault() const {
+  assert(default_cfd_cache_ != nullptr);
+  return default_cfd_cache_;
+}
+
+ColumnFamilyData* ColumnFamilySet::GetColumnFamily(uint32_t id) const {
+  auto cfd_iter = column_family_data_.find(id);
+  if (cfd_iter != column_family_data_.end()) {
+    return cfd_iter->second;
+  } else {
+    return nullptr;
+  }
+}
+
+ColumnFamilyData* ColumnFamilySet::GetColumnFamily(const std::string& name)
+    const {
+  auto cfd_iter = column_families_.find(name);
+  if (cfd_iter != column_families_.end()) {
+    auto cfd = GetColumnFamily(cfd_iter->second);
+    assert(cfd != nullptr);
+    return cfd;
+  } else {
+    return nullptr;
+  }
+}
+
+uint32_t ColumnFamilySet::GetNextColumnFamilyID() {
+  return ++max_column_family_;
+}
+
+uint32_t ColumnFamilySet::GetMaxColumnFamily() { return max_column_family_; }
+
+void ColumnFamilySet::UpdateMaxColumnFamily(uint32_t new_max_column_family) {
+  max_column_family_ = std::max(new_max_column_family, max_column_family_);
+}
+
+size_t ColumnFamilySet::NumberOfColumnFamilies() const {
+  return column_families_.size();
+}
+
+// under a DB mutex
+ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
+    const std::string& name, uint32_t id, Version* dummy_versions,
+    const ColumnFamilyOptions& options) {
+  assert(column_families_.find(name) == column_families_.end());
+  ColumnFamilyData* new_cfd =
+      new ColumnFamilyData(db_name_, id, name, dummy_versions, table_cache_,
+                           options, db_options_, storage_options_, this);
+  Lock();
+  column_families_.insert({name, id});
+  column_family_data_.insert({id, new_cfd});
+  Unlock();
+  max_column_family_ = std::max(max_column_family_, id);
+  // add to linked list
+  new_cfd->next_ = dummy_cfd_;
+  auto prev = dummy_cfd_->prev_;
+  new_cfd->prev_ = prev;
+  prev->next_ = new_cfd;
+  dummy_cfd_->prev_ = new_cfd;
+  if (id == 0) {
+    default_cfd_cache_ = new_cfd;
+  }
+  return new_cfd;
+}
+
+void ColumnFamilySet::Lock() {
+  // spin lock
+  while (spin_lock_.test_and_set(std::memory_order_acquire)) {
+  }
+}
+
+void ColumnFamilySet::Unlock() { spin_lock_.clear(std::memory_order_release); }
+
+// REQUIRES: DB mutex held
+void ColumnFamilySet::FreeDeadColumnFamilies() {
+  autovector<ColumnFamilyData*> to_delete;
+  for (auto cfd = dummy_cfd_->next_; cfd != dummy_cfd_; cfd = cfd->next_) {
+    if (cfd->refs_ == 0) {
+      to_delete.push_back(cfd);
+    }
+  }
+  for (auto cfd : to_delete) {
+    // this is very rare, so it's not a problem that we do it under a mutex
+    delete cfd;
+  }
+}
+
+// under a DB mutex
+void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) {
+  auto cfd_iter = column_family_data_.find(cfd->GetID());
+  assert(cfd_iter != column_family_data_.end());
+  Lock();
+  column_family_data_.erase(cfd_iter);
+  column_families_.erase(cfd->GetName());
+  Unlock();
+}
+
+bool ColumnFamilyMemTablesImpl::Seek(uint32_t column_family_id) {
+  if (column_family_id == 0) {
+    // optimization for common case
+    current_ = column_family_set_->GetDefault();
+  } else {
+    // maybe outside of db mutex, should lock
+    column_family_set_->Lock();
+    current_ = column_family_set_->GetColumnFamily(column_family_id);
+    column_family_set_->Unlock();
+  }
+  handle_.SetCFD(current_);
+  return current_ != nullptr;
+}
+
+uint64_t ColumnFamilyMemTablesImpl::GetLogNumber() const {
+  assert(current_ != nullptr);
+  return current_->GetLogNumber();
+}
+
+MemTable* ColumnFamilyMemTablesImpl::GetMemTable() const {
+  assert(current_ != nullptr);
+  return current_->mem();
+}
+
+const Options* ColumnFamilyMemTablesImpl::GetOptions() const {
+  assert(current_ != nullptr);
+  return current_->options();
+}
+
+ColumnFamilyHandle* ColumnFamilyMemTablesImpl::GetColumnFamilyHandle() {
+  assert(current_ != nullptr);
+  return &handle_;
+}
+
+}  // namespace rocksdb
--- a/db/column_family.h
+++ b/db/column_family.h
@@ -0,0 +1,419 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <unordered_map>
+#include <string>
+#include <vector>
+#include <atomic>
+
+#include "rocksdb/options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "db/memtable_list.h"
+#include "db/write_batch_internal.h"
+#include "db/table_cache.h"
+#include "util/thread_local.h"
+
+namespace rocksdb {
+
+class Version;
+class VersionSet;
+class MemTable;
+class MemTableListVersion;
+class CompactionPicker;
+class Compaction;
+class InternalKey;
+class InternalStats;
+class ColumnFamilyData;
+class DBImpl;
+class LogBuffer;
+
+// ColumnFamilyHandleImpl is the class that clients use to access different
+// column families. It has non-trivial destructor, which gets called when client
+// is done using the column family
+class ColumnFamilyHandleImpl : public ColumnFamilyHandle {
+ public:
+  // create while holding the mutex
+  ColumnFamilyHandleImpl(ColumnFamilyData* cfd, DBImpl* db, port::Mutex* mutex);
+  // destroy without mutex
+  virtual ~ColumnFamilyHandleImpl();
+  virtual ColumnFamilyData* cfd() const { return cfd_; }
+
+  virtual uint32_t GetID() const;
+
+ private:
+  ColumnFamilyData* cfd_;
+  DBImpl* db_;
+  port::Mutex* mutex_;
+};
+
+// Does not ref-count ColumnFamilyData
+// We use this dummy ColumnFamilyHandleImpl because sometimes MemTableInserter
+// calls DBImpl methods. When this happens, MemTableInserter need access to
+// ColumnFamilyHandle (same as the client would need). In that case, we feed
+// MemTableInserter dummy ColumnFamilyHandle and enable it to call DBImpl
+// methods
+class ColumnFamilyHandleInternal : public ColumnFamilyHandleImpl {
+ public:
+  ColumnFamilyHandleInternal()
+      : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr) {}
+
+  void SetCFD(ColumnFamilyData* cfd) { internal_cfd_ = cfd; }
+  virtual ColumnFamilyData* cfd() const override { return internal_cfd_; }
+
+ private:
+  ColumnFamilyData* internal_cfd_;
+};
+
+// holds references to memtable, all immutable memtables and version
+struct SuperVersion {
+  MemTable* mem;
+  MemTableListVersion* imm;
+  Version* current;
+  std::atomic<uint32_t> refs;
+  // We need to_delete because during Cleanup(), imm->Unref() returns
+  // all memtables that we need to free through this vector. We then
+  // delete all those memtables outside of mutex, during destruction
+  autovector<MemTable*> to_delete;
+  // Version number of the current SuperVersion
+  uint64_t version_number;
+  port::Mutex* db_mutex;
+
+  // should be called outside the mutex
+  SuperVersion() = default;
+  ~SuperVersion();
+  SuperVersion* Ref();
+
+  bool Unref();
+
+  // call these two methods with db mutex held
+  // Cleanup unrefs mem, imm and current. Also, it stores all memtables
+  // that needs to be deleted in to_delete vector. Unrefing those
+  // objects needs to be done in the mutex
+  void Cleanup();
+  void Init(MemTable* new_mem, MemTableListVersion* new_imm,
+            Version* new_current);
+
+  // The value of dummy is not actually used. kSVInUse takes its address as a
+  // mark in the thread local storage to indicate the SuperVersion is in use
+  // by thread. This way, the value of kSVInUse is guaranteed to have no
+  // conflict with SuperVersion object address and portable on different
+  // platform.
+  static int dummy;
+  static void* const kSVInUse;
+  static void* const kSVObsolete;
+};
+
+extern ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp,
+                                           const InternalFilterPolicy* ipolicy,
+                                           const ColumnFamilyOptions& src);
+
+class ColumnFamilySet;
+
+// This class keeps all the data that a column family needs. It's mosly dumb and
+// used just to provide access to metadata.
+// Most methods require DB mutex held, unless otherwise noted
+class ColumnFamilyData {
+ public:
+  ~ColumnFamilyData();
+
+  // thread-safe
+  uint32_t GetID() const { return id_; }
+  // thread-safe
+  const std::string& GetName() const { return name_; }
+
+  void Ref() { ++refs_; }
+  // will just decrease reference count to 0, but will not delete it. returns
+  // true if the ref count was decreased to zero. in that case, it can be
+  // deleted by the caller immediatelly, or later, by calling
+  // FreeDeadColumnFamilies()
+  bool Unref() {
+    assert(refs_ > 0);
+    return --refs_ == 0;
+  }
+
+  // This can only be called from single-threaded VersionSet::LogAndApply()
+  // After dropping column family no other operation on that column family
+  // will be executed. All the files and memory will be, however, kept around
+  // until client drops the column family handle. That way, client can still
+  // access data from dropped column family.
+  // Column family can be dropped and still alive. In that state:
+  // *) Column family is not included in the iteration.
+  // *) Compaction and flush is not executed on the dropped column family.
+  // *) Client can continue writing and reading from column family. However, all
+  // writes stay in the current memtable.
+  // When the dropped column family is unreferenced, then we:
+  // *) delete all memory associated with that column family
+  // *) delete all the files associated with that column family
+  void SetDropped() {
+    // can't drop default CF
+    assert(id_ != 0);
+    dropped_ = true;
+  }
+  bool IsDropped() const { return dropped_; }
+
+  // thread-safe
+  int NumberLevels() const { return options_.num_levels; }
+
+  void SetLogNumber(uint64_t log_number) { log_number_ = log_number; }
+  uint64_t GetLogNumber() const { return log_number_; }
+
+  // thread-safe
+  const Options* options() const { return &options_; }
+  const EnvOptions* soptions() const;
+
+  InternalStats* internal_stats() { return internal_stats_.get(); }
+
+  MemTableList* imm() { return &imm_; }
+  MemTable* mem() { return mem_; }
+  Version* current() { return current_; }
+  Version* dummy_versions() { return dummy_versions_; }
+  void SetMemtable(MemTable* new_mem) { mem_ = new_mem; }
+  void SetCurrent(Version* current);
+  void CreateNewMemtable();
+
+  TableCache* table_cache() const { return table_cache_.get(); }
+
+  // See documentation in compaction_picker.h
+  Compaction* PickCompaction(LogBuffer* log_buffer);
+  Compaction* CompactRange(int input_level, int output_level,
+                           const InternalKey* begin, const InternalKey* end,
+                           InternalKey** compaction_end);
+
+  CompactionPicker* compaction_picker() { return compaction_picker_.get(); }
+  // thread-safe
+  const Comparator* user_comparator() const {
+    return internal_comparator_.user_comparator();
+  }
+  // thread-safe
+  const InternalKeyComparator& internal_comparator() const {
+    return internal_comparator_;
+  }
+
+  SuperVersion* GetSuperVersion() { return super_version_; }
+  // thread-safe
+  // Return a already referenced SuperVersion to be used safely.
+  SuperVersion* GetReferencedSuperVersion(port::Mutex* db_mutex);
+  // thread-safe
+  // Get SuperVersion stored in thread local storage. If it does not exist,
+  // get a reference from a current SuperVersion.
+  SuperVersion* GetThreadLocalSuperVersion(port::Mutex* db_mutex);
+  // Try to return SuperVersion back to thread local storage. Retrun true on
+  // success and false on failure. It fails when the thread local storage
+  // contains anything other than SuperVersion::kSVInUse flag.
+  bool ReturnThreadLocalSuperVersion(SuperVersion* sv);
+  // thread-safe
+  uint64_t GetSuperVersionNumber() const {
+    return super_version_number_.load();
+  }
+  // will return a pointer to SuperVersion* if previous SuperVersion
+  // if its reference count is zero and needs deletion or nullptr if not
+  // As argument takes a pointer to allocated SuperVersion to enable
+  // the clients to allocate SuperVersion outside of mutex.
+  SuperVersion* InstallSuperVersion(SuperVersion* new_superversion,
+                                    port::Mutex* db_mutex);
+
+  void ResetThreadLocalSuperVersions();
+
+  // A Flag indicating whether write needs to slowdown because of there are
+  // too many number of level0 files.
+  bool NeedSlowdownForNumLevel0Files() const {
+    return need_slowdown_for_num_level0_files_;
+  }
+
+ private:
+  friend class ColumnFamilySet;
+  ColumnFamilyData(const std::string& dbname, uint32_t id,
+                   const std::string& name, Version* dummy_versions,
+                   Cache* table_cache, const ColumnFamilyOptions& options,
+                   const DBOptions* db_options,
+                   const EnvOptions& storage_options,
+                   ColumnFamilySet* column_family_set);
+
+  uint32_t id_;
+  const std::string name_;
+  Version* dummy_versions_;  // Head of circular doubly-linked list of versions.
+  Version* current_;         // == dummy_versions->prev_
+
+  int refs_;                   // outstanding references to ColumnFamilyData
+  bool dropped_;               // true if client dropped it
+
+  const InternalKeyComparator internal_comparator_;
+  const InternalFilterPolicy internal_filter_policy_;
+
+  Options const options_;
+
+  std::unique_ptr<TableCache> table_cache_;
+
+  std::unique_ptr<InternalStats> internal_stats_;
+
+  MemTable* mem_;
+  MemTableList imm_;
+  SuperVersion* super_version_;
+
+  // An ordinal representing the current SuperVersion. Updated by
+  // InstallSuperVersion(), i.e. incremented every time super_version_
+  // changes.
+  std::atomic<uint64_t> super_version_number_;
+
+  // Thread's local copy of SuperVersion pointer
+  // This needs to be destructed before mutex_
+  std::unique_ptr<ThreadLocalPtr> local_sv_;
+
+  // pointers for a circular linked list. we use it to support iterations
+  // that can be concurrent with writes
+  ColumnFamilyData* next_;
+  ColumnFamilyData* prev_;
+
+  // This is the earliest log file number that contains data from this
+  // Column Family. All earlier log files must be ignored and not
+  // recovered from
+  uint64_t log_number_;
+
+  // A flag indicating whether we should delay writes because
+  // we have too many level 0 files
+  bool need_slowdown_for_num_level0_files_;
+
+  // An object that keeps all the compaction stats
+  // and picks the next compaction
+  std::unique_ptr<CompactionPicker> compaction_picker_;
+
+  ColumnFamilySet* column_family_set_;
+};
+
+// ColumnFamilySet has interesting thread-safety requirements
+// * CreateColumnFamily() or RemoveColumnFamily() -- need to protect by DB
+// mutex. Inside, column_family_data_ and column_families_ will be protected
+// by Lock() and Unlock(). CreateColumnFamily() should ONLY be called from
+// VersionSet::LogAndApply() in the normal runtime. It is also called
+// during Recovery and in DumpManifest(). RemoveColumnFamily() is called
+// from ColumnFamilyData destructor
+// * Iteration -- hold DB mutex, but you can release it in the body of
+// iteration. If you release DB mutex in body, reference the column
+// family before the mutex and unreference after you unlock, since the column
+// family might get dropped when the DB mutex is released
+// * GetDefault() -- thread safe
+// * GetColumnFamily() -- either inside of DB mutex or call Lock() <-> Unlock()
+// * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(),
+// NumberOfColumnFamilies -- inside of DB mutex
+class ColumnFamilySet {
+ public:
+  // ColumnFamilySet supports iteration
+  class iterator {
+   public:
+    explicit iterator(ColumnFamilyData* cfd)
+        : current_(cfd) {}
+    iterator& operator++() {
+      // dummy is never dead or dropped, so this will never be infinite
+      do {
+        current_ = current_->next_;
+      } while (current_->refs_ == 0 || current_->IsDropped());
+      return *this;
+    }
+    bool operator!=(const iterator& other) {
+      return this->current_ != other.current_;
+    }
+    ColumnFamilyData* operator*() { return current_; }
+
+   private:
+    ColumnFamilyData* current_;
+  };
+
+  ColumnFamilySet(const std::string& dbname, const DBOptions* db_options,
+                  const EnvOptions& storage_options, Cache* table_cache);
+  ~ColumnFamilySet();
+
+  ColumnFamilyData* GetDefault() const;
+  // GetColumnFamily() calls return nullptr if column family is not found
+  ColumnFamilyData* GetColumnFamily(uint32_t id) const;
+  ColumnFamilyData* GetColumnFamily(const std::string& name) const;
+  // this call will return the next available column family ID. it guarantees
+  // that there is no column family with id greater than or equal to the
+  // returned value in the current running instance or anytime in RocksDB
+  // instance history.
+  uint32_t GetNextColumnFamilyID();
+  uint32_t GetMaxColumnFamily();
+  void UpdateMaxColumnFamily(uint32_t new_max_column_family);
+  size_t NumberOfColumnFamilies() const;
+
+  ColumnFamilyData* CreateColumnFamily(const std::string& name, uint32_t id,
+                                       Version* dummy_version,
+                                       const ColumnFamilyOptions& options);
+
+  iterator begin() { return iterator(dummy_cfd_->next_); }
+  iterator end() { return iterator(dummy_cfd_); }
+
+  void Lock();
+  void Unlock();
+
+  // REQUIRES: DB mutex held
+  // Don't call while iterating over ColumnFamilySet
+  void FreeDeadColumnFamilies();
+
+ private:
+  friend class ColumnFamilyData;
+  // helper function that gets called from cfd destructor
+  // REQUIRES: DB mutex held
+  void RemoveColumnFamily(ColumnFamilyData* cfd);
+
+  // column_families_ and column_family_data_ need to be protected:
+  // * when mutating: 1. DB mutex locked first, 2. spinlock locked second
+  // * when reading, either: 1. lock DB mutex, or 2. lock spinlock
+  //  (if both, respect the ordering to avoid deadlock!)
+  std::unordered_map<std::string, uint32_t> column_families_;
+  std::unordered_map<uint32_t, ColumnFamilyData*> column_family_data_;
+
+  uint32_t max_column_family_;
+  ColumnFamilyData* dummy_cfd_;
+  // We don't hold the refcount here, since default column family always exists
+  // We are also not responsible for cleaning up default_cfd_cache_. This is
+  // just a cache that makes common case (accessing default column family)
+  // faster
+  ColumnFamilyData* default_cfd_cache_;
+
+  const std::string db_name_;
+  const DBOptions* const db_options_;
+  const EnvOptions storage_options_;
+  Cache* table_cache_;
+  std::atomic_flag spin_lock_;
+};
+
+// We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access
+// memtables of different column families (specified by ID in the write batch)
+class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables {
+ public:
+  explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set)
+      : column_family_set_(column_family_set), current_(nullptr) {}
+
+  // sets current_ to ColumnFamilyData with column_family_id
+  // returns false if column family doesn't exist
+  bool Seek(uint32_t column_family_id) override;
+
+  // Returns log number of the selected column family
+  uint64_t GetLogNumber() const override;
+
+  // REQUIRES: Seek() called first
+  virtual MemTable* GetMemTable() const override;
+
+  // Returns options for selected column family
+  // REQUIRES: Seek() called first
+  virtual const Options* GetOptions() const override;
+
+  // Returns column family handle for the selected column family
+  virtual ColumnFamilyHandle* GetColumnFamilyHandle() override;
+
+ private:
+  ColumnFamilySet* column_family_set_;
+  ColumnFamilyData* current_;
+  ColumnFamilyHandleInternal handle_;
+};
+
+}  // namespace rocksdb
--- a/db/column_family_test.cc
+++ b/db/column_family_test.cc
@@ -0,0 +1,977 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <algorithm>
+#include <vector>
+#include <string>
+
+#include "db/db_impl.h"
+#include "rocksdb/env.h"
+#include "rocksdb/db.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+#include "util/coding.h"
+#include "utilities/merge_operators.h"
+
+namespace rocksdb {
+
+namespace {
+std::string RandomString(Random* rnd, int len) {
+  std::string r;
+  test::RandomString(rnd, len, &r);
+  return r;
+}
+}  // anonymous namespace
+
+// counts how many operations were performed
+class EnvCounter : public EnvWrapper {
+ public:
+  explicit EnvCounter(Env* base)
+      : EnvWrapper(base), num_new_writable_file_(0) {}
+  int GetNumberOfNewWritableFileCalls() {
+    return num_new_writable_file_;
+  }
+  Status NewWritableFile(const std::string& f, unique_ptr<WritableFile>* r,
+                         const EnvOptions& soptions) {
+    ++num_new_writable_file_;
+    return EnvWrapper::NewWritableFile(f, r, soptions);
+  }
+
+ private:
+  int num_new_writable_file_;
+};
+
+class ColumnFamilyTest {
+ public:
+  ColumnFamilyTest() : rnd_(139) {
+    env_ = new EnvCounter(Env::Default());
+    dbname_ = test::TmpDir() + "/column_family_test";
+    db_options_.create_if_missing = true;
+    db_options_.env = env_;
+    DestroyDB(dbname_, Options(db_options_, column_family_options_));
+  }
+
+  ~ColumnFamilyTest() {
+    delete env_;
+  }
+
+  void Close() {
+    for (auto h : handles_) {
+      delete h;
+    }
+    handles_.clear();
+    names_.clear();
+    delete db_;
+    db_ = nullptr;
+  }
+
+  Status TryOpen(std::vector<std::string> cf,
+                 std::vector<ColumnFamilyOptions> options = {}) {
+    std::vector<ColumnFamilyDescriptor> column_families;
+    names_.clear();
+    for (size_t i = 0; i < cf.size(); ++i) {
+      column_families.push_back(ColumnFamilyDescriptor(
+          cf[i], options.size() == 0 ? column_family_options_ : options[i]));
+      names_.push_back(cf[i]);
+    }
+    return DB::Open(db_options_, dbname_, column_families, &handles_, &db_);
+  }
+
+  Status OpenReadOnly(std::vector<std::string> cf,
+                         std::vector<ColumnFamilyOptions> options = {}) {
+    std::vector<ColumnFamilyDescriptor> column_families;
+    names_.clear();
+    for (size_t i = 0; i < cf.size(); ++i) {
+      column_families.push_back(ColumnFamilyDescriptor(
+          cf[i], options.size() == 0 ? column_family_options_ : options[i]));
+      names_.push_back(cf[i]);
+    }
+    return DB::OpenForReadOnly(db_options_, dbname_, column_families, &handles_,
+                               &db_);
+  }
+
+  void AssertOpenReadOnly(std::vector<std::string> cf,
+                    std::vector<ColumnFamilyOptions> options = {}) {
+    ASSERT_OK(OpenReadOnly(cf, options));
+  }
+
+
+  void Open(std::vector<std::string> cf,
+            std::vector<ColumnFamilyOptions> options = {}) {
+    ASSERT_OK(TryOpen(cf, options));
+  }
+
+  void Open() {
+    Open({"default"});
+  }
+
+  DBImpl* dbfull() { return reinterpret_cast<DBImpl*>(db_); }
+
+  int GetProperty(int cf, std::string property) {
+    std::string value;
+    ASSERT_TRUE(dbfull()->GetProperty(handles_[cf], property, &value));
+    return std::stoi(value);
+  }
+
+  void Destroy() {
+    for (auto h : handles_) {
+      delete h;
+    }
+    handles_.clear();
+    names_.clear();
+    delete db_;
+    db_ = nullptr;
+    ASSERT_OK(DestroyDB(dbname_, Options(db_options_, column_family_options_)));
+  }
+
+  void CreateColumnFamilies(
+      const std::vector<std::string>& cfs,
+      const std::vector<ColumnFamilyOptions> options = {}) {
+    int cfi = handles_.size();
+    handles_.resize(cfi + cfs.size());
+    names_.resize(cfi + cfs.size());
+    for (size_t i = 0; i < cfs.size(); ++i) {
+      ASSERT_OK(db_->CreateColumnFamily(
+          options.size() == 0 ? column_family_options_ : options[i], cfs[i],
+          &handles_[cfi]));
+      names_[cfi] = cfs[i];
+      cfi++;
+    }
+  }
+
+  void Reopen(const std::vector<ColumnFamilyOptions> options = {}) {
+    std::vector<std::string> names;
+    for (auto name : names_) {
+      if (name != "") {
+        names.push_back(name);
+      }
+    }
+    Close();
+    assert(options.size() == 0 || names.size() == options.size());
+    Open(names, options);
+  }
+
+  void CreateColumnFamiliesAndReopen(const std::vector<std::string>& cfs) {
+    CreateColumnFamilies(cfs);
+    Reopen();
+  }
+
+  void DropColumnFamilies(const std::vector<int>& cfs) {
+    for (auto cf : cfs) {
+      ASSERT_OK(db_->DropColumnFamily(handles_[cf]));
+      delete handles_[cf];
+      handles_[cf] = nullptr;
+      names_[cf] = "";
+    }
+  }
+
+  void PutRandomData(int cf, int num, int key_value_size) {
+    for (int i = 0; i < num; ++i) {
+      // 10 bytes for key, rest is value
+      ASSERT_OK(Put(cf, test::RandomKey(&rnd_, 10),
+                    RandomString(&rnd_, key_value_size - 10)));
+    }
+  }
+
+  void WaitForFlush(int cf) {
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf]));
+  }
+
+  void WaitForCompaction() { ASSERT_OK(dbfull()->TEST_WaitForCompact()); }
+
+  Status Put(int cf, const std::string& key, const std::string& value) {
+    return db_->Put(WriteOptions(), handles_[cf], Slice(key), Slice(value));
+  }
+  Status Merge(int cf, const std::string& key, const std::string& value) {
+    return db_->Merge(WriteOptions(), handles_[cf], Slice(key), Slice(value));
+  }
+  Status Flush(int cf) {
+    return db_->Flush(FlushOptions(), handles_[cf]);
+  }
+
+  std::string Get(int cf, const std::string& key) {
+    ReadOptions options;
+    options.verify_checksums = true;
+    std::string result;
+    Status s = db_->Get(options, handles_[cf], Slice(key), &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+
+  void CompactAll(int cf) {
+    ASSERT_OK(db_->CompactRange(handles_[cf], nullptr, nullptr));
+  }
+
+  void Compact(int cf, const Slice& start, const Slice& limit) {
+    ASSERT_OK(db_->CompactRange(handles_[cf], &start, &limit));
+  }
+
+  int NumTableFilesAtLevel(int level, int cf) {
+    return GetProperty(cf,
+                       "rocksdb.num-files-at-level" + std::to_string(level));
+  }
+
+  // Return spread of files per level
+  std::string FilesPerLevel(int cf) {
+    std::string result;
+    int last_non_zero_offset = 0;
+    for (int level = 0; level < dbfull()->NumberLevels(handles_[cf]); level++) {
+      int f = NumTableFilesAtLevel(level, cf);
+      char buf[100];
+      snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+      result += buf;
+      if (f > 0) {
+        last_non_zero_offset = result.size();
+      }
+    }
+    result.resize(last_non_zero_offset);
+    return result;
+  }
+
+  int CountLiveFiles() {
+    std::vector<LiveFileMetaData> metadata;
+    db_->GetLiveFilesMetaData(&metadata);
+    return static_cast<int>(metadata.size());
+  }
+
+  // Do n memtable flushes, each of which produces an sstable
+  // covering the range [small,large].
+  void MakeTables(int cf, int n, const std::string& small,
+                  const std::string& large) {
+    for (int i = 0; i < n; i++) {
+      ASSERT_OK(Put(cf, small, "begin"));
+      ASSERT_OK(Put(cf, large, "end"));
+      ASSERT_OK(db_->Flush(FlushOptions(), handles_[cf]));
+    }
+  }
+
+  int CountLiveLogFiles() {
+    int micros_wait_for_log_deletion = 20000;
+    env_->SleepForMicroseconds(micros_wait_for_log_deletion);
+    int ret = 0;
+    VectorLogPtr wal_files;
+    Status s;
+    // GetSortedWalFiles is a flakey function -- it gets all the wal_dir
+    // children files and then later checks for their existance. if some of the
+    // log files doesn't exist anymore, it reports an error. it does all of this
+    // without DB mutex held, so if a background process deletes the log file
+    // while the function is being executed, it returns an error. We retry the
+    // function 10 times to avoid the error failing the test
+    for (int retries = 0; retries < 10; ++retries) {
+      wal_files.clear();
+      s = db_->GetSortedWalFiles(wal_files);
+      if (s.ok()) {
+        break;
+      }
+    }
+    ASSERT_OK(s);
+    for (const auto& wal : wal_files) {
+      if (wal->Type() == kAliveLogFile) {
+        ++ret;
+      }
+    }
+    return ret;
+  }
+
+  void AssertNumberOfImmutableMemtables(std::vector<int> num_per_cf) {
+    assert(num_per_cf.size() == handles_.size());
+
+    for (size_t i = 0; i < num_per_cf.size(); ++i) {
+      ASSERT_EQ(num_per_cf[i],
+                GetProperty(i, "rocksdb.num-immutable-mem-table"));
+    }
+  }
+
+  void CopyFile(const std::string& source, const std::string& destination,
+                uint64_t size = 0) {
+    const EnvOptions soptions;
+    unique_ptr<SequentialFile> srcfile;
+    ASSERT_OK(env_->NewSequentialFile(source, &srcfile, soptions));
+    unique_ptr<WritableFile> destfile;
+    ASSERT_OK(env_->NewWritableFile(destination, &destfile, soptions));
+
+    if (size == 0) {
+      // default argument means copy everything
+      ASSERT_OK(env_->GetFileSize(source, &size));
+    }
+
+    char buffer[4096];
+    Slice slice;
+    while (size > 0) {
+      uint64_t one = std::min(uint64_t(sizeof(buffer)), size);
+      ASSERT_OK(srcfile->Read(one, &slice, buffer));
+      ASSERT_OK(destfile->Append(slice));
+      size -= slice.size();
+    }
+    ASSERT_OK(destfile->Close());
+  }
+
+  std::vector<ColumnFamilyHandle*> handles_;
+  std::vector<std::string> names_;
+  ColumnFamilyOptions column_family_options_;
+  DBOptions db_options_;
+  std::string dbname_;
+  DB* db_ = nullptr;
+  EnvCounter* env_;
+  Random rnd_;
+};
+
+TEST(ColumnFamilyTest, DontReuseColumnFamilyID) {
+  for (int iter = 0; iter < 3; ++iter) {
+    Open();
+    CreateColumnFamilies({"one", "two", "three"});
+    for (size_t i = 0; i < handles_.size(); ++i) {
+      auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+      ASSERT_EQ(i, cfh->GetID());
+    }
+    if (iter == 1) {
+      Reopen();
+    }
+    DropColumnFamilies({3});
+    Reopen();
+    if (iter == 2) {
+      // this tests if max_column_family is correctly persisted with
+      // WriteSnapshot()
+      Reopen();
+    }
+    CreateColumnFamilies({"three2"});
+    // ID 3 that was used for dropped column family "three" should not be reused
+    auto cfh3 = reinterpret_cast<ColumnFamilyHandleImpl*>(handles_[3]);
+    ASSERT_EQ(4U, cfh3->GetID());
+    Close();
+    Destroy();
+  }
+}
+
+
+TEST(ColumnFamilyTest, AddDrop) {
+  Open();
+  CreateColumnFamilies({"one", "two", "three"});
+  ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
+  ASSERT_EQ("NOT_FOUND", Get(2, "fodor"));
+  DropColumnFamilies({2});
+  ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
+  CreateColumnFamilies({"four"});
+  ASSERT_EQ("NOT_FOUND", Get(3, "fodor"));
+  ASSERT_OK(Put(1, "fodor", "mirko"));
+  ASSERT_EQ("mirko", Get(1, "fodor"));
+  ASSERT_EQ("NOT_FOUND", Get(3, "fodor"));
+  Close();
+  ASSERT_TRUE(TryOpen({"default"}).IsInvalidArgument());
+  Open({"default", "one", "three", "four"});
+  DropColumnFamilies({1});
+  Reopen();
+  Close();
+
+  std::vector<std::string> families;
+  ASSERT_OK(DB::ListColumnFamilies(db_options_, dbname_, &families));
+  sort(families.begin(), families.end());
+  ASSERT_TRUE(families ==
+              std::vector<std::string>({"default", "four", "three"}));
+}
+
+TEST(ColumnFamilyTest, DropTest) {
+  // first iteration - dont reopen DB before dropping
+  // second iteration - reopen DB before dropping
+  for (int iter = 0; iter < 2; ++iter) {
+    Open({"default"});
+    CreateColumnFamiliesAndReopen({"pikachu"});
+    for (int i = 0; i < 100; ++i) {
+      ASSERT_OK(Put(1, std::to_string(i), "bar" + std::to_string(i)));
+    }
+    ASSERT_OK(Flush(1));
+
+    if (iter == 1) {
+      Reopen();
+    }
+    ASSERT_EQ("bar1", Get(1, "1"));
+
+    ASSERT_EQ(CountLiveFiles(), 1);
+    DropColumnFamilies({1});
+    // make sure that all files are deleted when we drop the column family
+    ASSERT_EQ(CountLiveFiles(), 0);
+    Destroy();
+  }
+}
+
+TEST(ColumnFamilyTest, WriteBatchFailure) {
+  Open();
+  CreateColumnFamiliesAndReopen({"one", "two"});
+  WriteBatch batch;
+  batch.Put(handles_[1], Slice("non-existing"), Slice("column-family"));
+  ASSERT_OK(db_->Write(WriteOptions(), &batch));
+  DropColumnFamilies({1});
+  Status s = db_->Write(WriteOptions(), &batch);
+  ASSERT_TRUE(s.IsInvalidArgument());
+  Close();
+}
+
+TEST(ColumnFamilyTest, ReadWrite) {
+  Open();
+  CreateColumnFamiliesAndReopen({"one", "two"});
+  ASSERT_OK(Put(0, "foo", "v1"));
+  ASSERT_OK(Put(0, "bar", "v2"));
+  ASSERT_OK(Put(1, "mirko", "v3"));
+  ASSERT_OK(Put(0, "foo", "v2"));
+  ASSERT_OK(Put(2, "fodor", "v5"));
+
+  for (int iter = 0; iter <= 3; ++iter) {
+    ASSERT_EQ("v2", Get(0, "foo"));
+    ASSERT_EQ("v2", Get(0, "bar"));
+    ASSERT_EQ("v3", Get(1, "mirko"));
+    ASSERT_EQ("v5", Get(2, "fodor"));
+    ASSERT_EQ("NOT_FOUND", Get(0, "fodor"));
+    ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
+    ASSERT_EQ("NOT_FOUND", Get(2, "foo"));
+    if (iter <= 1) {
+      Reopen();
+    }
+  }
+  Close();
+}
+
+TEST(ColumnFamilyTest, IgnoreRecoveredLog) {
+  std::string backup_logs = dbname_ + "/backup_logs";
+
+  // delete old files in backup_logs directory
+  ASSERT_OK(env_->CreateDirIfMissing(dbname_));
+  ASSERT_OK(env_->CreateDirIfMissing(backup_logs));
+  std::vector<std::string> old_files;
+  env_->GetChildren(backup_logs, &old_files);
+  for (auto& file : old_files) {
+    if (file != "." && file != "..") {
+      env_->DeleteFile(backup_logs + "/" + file);
+    }
+  }
+
+  column_family_options_.merge_operator =
+      MergeOperators::CreateUInt64AddOperator();
+  db_options_.wal_dir = dbname_ + "/logs";
+  Destroy();
+  Open();
+  CreateColumnFamilies({"cf1", "cf2"});
+
+  // fill up the DB
+  std::string one, two, three;
+  PutFixed64(&one, 1);
+  PutFixed64(&two, 2);
+  PutFixed64(&three, 3);
+  ASSERT_OK(Merge(0, "foo", one));
+  ASSERT_OK(Merge(1, "mirko", one));
+  ASSERT_OK(Merge(0, "foo", one));
+  ASSERT_OK(Merge(2, "bla", one));
+  ASSERT_OK(Merge(2, "fodor", one));
+  ASSERT_OK(Merge(0, "bar", one));
+  ASSERT_OK(Merge(2, "bla", one));
+  ASSERT_OK(Merge(1, "mirko", two));
+  ASSERT_OK(Merge(1, "franjo", one));
+
+  // copy the logs to backup
+  std::vector<std::string> logs;
+  env_->GetChildren(db_options_.wal_dir, &logs);
+  for (auto& log : logs) {
+    if (log != ".." && log != ".") {
+      CopyFile(db_options_.wal_dir + "/" + log, backup_logs + "/" + log);
+    }
+  }
+
+  // recover the DB
+  Close();
+
+  // 1. check consistency
+  // 2. copy the logs from backup back to WAL dir. if the recovery happens
+  // again on the same log files, this should lead to incorrect results
+  // due to applying merge operator twice
+  // 3. check consistency
+  for (int iter = 0; iter < 2; ++iter) {
+    // assert consistency
+    Open({"default", "cf1", "cf2"});
+    ASSERT_EQ(two, Get(0, "foo"));
+    ASSERT_EQ(one, Get(0, "bar"));
+    ASSERT_EQ(three, Get(1, "mirko"));
+    ASSERT_EQ(one, Get(1, "franjo"));
+    ASSERT_EQ(one, Get(2, "fodor"));
+    ASSERT_EQ(two, Get(2, "bla"));
+    Close();
+
+    if (iter == 0) {
+      // copy the logs from backup back to wal dir
+      for (auto& log : logs) {
+        if (log != ".." && log != ".") {
+          CopyFile(backup_logs + "/" + log, db_options_.wal_dir + "/" + log);
+        }
+      }
+    }
+  }
+}
+
+TEST(ColumnFamilyTest, FlushTest) {
+  Open();
+  CreateColumnFamiliesAndReopen({"one", "two"});
+  ASSERT_OK(Put(0, "foo", "v1"));
+  ASSERT_OK(Put(0, "bar", "v2"));
+  ASSERT_OK(Put(1, "mirko", "v3"));
+  ASSERT_OK(Put(0, "foo", "v2"));
+  ASSERT_OK(Put(2, "fodor", "v5"));
+  for (int i = 0; i < 3; ++i) {
+    Flush(i);
+  }
+  Reopen();
+
+  for (int iter = 0; iter <= 2; ++iter) {
+    ASSERT_EQ("v2", Get(0, "foo"));
+    ASSERT_EQ("v2", Get(0, "bar"));
+    ASSERT_EQ("v3", Get(1, "mirko"));
+    ASSERT_EQ("v5", Get(2, "fodor"));
+    ASSERT_EQ("NOT_FOUND", Get(0, "fodor"));
+    ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
+    ASSERT_EQ("NOT_FOUND", Get(2, "foo"));
+    if (iter <= 1) {
+      Reopen();
+    }
+  }
+  Close();
+}
+
+// Makes sure that obsolete log files get deleted
+TEST(ColumnFamilyTest, LogDeletionTest) {
+  db_options_.max_total_wal_size = std::numeric_limits<uint64_t>::max();
+  column_family_options_.write_buffer_size = 100000;  // 100KB
+  Open();
+  CreateColumnFamilies({"one", "two", "three", "four"});
+  // Each bracket is one log file. if number is in (), it means
+  // we don't need it anymore (it's been flushed)
+  // []
+  ASSERT_EQ(CountLiveLogFiles(), 0);
+  PutRandomData(0, 1, 100);
+  // [0]
+  PutRandomData(1, 1, 100);
+  // [0, 1]
+  PutRandomData(1, 1000, 100);
+  WaitForFlush(1);
+  // [0, (1)] [1]
+  ASSERT_EQ(CountLiveLogFiles(), 2);
+  PutRandomData(0, 1, 100);
+  // [0, (1)] [0, 1]
+  ASSERT_EQ(CountLiveLogFiles(), 2);
+  PutRandomData(2, 1, 100);
+  // [0, (1)] [0, 1, 2]
+  PutRandomData(2, 1000, 100);
+  WaitForFlush(2);
+  // [0, (1)] [0, 1, (2)] [2]
+  ASSERT_EQ(CountLiveLogFiles(), 3);
+  PutRandomData(2, 1000, 100);
+  WaitForFlush(2);
+  // [0, (1)] [0, 1, (2)] [(2)] [2]
+  ASSERT_EQ(CountLiveLogFiles(), 4);
+  PutRandomData(3, 1, 100);
+  // [0, (1)] [0, 1, (2)] [(2)] [2, 3]
+  PutRandomData(1, 1, 100);
+  // [0, (1)] [0, 1, (2)] [(2)] [1, 2, 3]
+  ASSERT_EQ(CountLiveLogFiles(), 4);
+  PutRandomData(1, 1000, 100);
+  WaitForFlush(1);
+  // [0, (1)] [0, (1), (2)] [(2)] [(1), 2, 3] [1]
+  ASSERT_EQ(CountLiveLogFiles(), 5);
+  PutRandomData(0, 1000, 100);
+  WaitForFlush(0);
+  // [(0), (1)] [(0), (1), (2)] [(2)] [(1), 2, 3] [1, (0)] [0]
+  // delete obsolete logs -->
+  // [(1), 2, 3] [1, (0)] [0]
+  ASSERT_EQ(CountLiveLogFiles(), 3);
+  PutRandomData(0, 1000, 100);
+  WaitForFlush(0);
+  // [(1), 2, 3] [1, (0)], [(0)] [0]
+  ASSERT_EQ(CountLiveLogFiles(), 4);
+  PutRandomData(1, 1000, 100);
+  WaitForFlush(1);
+  // [(1), 2, 3] [(1), (0)] [(0)] [0, (1)] [1]
+  ASSERT_EQ(CountLiveLogFiles(), 5);
+  PutRandomData(2, 1000, 100);
+  WaitForFlush(2);
+  // [(1), (2), 3] [(1), (0)] [(0)] [0, (1)] [1, (2)], [2]
+  ASSERT_EQ(CountLiveLogFiles(), 6);
+  PutRandomData(3, 1000, 100);
+  WaitForFlush(3);
+  // [(1), (2), (3)] [(1), (0)] [(0)] [0, (1)] [1, (2)], [2, (3)] [3]
+  // delete obsolete logs -->
+  // [0, (1)] [1, (2)], [2, (3)] [3]
+  ASSERT_EQ(CountLiveLogFiles(), 4);
+  Close();
+}
+
+// Makes sure that obsolete log files get deleted
+TEST(ColumnFamilyTest, DifferentWriteBufferSizes) {
+  // disable flushing stale column families
+  db_options_.max_total_wal_size = std::numeric_limits<uint64_t>::max();
+  Open();
+  CreateColumnFamilies({"one", "two", "three"});
+  ColumnFamilyOptions default_cf, one, two, three;
+  // setup options. all column families have max_write_buffer_number setup to 10
+  // "default" -> 100KB memtable, start flushing immediatelly
+  // "one" -> 200KB memtable, start flushing with two immutable memtables
+  // "two" -> 1MB memtable, start flushing with three immutable memtables
+  // "three" -> 90KB memtable, start flushing with four immutable memtables
+  default_cf.write_buffer_size = 100000;
+  default_cf.max_write_buffer_number = 10;
+  default_cf.min_write_buffer_number_to_merge = 1;
+  one.write_buffer_size = 200000;
+  one.max_write_buffer_number = 10;
+  one.min_write_buffer_number_to_merge = 2;
+  two.write_buffer_size = 1000000;
+  two.max_write_buffer_number = 10;
+  two.min_write_buffer_number_to_merge = 3;
+  three.write_buffer_size = 90000;
+  three.max_write_buffer_number = 10;
+  three.min_write_buffer_number_to_merge = 4;
+
+  Reopen({default_cf, one, two, three});
+
+  int micros_wait_for_flush = 10000;
+  PutRandomData(0, 100, 1000);
+  WaitForFlush(0);
+  AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+  ASSERT_EQ(CountLiveLogFiles(), 1);
+  PutRandomData(1, 200, 1000);
+  env_->SleepForMicroseconds(micros_wait_for_flush);
+  AssertNumberOfImmutableMemtables({0, 1, 0, 0});
+  ASSERT_EQ(CountLiveLogFiles(), 2);
+  PutRandomData(2, 1000, 1000);
+  env_->SleepForMicroseconds(micros_wait_for_flush);
+  AssertNumberOfImmutableMemtables({0, 1, 1, 0});
+  ASSERT_EQ(CountLiveLogFiles(), 3);
+  PutRandomData(2, 1000, 1000);
+  env_->SleepForMicroseconds(micros_wait_for_flush);
+  AssertNumberOfImmutableMemtables({0, 1, 2, 0});
+  ASSERT_EQ(CountLiveLogFiles(), 4);
+  PutRandomData(3, 90, 1000);
+  env_->SleepForMicroseconds(micros_wait_for_flush);
+  AssertNumberOfImmutableMemtables({0, 1, 2, 1});
+  ASSERT_EQ(CountLiveLogFiles(), 5);
+  PutRandomData(3, 90, 1000);
+  env_->SleepForMicroseconds(micros_wait_for_flush);
+  AssertNumberOfImmutableMemtables({0, 1, 2, 2});
+  ASSERT_EQ(CountLiveLogFiles(), 6);
+  PutRandomData(3, 90, 1000);
+  env_->SleepForMicroseconds(micros_wait_for_flush);
+  AssertNumberOfImmutableMemtables({0, 1, 2, 3});
+  ASSERT_EQ(CountLiveLogFiles(), 7);
+  PutRandomData(0, 100, 1000);
+  WaitForFlush(0);
+  AssertNumberOfImmutableMemtables({0, 1, 2, 3});
+  ASSERT_EQ(CountLiveLogFiles(), 8);
+  PutRandomData(2, 100, 10000);
+  WaitForFlush(2);
+  AssertNumberOfImmutableMemtables({0, 1, 0, 3});
+  ASSERT_EQ(CountLiveLogFiles(), 9);
+  PutRandomData(3, 90, 1000);
+  WaitForFlush(3);
+  AssertNumberOfImmutableMemtables({0, 1, 0, 0});
+  ASSERT_EQ(CountLiveLogFiles(), 10);
+  PutRandomData(3, 90, 1000);
+  env_->SleepForMicroseconds(micros_wait_for_flush);
+  AssertNumberOfImmutableMemtables({0, 1, 0, 1});
+  ASSERT_EQ(CountLiveLogFiles(), 11);
+  PutRandomData(1, 200, 1000);
+  WaitForFlush(1);
+  AssertNumberOfImmutableMemtables({0, 0, 0, 1});
+  ASSERT_EQ(CountLiveLogFiles(), 5);
+  PutRandomData(3, 90*6, 1000);
+  WaitForFlush(3);
+  AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+  ASSERT_EQ(CountLiveLogFiles(), 12);
+  PutRandomData(0, 100, 1000);
+  WaitForFlush(0);
+  AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+  ASSERT_EQ(CountLiveLogFiles(), 12);
+  PutRandomData(2, 3*100, 10000);
+  WaitForFlush(2);
+  AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+  ASSERT_EQ(CountLiveLogFiles(), 12);
+  PutRandomData(1, 2*200, 1000);
+  WaitForFlush(1);
+  AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+  ASSERT_EQ(CountLiveLogFiles(), 7);
+  Close();
+}
+
+TEST(ColumnFamilyTest, DifferentMergeOperators) {
+  Open();
+  CreateColumnFamilies({"first", "second"});
+  ColumnFamilyOptions default_cf, first, second;
+  first.merge_operator = MergeOperators::CreateUInt64AddOperator();
+  second.merge_operator = MergeOperators::CreateStringAppendOperator();
+  Reopen({default_cf, first, second});
+
+  std::string one, two, three;
+  PutFixed64(&one, 1);
+  PutFixed64(&two, 2);
+  PutFixed64(&three, 3);
+
+  ASSERT_OK(Put(0, "foo", two));
+  ASSERT_OK(Put(0, "foo", one));
+  ASSERT_TRUE(Merge(0, "foo", two).IsNotSupported());
+  ASSERT_EQ(Get(0, "foo"), one);
+
+  ASSERT_OK(Put(1, "foo", two));
+  ASSERT_OK(Put(1, "foo", one));
+  ASSERT_OK(Merge(1, "foo", two));
+  ASSERT_EQ(Get(1, "foo"), three);
+
+  ASSERT_OK(Put(2, "foo", two));
+  ASSERT_OK(Put(2, "foo", one));
+  ASSERT_OK(Merge(2, "foo", two));
+  ASSERT_EQ(Get(2, "foo"), one + "," + two);
+  Close();
+}
+
+TEST(ColumnFamilyTest, DifferentCompactionStyles) {
+  Open();
+  CreateColumnFamilies({"one", "two"});
+  ColumnFamilyOptions default_cf, one, two;
+  db_options_.max_open_files = 20;  // only 10 files in file cache
+  db_options_.disableDataSync = true;
+
+  default_cf.compaction_style = kCompactionStyleLevel;
+  default_cf.num_levels = 3;
+  default_cf.write_buffer_size = 64 << 10;  // 64KB
+  default_cf.target_file_size_base = 30 << 10;
+  default_cf.filter_policy = nullptr;
+  default_cf.no_block_cache = true;
+  default_cf.source_compaction_factor = 100;
+  default_cf.disable_seek_compaction = false;
+
+  one.compaction_style = kCompactionStyleUniversal;
+  // trigger compaction if there are >= 4 files
+  one.level0_file_num_compaction_trigger = 4;
+  one.write_buffer_size = 100000;
+
+  two.compaction_style = kCompactionStyleLevel;
+  two.num_levels = 4;
+  two.max_mem_compaction_level = 0;
+  two.level0_file_num_compaction_trigger = 3;
+  two.write_buffer_size = 100000;
+
+  Reopen({default_cf, one, two});
+
+  // SETUP column family "default" - test read compaction
+  ASSERT_EQ("", FilesPerLevel(0));
+  PutRandomData(0, 1, 4096);
+  ASSERT_OK(Flush(0));
+  ASSERT_EQ("0,0,1", FilesPerLevel(0));
+  // write 8MB
+  PutRandomData(0, 2000, 4096);
+  ASSERT_OK(Flush(0));
+  // clear levels 0 and 1
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[0]);
+  dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[0]);
+  ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0);
+  // write some new keys into level 0 and 1
+  PutRandomData(0, 1024, 512);
+  ASSERT_OK(Flush(0));
+  WaitForCompaction();
+  PutRandomData(0, 10, 512);
+  ASSERT_OK(Flush(0));
+  // remember number of files in each level
+  int l1 = NumTableFilesAtLevel(0, 0);
+  int l2 = NumTableFilesAtLevel(1, 0);
+  int l3 = NumTableFilesAtLevel(2, 0);
+  ASSERT_NE(l1, 0);
+  ASSERT_NE(l2, 0);
+  ASSERT_NE(l3, 0);
+
+  // SETUP column family "one" -- universal style
+  for (int i = 0; i < one.level0_file_num_compaction_trigger - 1; ++i) {
+    PutRandomData(1, 11, 10000);
+    WaitForFlush(1);
+    ASSERT_EQ(std::to_string(i + 1), FilesPerLevel(1));
+  }
+
+  // SETUP column family "two" -- level style with 4 levels
+  for (int i = 0; i < two.level0_file_num_compaction_trigger - 1; ++i) {
+    PutRandomData(2, 15, 10000);
+    WaitForFlush(2);
+    ASSERT_EQ(std::to_string(i + 1), FilesPerLevel(2));
+  }
+
+  // TRIGGER compaction "default"
+  // read a bunch of times, trigger read compaction
+  for (int i = 0; i < 200000; ++i) {
+    Get(0, std::to_string(i));
+  }
+
+  // TRIGGER compaction "one"
+  PutRandomData(1, 12, 10000);
+
+  // TRIGGER compaction "two"
+  PutRandomData(2, 10, 10000);
+
+  // WAIT for compactions
+  WaitForCompaction();
+
+  // VERIFY compaction "default"
+  // verify that the number of files have decreased
+  // in some level, indicating that there was a compaction
+  ASSERT_TRUE(NumTableFilesAtLevel(0, 0) < l1 ||
+              NumTableFilesAtLevel(1, 0) < l2 ||
+              NumTableFilesAtLevel(2, 0) < l3);
+
+  // VERIFY compaction "one"
+  ASSERT_EQ("1", FilesPerLevel(1));
+
+  // VERIFY compaction "two"
+  ASSERT_EQ("0,1", FilesPerLevel(2));
+  CompactAll(2);
+  ASSERT_EQ("0,1", FilesPerLevel(2));
+
+  Close();
+}
+
+namespace {
+std::string IterStatus(Iterator* iter) {
+  std::string result;
+  if (iter->Valid()) {
+    result = iter->key().ToString() + "->" + iter->value().ToString();
+  } else {
+    result = "(invalid)";
+  }
+  return result;
+}
+}  // anonymous namespace
+
+TEST(ColumnFamilyTest, NewIteratorsTest) {
+  // iter == 0 -- no tailing
+  // iter == 2 -- tailing
+  for (int iter = 0; iter < 2; ++iter) {
+    Open();
+    CreateColumnFamiliesAndReopen({"one", "two"});
+    ASSERT_OK(Put(0, "a", "b"));
+    ASSERT_OK(Put(1, "b", "a"));
+    ASSERT_OK(Put(2, "c", "m"));
+    ASSERT_OK(Put(2, "v", "t"));
+    std::vector<Iterator*> iterators;
+    ReadOptions options;
+    options.tailing = (iter == 1);
+    ASSERT_OK(db_->NewIterators(options, handles_, &iterators));
+
+    for (auto it : iterators) {
+      it->SeekToFirst();
+    }
+    ASSERT_EQ(IterStatus(iterators[0]), "a->b");
+    ASSERT_EQ(IterStatus(iterators[1]), "b->a");
+    ASSERT_EQ(IterStatus(iterators[2]), "c->m");
+
+    ASSERT_OK(Put(1, "x", "x"));
+
+    for (auto it : iterators) {
+      it->Next();
+    }
+
+    ASSERT_EQ(IterStatus(iterators[0]), "(invalid)");
+    if (iter == 0) {
+      // no tailing
+      ASSERT_EQ(IterStatus(iterators[1]), "(invalid)");
+    } else {
+      // tailing
+      ASSERT_EQ(IterStatus(iterators[1]), "x->x");
+    }
+    ASSERT_EQ(IterStatus(iterators[2]), "v->t");
+
+    for (auto it : iterators) {
+      delete it;
+    }
+    Destroy();
+  }
+}
+
+TEST(ColumnFamilyTest, ReadOnlyDBTest) {
+  Open();
+  CreateColumnFamiliesAndReopen({"one", "two", "three", "four"});
+  ASSERT_OK(Put(1, "foo", "bla"));
+  ASSERT_OK(Put(2, "foo", "blabla"));
+  ASSERT_OK(Put(3, "foo", "blablabla"));
+  ASSERT_OK(Put(4, "foo", "blablablabla"));
+
+  DropColumnFamilies({2});
+  Close();
+  // open only a subset of column families
+  AssertOpenReadOnly({"default", "one", "four"});
+  ASSERT_EQ("NOT_FOUND", Get(0, "foo"));
+  ASSERT_EQ("bla", Get(1, "foo"));
+  ASSERT_EQ("blablablabla", Get(2, "foo"));
+
+  Close();
+  // can't open dropped column family
+  Status s = OpenReadOnly({"default", "one", "two"});
+  ASSERT_TRUE(!s.ok());
+
+  // Can't open without specifying default column family
+  s = OpenReadOnly({"one", "four"});
+  ASSERT_TRUE(!s.ok());
+}
+
+TEST(ColumnFamilyTest, DontRollEmptyLogs) {
+  Open();
+  CreateColumnFamiliesAndReopen({"one", "two", "three", "four"});
+
+  for (size_t i = 0; i < handles_.size(); ++i) {
+    PutRandomData(i, 10, 100);
+  }
+  int num_writable_file_start = env_->GetNumberOfNewWritableFileCalls();
+  // this will trigger the flushes
+  ASSERT_OK(db_->Write(WriteOptions(), nullptr));
+
+  for (int i = 0; i < 4; ++i) {
+    dbfull()->TEST_WaitForFlushMemTable(handles_[i]);
+  }
+  int total_new_writable_files =
+      env_->GetNumberOfNewWritableFileCalls() - num_writable_file_start;
+  ASSERT_EQ(static_cast<size_t>(total_new_writable_files), handles_.size() + 1);
+  Close();
+}
+
+TEST(ColumnFamilyTest, FlushStaleColumnFamilies) {
+  Open();
+  CreateColumnFamilies({"one", "two"});
+  ColumnFamilyOptions default_cf, one, two;
+  default_cf.write_buffer_size = 100000;  // small write buffer size
+  default_cf.disable_auto_compactions = true;
+  one.disable_auto_compactions = true;
+  two.disable_auto_compactions = true;
+  db_options_.max_total_wal_size = 210000;
+
+  Reopen({default_cf, one, two});
+
+  PutRandomData(2, 1, 10);  // 10 bytes
+  for (int i = 0; i < 2; ++i) {
+    PutRandomData(0, 100, 1000);  // flush
+    WaitForFlush(0);
+    ASSERT_EQ(i + 1, CountLiveFiles());
+  }
+  // third flush. now, CF [two] should be detected as stale and flushed
+  // column family 1 should not be flushed since it's empty
+  PutRandomData(0, 100, 1000);  // flush
+  WaitForFlush(0);
+  WaitForFlush(2);
+  // 3 files for default column families, 1 file for column family [two], zero
+  // files for column family [one], because it's empty
+  ASSERT_EQ(4, CountLiveFiles());
+  Close();
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
--- a/db/compaction.cc
+++ b/db/compaction.cc
@@ -0,0 +1,253 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction.h"
+
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+#include <vector>
+
+#include "db/column_family.h"
+#include "util/logging.h"
+
+namespace rocksdb {
+
+static uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
+  uint64_t sum = 0;
+  for (size_t i = 0; i < files.size() && files[i]; i++) {
+    sum += files[i]->file_size;
+  }
+  return sum;
+}
+
+Compaction::Compaction(Version* input_version, int level, int out_level,
+                       uint64_t target_file_size,
+                       uint64_t max_grandparent_overlap_bytes,
+                       bool seek_compaction, bool enable_compression,
+                       bool deletion_compaction)
+    : level_(level),
+      out_level_(out_level),
+      max_output_file_size_(target_file_size),
+      max_grandparent_overlap_bytes_(max_grandparent_overlap_bytes),
+      input_version_(input_version),
+      number_levels_(input_version_->NumberLevels()),
+      cfd_(input_version_->cfd_),
+      seek_compaction_(seek_compaction),
+      enable_compression_(enable_compression),
+      deletion_compaction_(deletion_compaction),
+      grandparent_index_(0),
+      seen_key_(false),
+      overlapped_bytes_(0),
+      base_index_(-1),
+      parent_index_(-1),
+      score_(0),
+      bottommost_level_(false),
+      is_full_compaction_(false),
+      is_manual_compaction_(false),
+      level_ptrs_(std::vector<size_t>(number_levels_)) {
+
+  cfd_->Ref();
+  input_version_->Ref();
+  edit_ = new VersionEdit();
+  edit_->SetColumnFamily(cfd_->GetID());
+  for (int i = 0; i < number_levels_; i++) {
+    level_ptrs_[i] = 0;
+  }
+}
+
+Compaction::~Compaction() {
+  delete edit_;
+  if (input_version_ != nullptr) {
+    input_version_->Unref();
+  }
+  if (cfd_ != nullptr) {
+    if (cfd_->Unref()) {
+      delete cfd_;
+    }
+  }
+}
+
+bool Compaction::IsTrivialMove() const {
+  // Avoid a move if there is lots of overlapping grandparent data.
+  // Otherwise, the move could create a parent file that will require
+  // a very expensive merge later on.
+  // If level_== out_level_, the purpose is to force compaction filter to be
+  // applied to that level, and thus cannot be a trivia move.
+  return (level_ != out_level_ &&
+          num_input_files(0) == 1 &&
+          num_input_files(1) == 0 &&
+          TotalFileSize(grandparents_) <= max_grandparent_overlap_bytes_);
+}
+
+bool Compaction::IsDeletionCompaction() const { return deletion_compaction_; }
+
+void Compaction::AddInputDeletions(VersionEdit* edit) {
+  for (int which = 0; which < 2; which++) {
+    for (size_t i = 0; i < inputs_[which].size(); i++) {
+      edit->DeleteFile(level_ + which, inputs_[which][i]->number);
+    }
+  }
+}
+
+bool Compaction::IsBaseLevelForKey(const Slice& user_key) {
+  assert(cfd_->options()->compaction_style != kCompactionStyleFIFO);
+  if (cfd_->options()->compaction_style == kCompactionStyleUniversal) {
+    return bottommost_level_;
+  }
+  // Maybe use binary search to find right entry instead of linear search?
+  const Comparator* user_cmp = cfd_->user_comparator();
+  for (int lvl = level_ + 2; lvl < number_levels_; lvl++) {
+    const std::vector<FileMetaData*>& files = input_version_->files_[lvl];
+    for (; level_ptrs_[lvl] < files.size(); ) {
+      FileMetaData* f = files[level_ptrs_[lvl]];
+      if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) {
+        // We've advanced far enough
+        if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) {
+          // Key falls in this file's range, so definitely not base level
+          return false;
+        }
+        break;
+      }
+      level_ptrs_[lvl]++;
+    }
+  }
+  return true;
+}
+
+bool Compaction::ShouldStopBefore(const Slice& internal_key) {
+  // Scan to find earliest grandparent file that contains key.
+  const InternalKeyComparator* icmp = &cfd_->internal_comparator();
+  while (grandparent_index_ < grandparents_.size() &&
+      icmp->Compare(internal_key,
+                    grandparents_[grandparent_index_]->largest.Encode()) > 0) {
+    if (seen_key_) {
+      overlapped_bytes_ += grandparents_[grandparent_index_]->file_size;
+    }
+    assert(grandparent_index_ + 1 >= grandparents_.size() ||
+           icmp->Compare(grandparents_[grandparent_index_]->largest.Encode(),
+                         grandparents_[grandparent_index_+1]->smallest.Encode())
+                         < 0);
+    grandparent_index_++;
+  }
+  seen_key_ = true;
+
+  if (overlapped_bytes_ > max_grandparent_overlap_bytes_) {
+    // Too much overlap for current output; start new output
+    overlapped_bytes_ = 0;
+    return true;
+  } else {
+    return false;
+  }
+}
+
+// Mark (or clear) each file that is being compacted
+void Compaction::MarkFilesBeingCompacted(bool value) {
+  for (int i = 0; i < 2; i++) {
+    std::vector<FileMetaData*> v = inputs_[i];
+    for (unsigned int j = 0; j < inputs_[i].size(); j++) {
+      assert(value ? !inputs_[i][j]->being_compacted :
+                      inputs_[i][j]->being_compacted);
+      inputs_[i][j]->being_compacted = value;
+    }
+  }
+}
+
+// Is this compaction producing files at the bottommost level?
+void Compaction::SetupBottomMostLevel(bool isManual) {
+  assert(cfd_->options()->compaction_style != kCompactionStyleFIFO);
+  if (cfd_->options()->compaction_style == kCompactionStyleUniversal) {
+    // If universal compaction style is used and manual
+    // compaction is occuring, then we are guaranteed that
+    // all files will be picked in a single compaction
+    // run. We can safely set bottommost_level_ = true.
+    // If it is not manual compaction, then bottommost_level_
+    // is already set when the Compaction was created.
+    if (isManual) {
+      bottommost_level_ = true;
+    }
+    return;
+  }
+  bottommost_level_ = true;
+  for (int i = output_level() + 1; i < number_levels_; i++) {
+    if (input_version_->NumLevelFiles(i) > 0) {
+      bottommost_level_ = false;
+      break;
+    }
+  }
+}
+
+void Compaction::ReleaseInputs() {
+  if (input_version_ != nullptr) {
+    input_version_->Unref();
+    input_version_ = nullptr;
+  }
+  if (cfd_ != nullptr) {
+    if (cfd_->Unref()) {
+      delete cfd_;
+    }
+    cfd_ = nullptr;
+  }
+}
+
+void Compaction::ReleaseCompactionFiles(Status status) {
+  cfd_->compaction_picker()->ReleaseCompactionFiles(this, status);
+}
+
+void Compaction::ResetNextCompactionIndex() {
+  input_version_->ResetNextCompactionIndex(level_);
+}
+
+namespace {
+int InputSummary(const std::vector<FileMetaData*>& files, char* output,
+                 int len) {
+  *output = '\0';
+  int write = 0;
+  for (unsigned int i = 0; i < files.size(); i++) {
+    int sz = len - write;
+    int ret;
+    char sztxt[16];
+    AppendHumanBytes(files.at(i)->file_size, sztxt, 16);
+    ret = snprintf(output + write, sz, "%" PRIu64 "(%s) ", files.at(i)->number,
+                   sztxt);
+    if (ret < 0 || ret >= sz) break;
+    write += ret;
+  }
+  // if files.size() is non-zero, overwrite the last space
+  return write - !!files.size();
+}
+}  // namespace
+
+void Compaction::Summary(char* output, int len) {
+  int write =
+      snprintf(output, len, "Base version %" PRIu64
+                            " Base level %d, seek compaction:%d, inputs: [",
+               input_version_->GetVersionNumber(), level_, seek_compaction_);
+  if (write < 0 || write >= len) {
+    return;
+  }
+
+  write += InputSummary(inputs_[0], output + write, len - write);
+  if (write < 0 || write >= len) {
+    return;
+  }
+
+  write += snprintf(output + write, len - write, "], [");
+  if (write < 0 || write >= len) {
+    return;
+  }
+
+  write += InputSummary(inputs_[1], output + write, len - write);
+  if (write < 0 || write >= len) {
+    return;
+  }
+
+  snprintf(output + write, len - write, "]");
+}
+
+}  // namespace rocksdb
--- a/db/compaction.h
+++ b/db/compaction.h
@@ -0,0 +1,158 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "db/version_set.h"
+
+namespace rocksdb {
+
+class Version;
+class ColumnFamilyData;
+
+// A Compaction encapsulates information about a compaction.
+class Compaction {
+ public:
+  ~Compaction();
+
+  // Return the level that is being compacted.  Inputs from "level"
+  // will be merged.
+  int level() const { return level_; }
+
+  // Outputs will go to this level
+  int output_level() const { return out_level_; }
+
+  // Return the object that holds the edits to the descriptor done
+  // by this compaction.
+  VersionEdit* edit() { return edit_; }
+
+  // "which" must be either 0 or 1
+  int num_input_files(int which) const { return inputs_[which].size(); }
+
+  // Returns input version of the compaction
+  Version* input_version() const { return input_version_; }
+
+  ColumnFamilyData* column_family_data() const { return cfd_; }
+
+  // Return the ith input file at "level()+which" ("which" must be 0 or 1).
+  FileMetaData* input(int which, int i) const { return inputs_[which][i]; }
+
+  std::vector<FileMetaData*>* inputs(int which) { return &inputs_[which]; }
+
+  // Maximum size of files to build during this compaction.
+  uint64_t MaxOutputFileSize() const { return max_output_file_size_; }
+
+  // Whether compression will be enabled for compaction outputs
+  bool enable_compression() const { return enable_compression_; }
+
+  // Is this a trivial compaction that can be implemented by just
+  // moving a single input file to the next level (no merging or splitting)
+  bool IsTrivialMove() const;
+
+  // If true, just delete all files in inputs_[0]
+  bool IsDeletionCompaction() const;
+
+  // Add all inputs to this compaction as delete operations to *edit.
+  void AddInputDeletions(VersionEdit* edit);
+
+  // Returns true if the information we have available guarantees that
+  // the compaction is producing data in "level+1" for which no data exists
+  // in levels greater than "level+1".
+  bool IsBaseLevelForKey(const Slice& user_key);
+
+  // Returns true iff we should stop building the current output
+  // before processing "internal_key".
+  bool ShouldStopBefore(const Slice& internal_key);
+
+  // Release the input version for the compaction, once the compaction
+  // is successful.
+  void ReleaseInputs();
+
+  // Clear all files to indicate that they are not being compacted
+  // Delete this compaction from the list of running compactions.
+  void ReleaseCompactionFiles(Status status);
+
+  void Summary(char* output, int len);
+
+  // Return the score that was used to pick this compaction run.
+  double score() const { return score_; }
+
+  // Is this compaction creating a file in the bottom most level?
+  bool BottomMostLevel() { return bottommost_level_; }
+
+  // Does this compaction include all sst files?
+  bool IsFullCompaction() { return is_full_compaction_; }
+
+  // Was this compaction triggered manually by the client?
+  bool IsManualCompaction() { return is_manual_compaction_; }
+
+ private:
+  friend class CompactionPicker;
+  friend class UniversalCompactionPicker;
+  friend class FIFOCompactionPicker;
+  friend class LevelCompactionPicker;
+
+  Compaction(Version* input_version, int level, int out_level,
+             uint64_t target_file_size, uint64_t max_grandparent_overlap_bytes,
+             bool seek_compaction = false, bool enable_compression = true,
+             bool deletion_compaction = false);
+
+  int level_;
+  int out_level_; // levels to which output files are stored
+  uint64_t max_output_file_size_;
+  uint64_t max_grandparent_overlap_bytes_;
+  Version* input_version_;
+  VersionEdit* edit_;
+  int number_levels_;
+  ColumnFamilyData* cfd_;
+
+  bool seek_compaction_;
+  bool enable_compression_;
+  // if true, just delete files in inputs_[0]
+  bool deletion_compaction_;
+
+  // Each compaction reads inputs from "level_" and "level_+1"
+  std::vector<FileMetaData*> inputs_[2];      // The two sets of inputs
+
+  // State used to check for number of of overlapping grandparent files
+  // (parent == level_ + 1, grandparent == level_ + 2)
+  std::vector<FileMetaData*> grandparents_;
+  size_t grandparent_index_;  // Index in grandparent_starts_
+  bool seen_key_;             // Some output key has been seen
+  uint64_t overlapped_bytes_;  // Bytes of overlap between current output
+                              // and grandparent files
+  int base_index_;   // index of the file in files_[level_]
+  int parent_index_; // index of some file with same range in files_[level_+1]
+  double score_;     // score that was used to pick this compaction.
+
+  // Is this compaction creating a file in the bottom most level?
+  bool bottommost_level_;
+  // Does this compaction include all sst files?
+  bool is_full_compaction_;
+
+  // Is this compaction requested by the client?
+  bool is_manual_compaction_;
+
+  // level_ptrs_ holds indices into input_version_->levels_: our state
+  // is that we are positioned at one of the file ranges for each
+  // higher level than the ones involved in this compaction (i.e. for
+  // all L >= level_ + 2).
+  std::vector<size_t> level_ptrs_;
+
+  // mark (or clear) all files that are being compacted
+  void MarkFilesBeingCompacted(bool);
+
+  // Initialize whether compaction producing files at the bottommost level
+  void SetupBottomMostLevel(bool isManual);
+
+  // In case of compaction error, reset the nextIndex that is used
+  // to pick up the next file to be compacted from files_by_size_
+  void ResetNextCompactionIndex();
+};
+
+}  // namespace rocksdb
--- a/db/compaction_picker.cc
+++ b/db/compaction_picker.cc
@@ -0,0 +1,960 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction_picker.h"
+
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+#include <limits>
+#include "util/log_buffer.h"
+#include "util/statistics.h"
+
+namespace rocksdb {
+
+namespace {
+
+uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
+  uint64_t sum = 0;
+  for (size_t i = 0; i < files.size() && files[i]; i++) {
+    sum += files[i]->file_size;
+  }
+  return sum;
+}
+
+// Multiple two operands. If they overflow, return op1.
+uint64_t MultiplyCheckOverflow(uint64_t op1, int op2) {
+  if (op1 == 0) {
+    return 0;
+  }
+  if (op2 <= 0) {
+    return op1;
+  }
+  uint64_t casted_op2 = (uint64_t) op2;
+  if (std::numeric_limits<uint64_t>::max() / op1 < casted_op2) {
+    return op1;
+  }
+  return op1 * casted_op2;
+}
+
+}  // anonymous namespace
+
+CompactionPicker::CompactionPicker(const Options* options,
+                                   const InternalKeyComparator* icmp)
+    : compactions_in_progress_(options->num_levels),
+      options_(options),
+      num_levels_(options->num_levels),
+      icmp_(icmp) {
+
+  max_file_size_.reset(new uint64_t[NumberLevels()]);
+  level_max_bytes_.reset(new uint64_t[NumberLevels()]);
+  int target_file_size_multiplier = options_->target_file_size_multiplier;
+  int max_bytes_multiplier = options_->max_bytes_for_level_multiplier;
+  for (int i = 0; i < NumberLevels(); i++) {
+    if (i == 0 && options_->compaction_style == kCompactionStyleUniversal) {
+      max_file_size_[i] = ULLONG_MAX;
+      level_max_bytes_[i] = options_->max_bytes_for_level_base;
+    } else if (i > 1) {
+      max_file_size_[i] = MultiplyCheckOverflow(max_file_size_[i - 1],
+                                                target_file_size_multiplier);
+      level_max_bytes_[i] = MultiplyCheckOverflow(
+          MultiplyCheckOverflow(level_max_bytes_[i - 1], max_bytes_multiplier),
+          options_->max_bytes_for_level_multiplier_additional[i - 1]);
+    } else {
+      max_file_size_[i] = options_->target_file_size_base;
+      level_max_bytes_[i] = options_->max_bytes_for_level_base;
+    }
+  }
+}
+
+CompactionPicker::~CompactionPicker() {}
+
+void CompactionPicker::SizeBeingCompacted(std::vector<uint64_t>& sizes) {
+  for (int level = 0; level < NumberLevels() - 1; level++) {
+    uint64_t total = 0;
+    for (auto c : compactions_in_progress_[level]) {
+      assert(c->level() == level);
+      for (int i = 0; i < c->num_input_files(0); i++) {
+        total += c->input(0,i)->file_size;
+      }
+    }
+    sizes[level] = total;
+  }
+}
+
+// Clear all files to indicate that they are not being compacted
+// Delete this compaction from the list of running compactions.
+void CompactionPicker::ReleaseCompactionFiles(Compaction* c, Status status) {
+  c->MarkFilesBeingCompacted(false);
+  compactions_in_progress_[c->level()].erase(c);
+  if (!status.ok()) {
+    c->ResetNextCompactionIndex();
+  }
+}
+
+uint64_t CompactionPicker::MaxFileSizeForLevel(int level) const {
+  assert(level >= 0);
+  assert(level < NumberLevels());
+  return max_file_size_[level];
+}
+
+uint64_t CompactionPicker::MaxGrandParentOverlapBytes(int level) {
+  uint64_t result = MaxFileSizeForLevel(level);
+  result *= options_->max_grandparent_overlap_factor;
+  return result;
+}
+
+double CompactionPicker::MaxBytesForLevel(int level) {
+  // Note: the result for level zero is not really used since we set
+  // the level-0 compaction threshold based on number of files.
+  assert(level >= 0);
+  assert(level < NumberLevels());
+  return level_max_bytes_[level];
+}
+
+void CompactionPicker::GetRange(const std::vector<FileMetaData*>& inputs,
+                                InternalKey* smallest, InternalKey* largest) {
+  assert(!inputs.empty());
+  smallest->Clear();
+  largest->Clear();
+  for (size_t i = 0; i < inputs.size(); i++) {
+    FileMetaData* f = inputs[i];
+    if (i == 0) {
+      *smallest = f->smallest;
+      *largest = f->largest;
+    } else {
+      if (icmp_->Compare(f->smallest, *smallest) < 0) {
+        *smallest = f->smallest;
+      }
+      if (icmp_->Compare(f->largest, *largest) > 0) {
+        *largest = f->largest;
+      }
+    }
+  }
+}
+
+void CompactionPicker::GetRange(const std::vector<FileMetaData*>& inputs1,
+                                const std::vector<FileMetaData*>& inputs2,
+                                InternalKey* smallest, InternalKey* largest) {
+  std::vector<FileMetaData*> all = inputs1;
+  all.insert(all.end(), inputs2.begin(), inputs2.end());
+  GetRange(all, smallest, largest);
+}
+
+bool CompactionPicker::ExpandWhileOverlapping(Compaction* c) {
+  // If inputs are empty then there is nothing to expand.
+  if (!c || c->inputs_[0].empty()) {
+    return true;
+  }
+
+  // GetOverlappingInputs will always do the right thing for level-0.
+  // So we don't need to do any expansion if level == 0.
+  if (c->level() == 0) {
+    return true;
+  }
+
+  const int level = c->level();
+  InternalKey smallest, largest;
+
+  // Keep expanding c->inputs_[0] until we are sure that there is a
+  // "clean cut" boundary between the files in input and the surrounding files.
+  // This will ensure that no parts of a key are lost during compaction.
+  int hint_index = -1;
+  size_t old_size;
+  do {
+    old_size = c->inputs_[0].size();
+    GetRange(c->inputs_[0], &smallest, &largest);
+    c->inputs_[0].clear();
+    c->input_version_->GetOverlappingInputs(
+        level, &smallest, &largest, &c->inputs_[0], hint_index, &hint_index);
+  } while(c->inputs_[0].size() > old_size);
+
+  // Get the new range
+  GetRange(c->inputs_[0], &smallest, &largest);
+
+  // If, after the expansion, there are files that are already under
+  // compaction, then we must drop/cancel this compaction.
+  int parent_index = -1;
+  if (c->inputs_[0].empty()) {
+    Log(options_->info_log,
+        "[%s] ExpandWhileOverlapping() failure because zero input files",
+        c->column_family_data()->GetName().c_str());
+  }
+  if (c->inputs_[0].empty() || FilesInCompaction(c->inputs_[0]) ||
+      (c->level() != c->output_level() &&
+       ParentRangeInCompaction(c->input_version_, &smallest, &largest, level,
+                               &parent_index))) {
+    c->inputs_[0].clear();
+    c->inputs_[1].clear();
+    return false;
+  }
+  return true;
+}
+
+uint64_t CompactionPicker::ExpandedCompactionByteSizeLimit(int level) {
+  uint64_t result = MaxFileSizeForLevel(level);
+  result *= options_->expanded_compaction_factor;
+  return result;
+}
+
+// Returns true if any one of specified files are being compacted
+bool CompactionPicker::FilesInCompaction(std::vector<FileMetaData*>& files) {
+  for (unsigned int i = 0; i < files.size(); i++) {
+    if (files[i]->being_compacted) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Returns true if any one of the parent files are being compacted
+bool CompactionPicker::ParentRangeInCompaction(Version* version,
+                                               const InternalKey* smallest,
+                                               const InternalKey* largest,
+                                               int level, int* parent_index) {
+  std::vector<FileMetaData*> inputs;
+  assert(level + 1 < NumberLevels());
+
+  version->GetOverlappingInputs(level + 1, smallest, largest, &inputs,
+                                *parent_index, parent_index);
+  return FilesInCompaction(inputs);
+}
+
+// Populates the set of inputs from "level+1" that overlap with "level".
+// Will also attempt to expand "level" if that doesn't expand "level+1"
+// or cause "level" to include a file for compaction that has an overlapping
+// user-key with another file.
+void CompactionPicker::SetupOtherInputs(Compaction* c) {
+  // If inputs are empty, then there is nothing to expand.
+  // If both input and output levels are the same, no need to consider
+  // files at level "level+1"
+  if (c->inputs_[0].empty() || c->level() == c->output_level()) {
+    return;
+  }
+
+  const int level = c->level();
+  InternalKey smallest, largest;
+
+  // Get the range one last time.
+  GetRange(c->inputs_[0], &smallest, &largest);
+
+  // Populate the set of next-level files (inputs_[1]) to include in compaction
+  c->input_version_->GetOverlappingInputs(level + 1, &smallest, &largest,
+                                          &c->inputs_[1], c->parent_index_,
+                                          &c->parent_index_);
+
+  // Get entire range covered by compaction
+  InternalKey all_start, all_limit;
+  GetRange(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
+
+  // See if we can further grow the number of inputs in "level" without
+  // changing the number of "level+1" files we pick up. We also choose NOT
+  // to expand if this would cause "level" to include some entries for some
+  // user key, while excluding other entries for the same user key. This
+  // can happen when one user key spans multiple files.
+  if (!c->inputs_[1].empty()) {
+    std::vector<FileMetaData*> expanded0;
+    c->input_version_->GetOverlappingInputs(
+        level, &all_start, &all_limit, &expanded0, c->base_index_, nullptr);
+    const uint64_t inputs0_size = TotalFileSize(c->inputs_[0]);
+    const uint64_t inputs1_size = TotalFileSize(c->inputs_[1]);
+    const uint64_t expanded0_size = TotalFileSize(expanded0);
+    uint64_t limit = ExpandedCompactionByteSizeLimit(level);
+    if (expanded0.size() > c->inputs_[0].size() &&
+        inputs1_size + expanded0_size < limit &&
+        !FilesInCompaction(expanded0) &&
+        !c->input_version_->HasOverlappingUserKey(&expanded0, level)) {
+      InternalKey new_start, new_limit;
+      GetRange(expanded0, &new_start, &new_limit);
+      std::vector<FileMetaData*> expanded1;
+      c->input_version_->GetOverlappingInputs(level + 1, &new_start, &new_limit,
+                                              &expanded1, c->parent_index_,
+                                              &c->parent_index_);
+      if (expanded1.size() == c->inputs_[1].size() &&
+          !FilesInCompaction(expanded1)) {
+        Log(options_->info_log,
+            "[%s] Expanding@%lu %lu+%lu (%lu+%lu bytes) to %lu+%lu (%lu+%lu "
+            "bytes)\n",
+            c->column_family_data()->GetName().c_str(), (unsigned long)level,
+            (unsigned long)(c->inputs_[0].size()),
+            (unsigned long)(c->inputs_[1].size()), (unsigned long)inputs0_size,
+            (unsigned long)inputs1_size, (unsigned long)(expanded0.size()),
+            (unsigned long)(expanded1.size()), (unsigned long)expanded0_size,
+            (unsigned long)inputs1_size);
+        smallest = new_start;
+        largest = new_limit;
+        c->inputs_[0] = expanded0;
+        c->inputs_[1] = expanded1;
+        GetRange(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
+      }
+    }
+  }
+
+  // Compute the set of grandparent files that overlap this compaction
+  // (parent == level+1; grandparent == level+2)
+  if (level + 2 < NumberLevels()) {
+    c->input_version_->GetOverlappingInputs(level + 2, &all_start, &all_limit,
+                                            &c->grandparents_);
+  }
+}
+
+
+Compaction* CompactionPicker::CompactRange(Version* version, int input_level,
+                                           int output_level,
+                                           const InternalKey* begin,
+                                           const InternalKey* end,
+                                           InternalKey** compaction_end) {
+  // CompactionPickerFIFO has its own implementation of compact range
+  assert(options_->compaction_style != kCompactionStyleFIFO);
+
+  std::vector<FileMetaData*> inputs;
+  bool covering_the_whole_range = true;
+
+  // All files are 'overlapping' in universal style compaction.
+  // We have to compact the entire range in one shot.
+  if (options_->compaction_style == kCompactionStyleUniversal) {
+    begin = nullptr;
+    end = nullptr;
+  }
+  version->GetOverlappingInputs(input_level, begin, end, &inputs);
+  if (inputs.empty()) {
+    return nullptr;
+  }
+
+  // Avoid compacting too much in one shot in case the range is large.
+  // But we cannot do this for level-0 since level-0 files can overlap
+  // and we must not pick one file and drop another older file if the
+  // two files overlap.
+  if (input_level > 0) {
+    const uint64_t limit =
+        MaxFileSizeForLevel(input_level) * options_->source_compaction_factor;
+    uint64_t total = 0;
+    for (size_t i = 0; i + 1 < inputs.size(); ++i) {
+      uint64_t s = inputs[i]->file_size;
+      total += s;
+      if (total >= limit) {
+        **compaction_end = inputs[i + 1]->smallest;
+        covering_the_whole_range = false;
+        inputs.resize(i + 1);
+        break;
+      }
+    }
+  }
+  Compaction* c = new Compaction(version, input_level, output_level,
+                                 MaxFileSizeForLevel(output_level),
+                                 MaxGrandParentOverlapBytes(input_level));
+
+  c->inputs_[0] = inputs;
+  if (ExpandWhileOverlapping(c) == false) {
+    delete c;
+    Log(options_->info_log,
+        "[%s] Could not compact due to expansion failure.\n",
+        version->cfd_->GetName().c_str());
+    return nullptr;
+  }
+
+  SetupOtherInputs(c);
+
+  if (covering_the_whole_range) {
+    *compaction_end = nullptr;
+  }
+
+  // These files that are to be manaully compacted do not trample
+  // upon other files because manual compactions are processed when
+  // the system has a max of 1 background compaction thread.
+  c->MarkFilesBeingCompacted(true);
+
+  // Is this compaction creating a file at the bottommost level
+  c->SetupBottomMostLevel(true);
+
+  c->is_manual_compaction_ = true;
+
+  return c;
+}
+
+Compaction* LevelCompactionPicker::PickCompaction(Version* version,
+                                                  LogBuffer* log_buffer) {
+  Compaction* c = nullptr;
+  int level = -1;
+
+  // Compute the compactions needed. It is better to do it here
+  // and also in LogAndApply(), otherwise the values could be stale.
+  std::vector<uint64_t> size_being_compacted(NumberLevels() - 1);
+  SizeBeingCompacted(size_being_compacted);
+  version->ComputeCompactionScore(size_being_compacted);
+
+  // We prefer compactions triggered by too much data in a level over
+  // the compactions triggered by seeks.
+  //
+  // Find the compactions by size on all levels.
+  for (int i = 0; i < NumberLevels() - 1; i++) {
+    assert(i == 0 ||
+           version->compaction_score_[i] <= version->compaction_score_[i - 1]);
+    level = version->compaction_level_[i];
+    if ((version->compaction_score_[i] >= 1)) {
+      c = PickCompactionBySize(version, level, version->compaction_score_[i]);
+      if (ExpandWhileOverlapping(c) == false) {
+        delete c;
+        c = nullptr;
+      } else {
+        break;
+      }
+    }
+  }
+
+  // Find compactions needed by seeks
+  FileMetaData* f = version->file_to_compact_;
+  if (c == nullptr && f != nullptr && !f->being_compacted) {
+
+    level = version->file_to_compact_level_;
+    int parent_index = -1;
+
+    // Only allow one level 0 compaction at a time.
+    // Do not pick this file if its parents at level+1 are being compacted.
+    if (level != 0 || compactions_in_progress_[0].empty()) {
+      if (!ParentRangeInCompaction(version, &f->smallest, &f->largest, level,
+                                   &parent_index)) {
+        c = new Compaction(version, level, level + 1,
+                           MaxFileSizeForLevel(level + 1),
+                           MaxGrandParentOverlapBytes(level), true);
+        c->inputs_[0].push_back(f);
+        c->parent_index_ = parent_index;
+        c->input_version_->file_to_compact_ = nullptr;
+        if (ExpandWhileOverlapping(c) == false) {
+          return nullptr;
+        }
+      }
+    }
+  }
+
+  if (c == nullptr) {
+    return nullptr;
+  }
+
+  // Two level 0 compaction won't run at the same time, so don't need to worry
+  // about files on level 0 being compacted.
+  if (level == 0) {
+    assert(compactions_in_progress_[0].empty());
+    InternalKey smallest, largest;
+    GetRange(c->inputs_[0], &smallest, &largest);
+    // Note that the next call will discard the file we placed in
+    // c->inputs_[0] earlier and replace it with an overlapping set
+    // which will include the picked file.
+    c->inputs_[0].clear();
+    c->input_version_->GetOverlappingInputs(0, &smallest, &largest,
+                                            &c->inputs_[0]);
+
+    // If we include more L0 files in the same compaction run it can
+    // cause the 'smallest' and 'largest' key to get extended to a
+    // larger range. So, re-invoke GetRange to get the new key range
+    GetRange(c->inputs_[0], &smallest, &largest);
+    if (ParentRangeInCompaction(c->input_version_, &smallest, &largest, level,
+                                &c->parent_index_)) {
+      delete c;
+      return nullptr;
+    }
+    assert(!c->inputs_[0].empty());
+  }
+
+  // Setup "level+1" files (inputs_[1])
+  SetupOtherInputs(c);
+
+  // mark all the files that are being compacted
+  c->MarkFilesBeingCompacted(true);
+
+  // Is this compaction creating a file at the bottommost level
+  c->SetupBottomMostLevel(false);
+
+  // remember this currently undergoing compaction
+  compactions_in_progress_[level].insert(c);
+
+  return c;
+}
+
+Compaction* LevelCompactionPicker::PickCompactionBySize(Version* version,
+                                                        int level,
+                                                        double score) {
+  Compaction* c = nullptr;
+
+  // level 0 files are overlapping. So we cannot pick more
+  // than one concurrent compactions at this level. This
+  // could be made better by looking at key-ranges that are
+  // being compacted at level 0.
+  if (level == 0 && compactions_in_progress_[level].size() == 1) {
+    return nullptr;
+  }
+
+  assert(level >= 0);
+  assert(level + 1 < NumberLevels());
+  c = new Compaction(version, level, level + 1, MaxFileSizeForLevel(level + 1),
+                     MaxGrandParentOverlapBytes(level));
+  c->score_ = score;
+
+  // Pick the largest file in this level that is not already
+  // being compacted
+  std::vector<int>& file_size = c->input_version_->files_by_size_[level];
+
+  // record the first file that is not yet compacted
+  int nextIndex = -1;
+
+  for (unsigned int i = c->input_version_->next_file_to_compact_by_size_[level];
+       i < file_size.size(); i++) {
+    int index = file_size[i];
+    FileMetaData* f = c->input_version_->files_[level][index];
+
+    // check to verify files are arranged in descending size
+    assert((i == file_size.size() - 1) ||
+           (i >= Version::number_of_files_to_sort_ - 1) ||
+           (f->file_size >=
+            c->input_version_->files_[level][file_size[i + 1]]->file_size));
+
+    // do not pick a file to compact if it is being compacted
+    // from n-1 level.
+    if (f->being_compacted) {
+      continue;
+    }
+
+    // remember the startIndex for the next call to PickCompaction
+    if (nextIndex == -1) {
+      nextIndex = i;
+    }
+
+    // Do not pick this file if its parents at level+1 are being compacted.
+    // Maybe we can avoid redoing this work in SetupOtherInputs
+    int parent_index = -1;
+    if (ParentRangeInCompaction(c->input_version_, &f->smallest, &f->largest,
+                                level, &parent_index)) {
+      continue;
+    }
+    c->inputs_[0].push_back(f);
+    c->base_index_ = index;
+    c->parent_index_ = parent_index;
+    break;
+  }
+
+  if (c->inputs_[0].empty()) {
+    delete c;
+    c = nullptr;
+  }
+
+  // store where to start the iteration in the next call to PickCompaction
+  version->next_file_to_compact_by_size_[level] = nextIndex;
+
+  return c;
+}
+
+// Universal style of compaction. Pick files that are contiguous in
+// time-range to compact.
+//
+Compaction* UniversalCompactionPicker::PickCompaction(Version* version,
+                                                      LogBuffer* log_buffer) {
+  int level = 0;
+  double score = version->compaction_score_[0];
+
+  if ((version->files_[level].size() <
+       (unsigned int)options_->level0_file_num_compaction_trigger)) {
+    LogToBuffer(log_buffer, "[%s] Universal: nothing to do\n",
+                version->cfd_->GetName().c_str());
+    return nullptr;
+  }
+  Version::FileSummaryStorage tmp;
+  LogToBuffer(log_buffer, "[%s] Universal: candidate files(%zu): %s\n",
+              version->cfd_->GetName().c_str(), version->files_[level].size(),
+              version->LevelFileSummary(&tmp, 0));
+
+  // Check for size amplification first.
+  Compaction* c;
+  if ((c = PickCompactionUniversalSizeAmp(version, score, log_buffer)) !=
+      nullptr) {
+    LogToBuffer(log_buffer, "[%s] Universal: compacting for size amp\n",
+                version->cfd_->GetName().c_str());
+  } else {
+    // Size amplification is within limits. Try reducing read
+    // amplification while maintaining file size ratios.
+    unsigned int ratio = options_->compaction_options_universal.size_ratio;
+
+    if ((c = PickCompactionUniversalReadAmp(version, score, ratio, UINT_MAX,
+                                            log_buffer)) != nullptr) {
+      LogToBuffer(log_buffer, "[%s] Universal: compacting for size ratio\n",
+                  version->cfd_->GetName().c_str());
+    } else {
+      // Size amplification and file size ratios are within configured limits.
+      // If max read amplification is exceeding configured limits, then force
+      // compaction without looking at filesize ratios and try to reduce
+      // the number of files to fewer than level0_file_num_compaction_trigger.
+      unsigned int num_files = version->files_[level].size() -
+                               options_->level0_file_num_compaction_trigger;
+      if ((c = PickCompactionUniversalReadAmp(
+               version, score, UINT_MAX, num_files, log_buffer)) != nullptr) {
+        LogToBuffer(log_buffer, "[%s] Universal: compacting for file num\n",
+                    version->cfd_->GetName().c_str());
+      }
+    }
+  }
+  if (c == nullptr) {
+    return nullptr;
+  }
+  assert(c->inputs_[0].size() > 1);
+
+  // validate that all the chosen files are non overlapping in time
+  FileMetaData* newerfile __attribute__((unused)) = nullptr;
+  for (unsigned int i = 0; i < c->inputs_[0].size(); i++) {
+    FileMetaData* f = c->inputs_[0][i];
+    assert (f->smallest_seqno <= f->largest_seqno);
+    assert(newerfile == nullptr ||
+           newerfile->smallest_seqno > f->largest_seqno);
+    newerfile = f;
+  }
+
+  // The files are sorted from newest first to oldest last.
+  std::vector<int>& file_by_time = c->input_version_->files_by_size_[level];
+
+  // Is the earliest file part of this compaction?
+  int last_index = file_by_time[file_by_time.size()-1];
+  FileMetaData* last_file = c->input_version_->files_[level][last_index];
+  if (c->inputs_[0][c->inputs_[0].size()-1] == last_file) {
+    c->bottommost_level_ = true;
+  }
+
+  // update statistics
+  MeasureTime(options_->statistics.get(), NUM_FILES_IN_SINGLE_COMPACTION,
+              c->inputs_[0].size());
+
+  // mark all the files that are being compacted
+  c->MarkFilesBeingCompacted(true);
+
+  // remember this currently undergoing compaction
+  compactions_in_progress_[level].insert(c);
+
+  // Record whether this compaction includes all sst files.
+  // For now, it is only relevant in universal compaction mode.
+  c->is_full_compaction_ =
+      (c->inputs_[0].size() == c->input_version_->files_[0].size());
+
+  return c;
+}
+
+//
+// Consider compaction files based on their size differences with
+// the next file in time order.
+//
+Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp(
+    Version* version, double score, unsigned int ratio,
+    unsigned int max_number_of_files_to_compact, LogBuffer* log_buffer) {
+  int level = 0;
+
+  unsigned int min_merge_width =
+    options_->compaction_options_universal.min_merge_width;
+  unsigned int max_merge_width =
+    options_->compaction_options_universal.max_merge_width;
+
+  // The files are sorted from newest first to oldest last.
+  std::vector<int>& file_by_time = version->files_by_size_[level];
+  FileMetaData* f = nullptr;
+  bool done = false;
+  int start_index = 0;
+  unsigned int candidate_count = 0;
+  assert(file_by_time.size() == version->files_[level].size());
+
+  unsigned int max_files_to_compact = std::min(max_merge_width,
+                                       max_number_of_files_to_compact);
+  min_merge_width = std::max(min_merge_width, 2U);
+
+  // Considers a candidate file only if it is smaller than the
+  // total size accumulated so far.
+  for (unsigned int loop = 0; loop < file_by_time.size(); loop++) {
+
+    candidate_count = 0;
+
+    // Skip files that are already being compacted
+    for (f = nullptr; loop < file_by_time.size(); loop++) {
+      int index = file_by_time[loop];
+      f = version->files_[level][index];
+
+      if (!f->being_compacted) {
+        candidate_count = 1;
+        break;
+      }
+      LogToBuffer(
+          log_buffer, "[%s] Universal: file %lu[%d] being compacted, skipping",
+          version->cfd_->GetName().c_str(), (unsigned long)f->number, loop);
+      f = nullptr;
+    }
+
+    // This file is not being compacted. Consider it as the
+    // first candidate to be compacted.
+    uint64_t candidate_size =  f != nullptr? f->file_size : 0;
+    if (f != nullptr) {
+      LogToBuffer(
+          log_buffer, "[%s] Universal: Possible candidate file %lu[%d].",
+          version->cfd_->GetName().c_str(), (unsigned long)f->number, loop);
+    }
+
+    // Check if the suceeding files need compaction.
+    for (unsigned int i = loop+1;
+         candidate_count < max_files_to_compact && i < file_by_time.size();
+         i++) {
+      int index = file_by_time[i];
+      FileMetaData* f = version->files_[level][index];
+      if (f->being_compacted) {
+        break;
+      }
+      // Pick files if the total/last candidate file size (increased by the
+      // specified ratio) is still larger than the next candidate file.
+      // candidate_size is the total size of files picked so far with the
+      // default kCompactionStopStyleTotalSize; with
+      // kCompactionStopStyleSimilarSize, it's simply the size of the last
+      // picked file.
+      uint64_t sz = (candidate_size * (100L + ratio)) /100;
+      if (sz < f->file_size) {
+        break;
+      }
+      if (options_->compaction_options_universal.stop_style == kCompactionStopStyleSimilarSize) {
+        // Similar-size stopping rule: also check the last picked file isn't
+        // far larger than the next candidate file.
+        sz = (f->file_size * (100L + ratio)) / 100;
+        if (sz < candidate_size) {
+          // If the small file we've encountered begins a run of similar-size
+          // files, we'll pick them up on a future iteration of the outer
+          // loop. If it's some lonely straggler, it'll eventually get picked
+          // by the last-resort read amp strategy which disregards size ratios.
+          break;
+        }
+        candidate_size = f->file_size;
+      } else { // default kCompactionStopStyleTotalSize
+        candidate_size += f->file_size;
+      }
+      candidate_count++;
+    }
+
+    // Found a series of consecutive files that need compaction.
+    if (candidate_count >= (unsigned int)min_merge_width) {
+      start_index = loop;
+      done = true;
+      break;
+    } else {
+      for (unsigned int i = loop;
+           i < loop + candidate_count && i < file_by_time.size(); i++) {
+       int index = file_by_time[i];
+       FileMetaData* f = version->files_[level][index];
+       LogToBuffer(log_buffer,
+                   "[%s] Universal: Skipping file %lu[%d] with size %lu %d\n",
+                   version->cfd_->GetName().c_str(), (unsigned long)f->number,
+                   i, (unsigned long)f->file_size, f->being_compacted);
+      }
+    }
+  }
+  if (!done || candidate_count <= 1) {
+    return nullptr;
+  }
+  unsigned int first_index_after = start_index + candidate_count;
+  // Compression is enabled if files compacted earlier already reached
+  // size ratio of compression.
+  bool enable_compression = true;
+  int ratio_to_compress =
+      options_->compaction_options_universal.compression_size_percent;
+  if (ratio_to_compress >= 0) {
+    uint64_t total_size = version->NumLevelBytes(level);
+    uint64_t older_file_size = 0;
+    for (unsigned int i = file_by_time.size() - 1; i >= first_index_after;
+        i--) {
+      older_file_size += version->files_[level][file_by_time[i]]->file_size;
+      if (older_file_size * 100L >= total_size * (long) ratio_to_compress) {
+        enable_compression = false;
+        break;
+      }
+    }
+  }
+  Compaction* c =
+      new Compaction(version, level, level, MaxFileSizeForLevel(level),
+                     LLONG_MAX, false, enable_compression);
+  c->score_ = score;
+
+  for (unsigned int i = start_index; i < first_index_after; i++) {
+    int index = file_by_time[i];
+    FileMetaData* f = c->input_version_->files_[level][index];
+    c->inputs_[0].push_back(f);
+    LogToBuffer(log_buffer,
+                "[%s] Universal: Picking file %lu[%d] with size %lu\n",
+                version->cfd_->GetName().c_str(), (unsigned long)f->number, i,
+                (unsigned long)f->file_size);
+  }
+  return c;
+}
+
+// Look at overall size amplification. If size amplification
+// exceeeds the configured value, then do a compaction
+// of the candidate files all the way upto the earliest
+// base file (overrides configured values of file-size ratios,
+// min_merge_width and max_merge_width).
+//
+Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp(
+    Version* version, double score, LogBuffer* log_buffer) {
+  int level = 0;
+
+  // percentage flexibilty while reducing size amplification
+  uint64_t ratio = options_->compaction_options_universal.
+                     max_size_amplification_percent;
+
+  // The files are sorted from newest first to oldest last.
+  std::vector<int>& file_by_time = version->files_by_size_[level];
+  assert(file_by_time.size() == version->files_[level].size());
+
+  unsigned int candidate_count = 0;
+  uint64_t candidate_size = 0;
+  unsigned int start_index = 0;
+  FileMetaData* f = nullptr;
+
+  // Skip files that are already being compacted
+  for (unsigned int loop = 0; loop < file_by_time.size() - 1; loop++) {
+    int index = file_by_time[loop];
+    f = version->files_[level][index];
+    if (!f->being_compacted) {
+      start_index = loop;         // Consider this as the first candidate.
+      break;
+    }
+    LogToBuffer(log_buffer,
+                "[%s] Universal: skipping file %lu[%d] compacted %s",
+                version->cfd_->GetName().c_str(), (unsigned long)f->number,
+                loop, " cannot be a candidate to reduce size amp.\n");
+    f = nullptr;
+  }
+  if (f == nullptr) {
+    return nullptr;             // no candidate files
+  }
+
+  LogToBuffer(log_buffer, "[%s] Universal: First candidate file %lu[%d] %s",
+              version->cfd_->GetName().c_str(), (unsigned long)f->number,
+              start_index, " to reduce size amp.\n");
+
+  // keep adding up all the remaining files
+  for (unsigned int loop = start_index; loop < file_by_time.size() - 1;
+       loop++) {
+    int index = file_by_time[loop];
+    f = version->files_[level][index];
+    if (f->being_compacted) {
+      LogToBuffer(
+          log_buffer, "[%s] Universal: Possible candidate file %lu[%d] %s.",
+          version->cfd_->GetName().c_str(), (unsigned long)f->number, loop,
+          " is already being compacted. No size amp reduction possible.\n");
+      return nullptr;
+    }
+    candidate_size += f->file_size;
+    candidate_count++;
+  }
+  if (candidate_count == 0) {
+    return nullptr;
+  }
+
+  // size of earliest file
+  int index = file_by_time[file_by_time.size() - 1];
+  uint64_t earliest_file_size = version->files_[level][index]->file_size;
+
+  // size amplification = percentage of additional size
+  if (candidate_size * 100 < ratio * earliest_file_size) {
+    LogToBuffer(
+        log_buffer,
+        "[%s] Universal: size amp not needed. newer-files-total-size %lu "
+        "earliest-file-size %lu",
+        version->cfd_->GetName().c_str(), (unsigned long)candidate_size,
+        (unsigned long)earliest_file_size);
+    return nullptr;
+  } else {
+    LogToBuffer(log_buffer,
+                "[%s] Universal: size amp needed. newer-files-total-size %lu "
+                "earliest-file-size %lu",
+                version->cfd_->GetName().c_str(), (unsigned long)candidate_size,
+                (unsigned long)earliest_file_size);
+  }
+  assert(start_index >= 0 && start_index < file_by_time.size() - 1);
+
+  // create a compaction request
+  // We always compact all the files, so always compress.
+  Compaction* c =
+      new Compaction(version, level, level, MaxFileSizeForLevel(level),
+                     LLONG_MAX, false, true);
+  c->score_ = score;
+  for (unsigned int loop = start_index; loop < file_by_time.size(); loop++) {
+    int index = file_by_time[loop];
+    f = c->input_version_->files_[level][index];
+    c->inputs_[0].push_back(f);
+    LogToBuffer(log_buffer,
+                "[%s] Universal: size amp picking file %lu[%d] with size %lu",
+                version->cfd_->GetName().c_str(), (unsigned long)f->number,
+                index, (unsigned long)f->file_size);
+  }
+  return c;
+}
+
+Compaction* FIFOCompactionPicker::PickCompaction(Version* version,
+                                                 LogBuffer* log_buffer) {
+  assert(version->NumberLevels() == 1);
+  uint64_t total_size = 0;
+  for (const auto& file : version->files_[0]) {
+    total_size += file->file_size;
+  }
+
+  if (total_size <= options_->compaction_options_fifo.max_table_files_size ||
+      version->files_[0].size() == 0) {
+    // total size not exceeded
+    LogToBuffer(log_buffer,
+                "[%s] FIFO compaction: nothing to do. Total size %" PRIu64
+                ", max size %" PRIu64 "\n",
+                version->cfd_->GetName().c_str(), total_size,
+                options_->compaction_options_fifo.max_table_files_size);
+    return nullptr;
+  }
+
+  if (compactions_in_progress_[0].size() > 0) {
+    LogToBuffer(log_buffer,
+                "[%s] FIFO compaction: Already executing compaction. No need "
+                "to run parallel compactions since compactions are very fast",
+                version->cfd_->GetName().c_str());
+    return nullptr;
+  }
+
+  Compaction* c = new Compaction(version, 0, 0, 0, 0, false, false,
+                                 true /* is deletion compaction */);
+  // delete old files (FIFO)
+  for (auto ritr = version->files_[0].rbegin();
+       ritr != version->files_[0].rend(); ++ritr) {
+    auto f = *ritr;
+    total_size -= f->file_size;
+    c->inputs_[0].push_back(f);
+    char tmp_fsize[16];
+    AppendHumanBytes(f->file_size, tmp_fsize, sizeof(tmp_fsize));
+    LogToBuffer(log_buffer, "[%s] FIFO compaction: picking file %" PRIu64
+                            " with size %s for deletion",
+                version->cfd_->GetName().c_str(), f->number, tmp_fsize);
+    if (total_size <= options_->compaction_options_fifo.max_table_files_size) {
+      break;
+    }
+  }
+
+  c->MarkFilesBeingCompacted(true);
+  compactions_in_progress_[0].insert(c);
+
+  return c;
+}
+
+Compaction* FIFOCompactionPicker::CompactRange(Version* version,
+                                               int input_level,
+                                               int output_level,
+                                               const InternalKey* begin,
+                                               const InternalKey* end,
+                                               InternalKey** compaction_end) {
+  assert(input_level == 0);
+  assert(output_level == 0);
+  *compaction_end = nullptr;
+  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, options_->info_log.get());
+  auto c = PickCompaction(version, &log_buffer);
+  log_buffer.FlushBufferToLog();
+  return c;
+}
+
+}  // namespace rocksdb
--- a/db/compaction_picker.h
+++ b/db/compaction_picker.h
@@ -0,0 +1,181 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "db/version_set.h"
+#include "db/compaction.h"
+#include "rocksdb/status.h"
+#include "rocksdb/options.h"
+#include "rocksdb/env.h"
+
+#include <vector>
+#include <memory>
+#include <set>
+
+namespace rocksdb {
+
+class LogBuffer;
+class Compaction;
+class Version;
+
+class CompactionPicker {
+ public:
+  CompactionPicker(const Options* options, const InternalKeyComparator* icmp);
+  virtual ~CompactionPicker();
+
+  // Pick level and inputs for a new compaction.
+  // Returns nullptr if there is no compaction to be done.
+  // Otherwise returns a pointer to a heap-allocated object that
+  // describes the compaction.  Caller should delete the result.
+  virtual Compaction* PickCompaction(Version* version,
+                                     LogBuffer* log_buffer) = 0;
+
+  // Return a compaction object for compacting the range [begin,end] in
+  // the specified level.  Returns nullptr if there is nothing in that
+  // level that overlaps the specified range.  Caller should delete
+  // the result.
+  //
+  // The returned Compaction might not include the whole requested range.
+  // In that case, compaction_end will be set to the next key that needs
+  // compacting. In case the compaction will compact the whole range,
+  // compaction_end will be set to nullptr.
+  // Client is responsible for compaction_end storage -- when called,
+  // *compaction_end should point to valid InternalKey!
+  virtual Compaction* CompactRange(Version* version, int input_level,
+                                   int output_level, const InternalKey* begin,
+                                   const InternalKey* end,
+                                   InternalKey** compaction_end);
+
+  // Free up the files that participated in a compaction
+  void ReleaseCompactionFiles(Compaction* c, Status status);
+
+  // Return the total amount of data that is undergoing
+  // compactions per level
+  void SizeBeingCompacted(std::vector<uint64_t>& sizes);
+
+  // Returns maximum total overlap bytes with grandparent
+  // level (i.e., level+2) before we stop building a single
+  // file in level->level+1 compaction.
+  uint64_t MaxGrandParentOverlapBytes(int level);
+
+  // Returns maximum total bytes of data on a given level.
+  double MaxBytesForLevel(int level);
+
+  // Get the max file size in a given level.
+  uint64_t MaxFileSizeForLevel(int level) const;
+
+ protected:
+  int NumberLevels() const { return num_levels_; }
+
+  // Stores the minimal range that covers all entries in inputs in
+  // *smallest, *largest.
+  // REQUIRES: inputs is not empty
+  void GetRange(const std::vector<FileMetaData*>& inputs, InternalKey* smallest,
+                InternalKey* largest);
+
+  // Stores the minimal range that covers all entries in inputs1 and inputs2
+  // in *smallest, *largest.
+  // REQUIRES: inputs is not empty
+  void GetRange(const std::vector<FileMetaData*>& inputs1,
+                const std::vector<FileMetaData*>& inputs2,
+                InternalKey* smallest, InternalKey* largest);
+
+  // Add more files to the inputs on "level" to make sure that
+  // no newer version of a key is compacted to "level+1" while leaving an older
+  // version in a "level". Otherwise, any Get() will search "level" first,
+  // and will likely return an old/stale value for the key, since it always
+  // searches in increasing order of level to find the value. This could
+  // also scramble the order of merge operands. This function should be
+  // called any time a new Compaction is created, and its inputs_[0] are
+  // populated.
+  //
+  // Will return false if it is impossible to apply this compaction.
+  bool ExpandWhileOverlapping(Compaction* c);
+
+  uint64_t ExpandedCompactionByteSizeLimit(int level);
+
+  // Returns true if any one of the specified files are being compacted
+  bool FilesInCompaction(std::vector<FileMetaData*>& files);
+
+  // Returns true if any one of the parent files are being compacted
+  bool ParentRangeInCompaction(Version* version, const InternalKey* smallest,
+                               const InternalKey* largest, int level,
+                               int* index);
+
+  void SetupOtherInputs(Compaction* c);
+
+  // record all the ongoing compactions for all levels
+  std::vector<std::set<Compaction*>> compactions_in_progress_;
+
+  // Per-level target file size.
+  std::unique_ptr<uint64_t[]> max_file_size_;
+
+  // Per-level max bytes
+  std::unique_ptr<uint64_t[]> level_max_bytes_;
+
+  const Options* const options_;
+
+ private:
+  int num_levels_;
+
+  const InternalKeyComparator* const icmp_;
+};
+
+class UniversalCompactionPicker : public CompactionPicker {
+ public:
+  UniversalCompactionPicker(const Options* options,
+                            const InternalKeyComparator* icmp)
+      : CompactionPicker(options, icmp) {}
+  virtual Compaction* PickCompaction(Version* version,
+                                     LogBuffer* log_buffer) override;
+
+ private:
+  // Pick Universal compaction to limit read amplification
+  Compaction* PickCompactionUniversalReadAmp(Version* version, double score,
+                                             unsigned int ratio,
+                                             unsigned int num_files,
+                                             LogBuffer* log_buffer);
+
+  // Pick Universal compaction to limit space amplification.
+  Compaction* PickCompactionUniversalSizeAmp(Version* version, double score,
+                                             LogBuffer* log_buffer);
+};
+
+class LevelCompactionPicker : public CompactionPicker {
+ public:
+  LevelCompactionPicker(const Options* options,
+                        const InternalKeyComparator* icmp)
+      : CompactionPicker(options, icmp) {}
+  virtual Compaction* PickCompaction(Version* version,
+                                     LogBuffer* log_buffer) override;
+
+ private:
+  // For the specfied level, pick a compaction.
+  // Returns nullptr if there is no compaction to be done.
+  // If level is 0 and there is already a compaction on that level, this
+  // function will return nullptr.
+  Compaction* PickCompactionBySize(Version* version, int level, double score);
+};
+
+class FIFOCompactionPicker : public CompactionPicker {
+ public:
+  FIFOCompactionPicker(const Options* options,
+                       const InternalKeyComparator* icmp)
+      : CompactionPicker(options, icmp) {}
+
+  virtual Compaction* PickCompaction(Version* version,
+                                     LogBuffer* log_buffer) override;
+
+  virtual Compaction* CompactRange(Version* version, int input_level,
+                                   int output_level, const InternalKey* begin,
+                                   const InternalKey* end,
+                                   InternalKey** compaction_end) override;
+};
+
+}  // namespace rocksdb
--- a/db/corruption_test.cc
+++ b/db/corruption_test.cc
@@ -0,0 +1,440 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/db.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include "rocksdb/cache.h"
+#include "rocksdb/env.h"
+#include "rocksdb/table.h"
+#include "rocksdb/write_batch.h"
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/log_format.h"
+#include "db/version_set.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+static const int kValueSize = 1000;
+
+class CorruptionTest {
+ public:
+  test::ErrorEnv env_;
+  std::string dbname_;
+  shared_ptr<Cache> tiny_cache_;
+  Options options_;
+  DB* db_;
+
+  CorruptionTest() {
+    tiny_cache_ = NewLRUCache(100);
+    options_.env = &env_;
+    dbname_ = test::TmpDir() + "/corruption_test";
+    DestroyDB(dbname_, options_);
+
+    db_ = nullptr;
+    options_.create_if_missing = true;
+    options_.block_size_deviation = 0; // make unit test pass for now
+    Reopen();
+    options_.create_if_missing = false;
+  }
+
+  ~CorruptionTest() {
+     delete db_;
+     DestroyDB(dbname_, Options());
+  }
+
+  Status TryReopen(Options* options = nullptr) {
+    delete db_;
+    db_ = nullptr;
+    Options opt = (options ? *options : options_);
+    opt.env = &env_;
+    opt.block_cache = tiny_cache_;
+    opt.block_size_deviation = 0;
+    opt.arena_block_size = 4096;
+    return DB::Open(opt, dbname_, &db_);
+  }
+
+  void Reopen(Options* options = nullptr) {
+    ASSERT_OK(TryReopen(options));
+  }
+
+  void RepairDB() {
+    delete db_;
+    db_ = nullptr;
+    ASSERT_OK(::rocksdb::RepairDB(dbname_, options_));
+  }
+
+  void Build(int n) {
+    std::string key_space, value_space;
+    WriteBatch batch;
+    for (int i = 0; i < n; i++) {
+      //if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n);
+      Slice key = Key(i, &key_space);
+      batch.Clear();
+      batch.Put(key, Value(i, &value_space));
+      ASSERT_OK(db_->Write(WriteOptions(), &batch));
+    }
+  }
+
+  void Check(int min_expected, int max_expected) {
+    unsigned int next_expected = 0;
+    int missed = 0;
+    int bad_keys = 0;
+    int bad_values = 0;
+    int correct = 0;
+    std::string value_space;
+    // Do not verify checksums. If we verify checksums then the
+    // db itself will raise errors because data is corrupted.
+    // Instead, we want the reads to be successful and this test
+    // will detect whether the appropriate corruptions have
+    // occured.
+    Iterator* iter = db_->NewIterator(ReadOptions(false, true));
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      uint64_t key;
+      Slice in(iter->key());
+      if (!ConsumeDecimalNumber(&in, &key) ||
+          !in.empty() ||
+          key < next_expected) {
+        bad_keys++;
+        continue;
+      }
+      missed += (key - next_expected);
+      next_expected = key + 1;
+      if (iter->value() != Value(key, &value_space)) {
+        bad_values++;
+      } else {
+        correct++;
+      }
+    }
+    delete iter;
+
+    fprintf(stderr,
+            "expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%d\n",
+            min_expected, max_expected, correct, bad_keys, bad_values, missed);
+    ASSERT_LE(min_expected, correct);
+    ASSERT_GE(max_expected, correct);
+  }
+
+  void CorruptFile(const std::string fname, int offset, int bytes_to_corrupt) {
+    struct stat sbuf;
+    if (stat(fname.c_str(), &sbuf) != 0) {
+      const char* msg = strerror(errno);
+      ASSERT_TRUE(false) << fname << ": " << msg;
+    }
+
+    if (offset < 0) {
+      // Relative to end of file; make it absolute
+      if (-offset > sbuf.st_size) {
+        offset = 0;
+      } else {
+        offset = sbuf.st_size + offset;
+      }
+    }
+    if (offset > sbuf.st_size) {
+      offset = sbuf.st_size;
+    }
+    if (offset + bytes_to_corrupt > sbuf.st_size) {
+      bytes_to_corrupt = sbuf.st_size - offset;
+    }
+
+    // Do it
+    std::string contents;
+    Status s = ReadFileToString(Env::Default(), fname, &contents);
+    ASSERT_TRUE(s.ok()) << s.ToString();
+    for (int i = 0; i < bytes_to_corrupt; i++) {
+      contents[i + offset] ^= 0x80;
+    }
+    s = WriteStringToFile(Env::Default(), contents, fname);
+    ASSERT_TRUE(s.ok()) << s.ToString();
+  }
+
+  void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) {
+    // Pick file to corrupt
+    std::vector<std::string> filenames;
+    ASSERT_OK(env_.GetChildren(dbname_, &filenames));
+    uint64_t number;
+    FileType type;
+    std::string fname;
+    int picked_number = -1;
+    for (unsigned int i = 0; i < filenames.size(); i++) {
+      if (ParseFileName(filenames[i], &number, &type) &&
+          type == filetype &&
+          static_cast<int>(number) > picked_number) {  // Pick latest file
+        fname = dbname_ + "/" + filenames[i];
+        picked_number = number;
+      }
+    }
+    ASSERT_TRUE(!fname.empty()) << filetype;
+
+    CorruptFile(fname, offset, bytes_to_corrupt);
+  }
+
+  // corrupts exactly one file at level `level`. if no file found at level,
+  // asserts
+  void CorruptTableFileAtLevel(int level, int offset, int bytes_to_corrupt) {
+    std::vector<LiveFileMetaData> metadata;
+    db_->GetLiveFilesMetaData(&metadata);
+    for (const auto& m : metadata) {
+      if (m.level == level) {
+        CorruptFile(dbname_ + "/" + m.name, offset, bytes_to_corrupt);
+        return;
+      }
+    }
+    ASSERT_TRUE(false) << "no file found at level";
+  }
+
+
+  int Property(const std::string& name) {
+    std::string property;
+    int result;
+    if (db_->GetProperty(name, &property) &&
+        sscanf(property.c_str(), "%d", &result) == 1) {
+      return result;
+    } else {
+      return -1;
+    }
+  }
+
+  // Return the ith key
+  Slice Key(int i, std::string* storage) {
+    char buf[100];
+    snprintf(buf, sizeof(buf), "%016d", i);
+    storage->assign(buf, strlen(buf));
+    return Slice(*storage);
+  }
+
+  // Return the value to associate with the specified key
+  Slice Value(int k, std::string* storage) {
+    Random r(k);
+    return test::RandomString(&r, kValueSize, storage);
+  }
+};
+
+TEST(CorruptionTest, Recovery) {
+  Build(100);
+  Check(100, 100);
+  Corrupt(kLogFile, 19, 1);      // WriteBatch tag for first record
+  Corrupt(kLogFile, log::kBlockSize + 1000, 1);  // Somewhere in second block
+  Reopen();
+
+  // The 64 records in the first two log blocks are completely lost.
+  Check(36, 36);
+}
+
+TEST(CorruptionTest, RecoverWriteError) {
+  env_.writable_file_error_ = true;
+  Status s = TryReopen();
+  ASSERT_TRUE(!s.ok());
+}
+
+TEST(CorruptionTest, NewFileErrorDuringWrite) {
+  // Do enough writing to force minor compaction
+  env_.writable_file_error_ = true;
+  const int num = 3 + (Options().write_buffer_size / kValueSize);
+  std::string value_storage;
+  Status s;
+  bool failed = false;
+  for (int i = 0; i < num; i++) {
+    WriteBatch batch;
+    batch.Put("a", Value(100, &value_storage));
+    s = db_->Write(WriteOptions(), &batch);
+    if (!s.ok()) {
+      failed = true;
+    }
+    ASSERT_TRUE(!failed || !s.ok());
+  }
+  ASSERT_TRUE(!s.ok());
+  ASSERT_GE(env_.num_writable_file_errors_, 1);
+  env_.writable_file_error_ = false;
+  Reopen();
+}
+
+TEST(CorruptionTest, TableFile) {
+  Build(100);
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+  dbi->TEST_FlushMemTable();
+  dbi->TEST_CompactRange(0, nullptr, nullptr);
+  dbi->TEST_CompactRange(1, nullptr, nullptr);
+
+  Corrupt(kTableFile, 100, 1);
+  Check(99, 99);
+}
+
+TEST(CorruptionTest, TableFileIndexData) {
+  Build(10000);  // Enough to build multiple Tables
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+  dbi->TEST_FlushMemTable();
+
+  Corrupt(kTableFile, -2000, 500);
+  Reopen();
+  Check(5000, 9999);
+}
+
+TEST(CorruptionTest, MissingDescriptor) {
+  Build(1000);
+  RepairDB();
+  Reopen();
+  Check(1000, 1000);
+}
+
+TEST(CorruptionTest, SequenceNumberRecovery) {
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3"));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4"));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5"));
+  RepairDB();
+  Reopen();
+  std::string v;
+  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+  ASSERT_EQ("v5", v);
+  // Write something.  If sequence number was not recovered properly,
+  // it will be hidden by an earlier write.
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6"));
+  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+  ASSERT_EQ("v6", v);
+  Reopen();
+  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+  ASSERT_EQ("v6", v);
+}
+
+TEST(CorruptionTest, CorruptedDescriptor) {
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello"));
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+  dbi->TEST_FlushMemTable();
+  dbi->TEST_CompactRange(0, nullptr, nullptr);
+
+  Corrupt(kDescriptorFile, 0, 1000);
+  Status s = TryReopen();
+  ASSERT_TRUE(!s.ok());
+
+  RepairDB();
+  Reopen();
+  std::string v;
+  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+  ASSERT_EQ("hello", v);
+}
+
+TEST(CorruptionTest, CompactionInputError) {
+  Build(10);
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+  dbi->TEST_FlushMemTable();
+  const int last = dbi->MaxMemCompactionLevel();
+  ASSERT_EQ(1, Property("rocksdb.num-files-at-level" + NumberToString(last)));
+
+  Corrupt(kTableFile, 100, 1);
+  Check(9, 9);
+
+  // Force compactions by writing lots of values
+  Build(10000);
+  Check(10000, 10000);
+}
+
+TEST(CorruptionTest, CompactionInputErrorParanoid) {
+  Options options;
+  options.paranoid_checks = true;
+  options.write_buffer_size = 131072;
+  options.max_write_buffer_number = 2;
+  Reopen(&options);
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+
+  // Fill levels >= 1 so memtable flush outputs to level 0
+  for (int level = 1; level < dbi->NumberLevels(); level++) {
+    dbi->Put(WriteOptions(), "", "begin");
+    dbi->Put(WriteOptions(), "~", "end");
+    dbi->TEST_FlushMemTable();
+  }
+
+  options.max_mem_compaction_level = 0;
+  Reopen(&options);
+
+  dbi = reinterpret_cast<DBImpl*>(db_);
+  Build(10);
+  dbi->TEST_FlushMemTable();
+  dbi->TEST_WaitForCompact();
+  ASSERT_EQ(1, Property("rocksdb.num-files-at-level0"));
+
+  CorruptTableFileAtLevel(0, 100, 1);
+  Check(9, 9);
+
+  // Write must eventually fail because of corrupted table
+  Status s;
+  std::string tmp1, tmp2;
+  bool failed = false;
+  for (int i = 0; i < 10000; i++) {
+    s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2));
+    if (!s.ok()) {
+      failed = true;
+    }
+    // if one write failed, every subsequent write must fail, too
+    ASSERT_TRUE(!failed || !s.ok()) << "write did not fail in a corrupted db";
+  }
+  ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db";
+}
+
+TEST(CorruptionTest, UnrelatedKeys) {
+  Build(10);
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+  dbi->TEST_FlushMemTable();
+  Corrupt(kTableFile, 100, 1);
+
+  std::string tmp1, tmp2;
+  ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2)));
+  std::string v;
+  ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
+  ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
+  dbi->TEST_FlushMemTable();
+  ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
+  ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
+}
+
+TEST(CorruptionTest, FileSystemStateCorrupted) {
+  for (int iter = 0; iter < 2; ++iter) {
+    Options options;
+    options.paranoid_checks = true;
+    options.create_if_missing = true;
+    Reopen(&options);
+    Build(10);
+    ASSERT_OK(db_->Flush(FlushOptions()));
+    DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+    std::vector<LiveFileMetaData> metadata;
+    dbi->GetLiveFilesMetaData(&metadata);
+    ASSERT_GT(metadata.size(), size_t(0));
+    std::string filename = dbname_ + metadata[0].name;
+
+    delete db_;
+    db_ = nullptr;
+
+    if (iter == 0) {  // corrupt file size
+      unique_ptr<WritableFile> file;
+      env_.NewWritableFile(filename, &file, EnvOptions());
+      file->Append(Slice("corrupted sst"));
+      file.reset();
+    } else {  // delete the file
+      env_.DeleteFile(filename);
+    }
+
+    Status x = TryReopen(&options);
+    ASSERT_TRUE(x.IsCorruption());
+    DestroyDB(dbname_, options_);
+    Reopen(&options);
+  }
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
--- a/db/db_bench.cc
+++ b/db/db_bench.cc
--- a/db/db_filesnapshot.cc
+++ b/db/db_filesnapshot.cc
@@ -0,0 +1,179 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 Facebook.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef ROCKSDB_LITE
+
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+#include <algorithm>
+#include <string>
+#include <stdint.h>
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/version_set.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "port/port.h"
+#include "util/mutexlock.h"
+#include "util/sync_point.h"
+
+namespace rocksdb {
+
+Status DBImpl::DisableFileDeletions() {
+  MutexLock l(&mutex_);
+  ++disable_delete_obsolete_files_;
+  if (disable_delete_obsolete_files_ == 1) {
+    Log(options_.info_log, "File Deletions Disabled");
+  } else {
+    Log(options_.info_log,
+        "File Deletions Disabled, but already disabled. Counter: %d",
+        disable_delete_obsolete_files_);
+  }
+  return Status::OK();
+}
+
+Status DBImpl::EnableFileDeletions(bool force) {
+  DeletionState deletion_state;
+  bool should_purge_files = false;
+  {
+    MutexLock l(&mutex_);
+    if (force) {
+      // if force, we need to enable file deletions right away
+      disable_delete_obsolete_files_ = 0;
+    } else if (disable_delete_obsolete_files_ > 0) {
+      --disable_delete_obsolete_files_;
+    }
+    if (disable_delete_obsolete_files_ == 0)  {
+      Log(options_.info_log, "File Deletions Enabled");
+      should_purge_files = true;
+      FindObsoleteFiles(deletion_state, true);
+    } else {
+      Log(options_.info_log,
+          "File Deletions Enable, but not really enabled. Counter: %d",
+          disable_delete_obsolete_files_);
+    }
+  }
+  if (should_purge_files)  {
+    PurgeObsoleteFiles(deletion_state);
+  }
+  LogFlush(options_.info_log);
+  return Status::OK();
+}
+
+Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
+                            uint64_t* manifest_file_size,
+                            bool flush_memtable) {
+
+  *manifest_file_size = 0;
+
+  mutex_.Lock();
+
+  if (flush_memtable) {
+    // flush all dirty data to disk.
+    Status status;
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      cfd->Ref();
+      mutex_.Unlock();
+      status = FlushMemTable(cfd, FlushOptions());
+      mutex_.Lock();
+      cfd->Unref();
+      if (!status.ok()) {
+        break;
+      }
+    }
+    versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
+
+    if (!status.ok()) {
+      mutex_.Unlock();
+      Log(options_.info_log, "Cannot Flush data %s\n",
+          status.ToString().c_str());
+      return status;
+    }
+  }
+
+  // Make a set of all of the live *.sst files
+  std::set<uint64_t> live;
+  for (auto cfd : *versions_->GetColumnFamilySet()) {
+    cfd->current()->AddLiveFiles(&live);
+  }
+
+  ret.clear();
+  ret.reserve(live.size() + 2); //*.sst + CURRENT + MANIFEST
+
+  // create names of the live files. The names are not absolute
+  // paths, instead they are relative to dbname_;
+  for (auto live_file : live) {
+    ret.push_back(TableFileName("", live_file));
+  }
+
+  ret.push_back(CurrentFileName(""));
+  ret.push_back(DescriptorFileName("", versions_->ManifestFileNumber()));
+
+  // find length of manifest file while holding the mutex lock
+  *manifest_file_size = versions_->ManifestFileSize();
+
+  mutex_.Unlock();
+  return Status::OK();
+}
+
+Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) {
+  // First get sorted files in db dir, then get sorted files from archived
+  // dir, to avoid a race condition where a log file is moved to archived
+  // dir in between.
+  Status s;
+  // list wal files in main db dir.
+  VectorLogPtr logs;
+  s = GetSortedWalsOfType(options_.wal_dir, logs, kAliveLogFile);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Reproduce the race condition where a log file is moved
+  // to archived dir, between these two sync points, used in
+  // (DBTest,TransactionLogIteratorRace)
+  TEST_SYNC_POINT("DBImpl::GetSortedWalFiles:1");
+  TEST_SYNC_POINT("DBImpl::GetSortedWalFiles:2");
+
+  files.clear();
+  // list wal files in archive dir.
+  std::string archivedir = ArchivalDirectory(options_.wal_dir);
+  if (env_->FileExists(archivedir)) {
+    s = GetSortedWalsOfType(archivedir, files, kArchivedLogFile);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  uint64_t latest_archived_log_number = 0;
+  if (!files.empty()) {
+    latest_archived_log_number = files.back()->LogNumber();
+    Log(options_.info_log, "Latest Archived log: %" PRIu64,
+        latest_archived_log_number);
+  }
+
+  files.reserve(files.size() + logs.size());
+  for (auto& log : logs) {
+    if (log->LogNumber() > latest_archived_log_number) {
+      files.push_back(std::move(log));
+    } else {
+      // When the race condition happens, we could see the
+      // same log in both db dir and archived dir. Simply
+      // ignore the one in db dir. Note that, if we read
+      // archived dir first, we would have missed the log file.
+      Log(options_.info_log, "%s already moved to archive",
+          log->PathName().c_str());
+    }
+  }
+
+  return s;
+}
+
+}
+
+#endif  // ROCKSDB_LITE
--- a/db/db_impl.cc
+++ b/db/db_impl.cc
--- a/db/db_impl.h
+++ b/db/db_impl.h
@@ -0,0 +1,635 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include <set>
+#include <utility>
+#include <vector>
+#include <string>
+
+#include "db/dbformat.h"
+#include "db/log_writer.h"
+#include "db/snapshot.h"
+#include "db/column_family.h"
+#include "db/version_edit.h"
+#include "memtable_list.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/transaction_log.h"
+#include "util/autovector.h"
+#include "util/stats_logger.h"
+#include "util/thread_local.h"
+#include "db/internal_stats.h"
+
+namespace rocksdb {
+
+class MemTable;
+class TableCache;
+class Version;
+class VersionEdit;
+class VersionSet;
+class CompactionFilterV2;
+class Arena;
+
+class DBImpl : public DB {
+ public:
+  DBImpl(const DBOptions& options, const std::string& dbname);
+  virtual ~DBImpl();
+
+  // Implementations of the DB interface
+  using DB::Put;
+  virtual Status Put(const WriteOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& value);
+  using DB::Merge;
+  virtual Status Merge(const WriteOptions& options,
+                       ColumnFamilyHandle* column_family, const Slice& key,
+                       const Slice& value);
+  using DB::Delete;
+  virtual Status Delete(const WriteOptions& options,
+                        ColumnFamilyHandle* column_family, const Slice& key);
+  using DB::Write;
+  virtual Status Write(const WriteOptions& options, WriteBatch* updates);
+  using DB::Get;
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     std::string* value);
+  using DB::MultiGet;
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys, std::vector<std::string>* values);
+
+  virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
+                                    const std::string& column_family,
+                                    ColumnFamilyHandle** handle);
+  virtual Status DropColumnFamily(ColumnFamilyHandle* column_family);
+
+  // Returns false if key doesn't exist in the database and true if it may.
+  // If value_found is not passed in as null, then return the value if found in
+  // memory. On return, if value was found, then value_found will be set to true
+  // , otherwise false.
+  using DB::KeyMayExist;
+  virtual bool KeyMayExist(const ReadOptions& options,
+                           ColumnFamilyHandle* column_family, const Slice& key,
+                           std::string* value, bool* value_found = nullptr);
+  using DB::NewIterator;
+  virtual Iterator* NewIterator(const ReadOptions& options,
+                                ColumnFamilyHandle* column_family);
+  virtual Status NewIterators(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_families,
+      std::vector<Iterator*>* iterators);
+  virtual const Snapshot* GetSnapshot();
+  virtual void ReleaseSnapshot(const Snapshot* snapshot);
+  using DB::GetProperty;
+  virtual bool GetProperty(ColumnFamilyHandle* column_family,
+                           const Slice& property, std::string* value);
+  using DB::GetApproximateSizes;
+  virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
+                                   const Range* range, int n, uint64_t* sizes);
+  using DB::CompactRange;
+  virtual Status CompactRange(ColumnFamilyHandle* column_family,
+                              const Slice* begin, const Slice* end,
+                              bool reduce_level = false, int target_level = -1);
+
+  using DB::NumberLevels;
+  virtual int NumberLevels(ColumnFamilyHandle* column_family);
+  using DB::MaxMemCompactionLevel;
+  virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family);
+  using DB::Level0StopWriteTrigger;
+  virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family);
+  virtual const std::string& GetName() const;
+  virtual Env* GetEnv() const;
+  using DB::GetOptions;
+  virtual const Options& GetOptions(ColumnFamilyHandle* column_family) const;
+  using DB::Flush;
+  virtual Status Flush(const FlushOptions& options,
+                       ColumnFamilyHandle* column_family);
+
+  virtual SequenceNumber GetLatestSequenceNumber() const;
+
+#ifndef ROCKSDB_LITE
+  virtual Status DisableFileDeletions();
+  virtual Status EnableFileDeletions(bool force);
+  // All the returned filenames start with "/"
+  virtual Status GetLiveFiles(std::vector<std::string>&,
+                              uint64_t* manifest_file_size,
+                              bool flush_memtable = true);
+  virtual Status GetSortedWalFiles(VectorLogPtr& files);
+
+  virtual Status GetUpdatesSince(
+      SequenceNumber seq_number, unique_ptr<TransactionLogIterator>* iter,
+      const TransactionLogIterator::ReadOptions&
+          read_options = TransactionLogIterator::ReadOptions());
+  virtual Status DeleteFile(std::string name);
+
+  virtual void GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata);
+#endif  // ROCKSDB_LITE
+
+  // checks if all live files exist on file system and that their file sizes
+  // match to our in-memory records
+  virtual Status CheckConsistency();
+
+  virtual Status GetDbIdentity(std::string& identity);
+
+  Status RunManualCompaction(ColumnFamilyData* cfd, int input_level,
+                             int output_level, const Slice* begin,
+                             const Slice* end);
+
+#ifndef ROCKSDB_LITE
+  // Extra methods (for testing) that are not in the public DB interface
+  // Implemented in db_impl_debug.cc
+
+  // Compact any files in the named level that overlap [*begin, *end]
+  Status TEST_CompactRange(int level, const Slice* begin, const Slice* end,
+                           ColumnFamilyHandle* column_family = nullptr);
+
+  // Force current memtable contents to be flushed.
+  Status TEST_FlushMemTable(bool wait = true);
+
+  // Wait for memtable compaction
+  Status TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family = nullptr);
+
+  // Wait for any compaction
+  Status TEST_WaitForCompact();
+
+  // Return an internal iterator over the current state of the database.
+  // The keys of this iterator are internal keys (see format.h).
+  // The returned iterator should be deleted when no longer needed.
+  Iterator* TEST_NewInternalIterator(ColumnFamilyHandle* column_family =
+                                         nullptr);
+
+  // Return the maximum overlapping data (in bytes) at next level for any
+  // file at a level >= 1.
+  int64_t TEST_MaxNextLevelOverlappingBytes(ColumnFamilyHandle* column_family =
+                                                nullptr);
+
+  // Return the current manifest file no.
+  uint64_t TEST_Current_Manifest_FileNo();
+
+  // Trigger's a background call for testing.
+  void TEST_PurgeObsoleteteWAL();
+
+  // get total level0 file size. Only for testing.
+  uint64_t TEST_GetLevel0TotalSize();
+
+  void TEST_SetDefaultTimeToCheck(uint64_t default_interval_to_delete_obsolete_WAL)
+  {
+    default_interval_to_delete_obsolete_WAL_ = default_interval_to_delete_obsolete_WAL;
+  }
+
+  void TEST_GetFilesMetaData(ColumnFamilyHandle* column_family,
+                             std::vector<std::vector<FileMetaData>>* metadata);
+
+  Status TEST_ReadFirstRecord(const WalFileType type, const uint64_t number,
+                              SequenceNumber* sequence);
+
+  Status TEST_ReadFirstLine(const std::string& fname, SequenceNumber* sequence);
+#endif  // NDEBUG
+
+  // needed for CleanupIteratorState
+  struct DeletionState {
+    inline bool HaveSomethingToDelete() const {
+      return  candidate_files.size() ||
+        sst_delete_files.size() ||
+        log_delete_files.size();
+    }
+
+    // a list of all files that we'll consider deleting
+    // (every once in a while this is filled up with all files
+    // in the DB directory)
+    std::vector<std::string> candidate_files;
+
+    // the list of all live sst files that cannot be deleted
+    std::vector<uint64_t> sst_live;
+
+    // a list of sst files that we need to delete
+    std::vector<FileMetaData*> sst_delete_files;
+
+    // a list of log files that we need to delete
+    std::vector<uint64_t> log_delete_files;
+
+    // a list of memtables to be free
+    autovector<MemTable*> memtables_to_free;
+
+    autovector<SuperVersion*> superversions_to_free;
+
+    SuperVersion* new_superversion;  // if nullptr no new superversion
+
+    // the current manifest_file_number, log_number and prev_log_number
+    // that corresponds to the set of files in 'live'.
+    uint64_t manifest_file_number, pending_manifest_file_number, log_number,
+        prev_log_number;
+
+    explicit DeletionState(bool create_superversion = false) {
+      manifest_file_number = 0;
+      pending_manifest_file_number = 0;
+      log_number = 0;
+      prev_log_number = 0;
+      new_superversion = create_superversion ? new SuperVersion() : nullptr;
+    }
+
+    ~DeletionState() {
+      // free pending memtables
+      for (auto m : memtables_to_free) {
+        delete m;
+      }
+      // free superversions
+      for (auto s : superversions_to_free) {
+        delete s;
+      }
+      // if new_superversion was not used, it will be non-nullptr and needs
+      // to be freed here
+      delete new_superversion;
+    }
+  };
+
+  // Returns the list of live files in 'live' and the list
+  // of all files in the filesystem in 'candidate_files'.
+  // If force == false and the last call was less than
+  // options_.delete_obsolete_files_period_micros microseconds ago,
+  // it will not fill up the deletion_state
+  void FindObsoleteFiles(DeletionState& deletion_state,
+                         bool force,
+                         bool no_full_scan = false);
+
+  // Diffs the files listed in filenames and those that do not
+  // belong to live files are posibly removed. Also, removes all the
+  // files in sst_delete_files and log_delete_files.
+  // It is not necessary to hold the mutex when invoking this method.
+  void PurgeObsoleteFiles(DeletionState& deletion_state);
+
+  ColumnFamilyHandle* DefaultColumnFamily() const;
+
+ protected:
+  Env* const env_;
+  const std::string dbname_;
+  unique_ptr<VersionSet> versions_;
+  const DBOptions options_;
+
+  Iterator* NewInternalIterator(const ReadOptions&, ColumnFamilyData* cfd,
+                                SuperVersion* super_version,
+                                Arena* arena = nullptr);
+
+ private:
+  friend class DB;
+  friend class InternalStats;
+#ifndef ROCKSDB_LITE
+  friend class TailingIterator;
+  friend class ForwardIterator;
+#endif
+  friend struct SuperVersion;
+  struct CompactionState;
+  struct Writer;
+
+  Status NewDB();
+
+  // Recover the descriptor from persistent storage.  May do a significant
+  // amount of work to recover recently logged updates.  Any changes to
+  // be made to the descriptor are added to *edit.
+  Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
+                 bool read_only = false, bool error_if_log_file_exist = false);
+
+  void MaybeIgnoreError(Status* s) const;
+
+  const Status CreateArchivalDirectory();
+
+  // Delete any unneeded files and stale in-memory entries.
+  void DeleteObsoleteFiles();
+
+  // Flush the in-memory write buffer to storage.  Switches to a new
+  // log-file/memtable and writes a new descriptor iff successful.
+  Status FlushMemTableToOutputFile(ColumnFamilyData* cfd, bool* madeProgress,
+                                   DeletionState& deletion_state,
+                                   LogBuffer* log_buffer);
+
+  Status RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
+                        bool read_only);
+
+  // The following two methods are used to flush a memtable to
+  // storage. The first one is used atdatabase RecoveryTime (when the
+  // database is opened) and is heavyweight because it holds the mutex
+  // for the entire period. The second method WriteLevel0Table supports
+  // concurrent flush memtables to storage.
+  Status WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem,
+                                     VersionEdit* edit);
+  Status WriteLevel0Table(ColumnFamilyData* cfd, autovector<MemTable*>& mems,
+                          VersionEdit* edit, uint64_t* filenumber,
+                          LogBuffer* log_buffer);
+
+  uint64_t SlowdownAmount(int n, double bottom, double top);
+
+  // TODO(icanadi) free superversion_to_free and old_log outside of mutex
+  Status MakeRoomForWrite(ColumnFamilyData* cfd,
+                          bool force /* flush even if there is room? */,
+                          autovector<SuperVersion*>* superversions_to_free,
+                          autovector<log::Writer*>* logs_to_free);
+
+  void BuildBatchGroup(Writer** last_writer,
+                       autovector<WriteBatch*>* write_batch_group);
+
+  // Force current memtable contents to be flushed.
+  Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options);
+
+  // Wait for memtable flushed
+  Status WaitForFlushMemTable(ColumnFamilyData* cfd);
+
+  void MaybeScheduleLogDBDeployStats();
+
+#ifndef ROCKSDB_LITE
+  static void BGLogDBDeployStats(void* db);
+  void LogDBDeployStats();
+#endif  // ROCKSDB_LITE
+
+  void MaybeScheduleFlushOrCompaction();
+  static void BGWorkCompaction(void* db);
+  static void BGWorkFlush(void* db);
+  void BackgroundCallCompaction();
+  void BackgroundCallFlush();
+  Status BackgroundCompaction(bool* madeProgress, DeletionState& deletion_state,
+                              LogBuffer* log_buffer);
+  Status BackgroundFlush(bool* madeProgress, DeletionState& deletion_state,
+                         LogBuffer* log_buffer);
+  void CleanupCompaction(CompactionState* compact, Status status);
+  Status DoCompactionWork(CompactionState* compact,
+                          DeletionState& deletion_state,
+                          LogBuffer* log_buffer);
+
+  // This function is called as part of compaction. It enables Flush process to
+  // preempt compaction, since it's higher prioirty
+  // Returns: micros spent executing
+  uint64_t CallFlushDuringCompaction(ColumnFamilyData* cfd,
+                                     DeletionState& deletion_state,
+                                     LogBuffer* log_buffer);
+
+  // Call compaction filter if is_compaction_v2 is not true. Then iterate
+  // through input and compact the kv-pairs
+  Status ProcessKeyValueCompaction(
+    SequenceNumber visible_at_tip,
+    SequenceNumber earliest_snapshot,
+    SequenceNumber latest_snapshot,
+    DeletionState& deletion_state,
+    bool bottommost_level,
+    int64_t& imm_micros,
+    Iterator* input,
+    CompactionState* compact,
+    bool is_compaction_v2,
+    LogBuffer* log_buffer);
+
+  // Call compaction_filter_v2->Filter() on kv-pairs in compact
+  void CallCompactionFilterV2(CompactionState* compact,
+    CompactionFilterV2* compaction_filter_v2);
+
+  Status OpenCompactionOutputFile(CompactionState* compact);
+  Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input);
+  Status InstallCompactionResults(CompactionState* compact,
+                                  LogBuffer* log_buffer);
+  void AllocateCompactionOutputFileNumbers(CompactionState* compact);
+  void ReleaseCompactionUnusedFileNumbers(CompactionState* compact);
+
+#ifdef ROCKSDB_LITE
+  void PurgeObsoleteWALFiles() {
+    // this function is used for archiving WAL files. we don't need this in
+    // ROCKSDB_LITE
+  }
+#else
+  void PurgeObsoleteWALFiles();
+
+  Status GetSortedWalsOfType(const std::string& path,
+                             VectorLogPtr& log_files,
+                             WalFileType type);
+
+  // Requires: all_logs should be sorted with earliest log file first
+  // Retains all log files in all_logs which contain updates with seq no.
+  // Greater Than or Equal to the requested SequenceNumber.
+  Status RetainProbableWalFiles(VectorLogPtr& all_logs,
+                                const SequenceNumber target);
+
+  Status ReadFirstRecord(const WalFileType type, const uint64_t number,
+                         SequenceNumber* sequence);
+
+  Status ReadFirstLine(const std::string& fname, SequenceNumber* sequence);
+#endif  // ROCKSDB_LITE
+
+  void PrintStatistics();
+
+  // dump rocksdb.stats to LOG
+  void MaybeDumpStats();
+
+  // Return true if the current db supports snapshot.  If the current
+  // DB does not support snapshot, then calling GetSnapshot() will always
+  // return nullptr.
+  //
+  // @see GetSnapshot()
+  virtual bool IsSnapshotSupported() const;
+
+  // Return the minimum empty level that could hold the total data in the
+  // input level. Return the input level, if such level could not be found.
+  int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, int level);
+
+  // Move the files in the input level to the target level.
+  // If target_level < 0, automatically calculate the minimum level that could
+  // hold the data set.
+  Status ReFitLevel(ColumnFamilyData* cfd, int level, int target_level = -1);
+
+  // table_cache_ provides its own synchronization
+  std::shared_ptr<Cache> table_cache_;
+
+  // Lock over the persistent DB state.  Non-nullptr iff successfully acquired.
+  FileLock* db_lock_;
+
+  // State below is protected by mutex_
+  port::Mutex mutex_;
+  port::AtomicPointer shutting_down_;
+  // This condition variable is signaled on these conditions:
+  // * whenever bg_compaction_scheduled_ goes down to 0
+  // * if bg_manual_only_ > 0, whenever a compaction finishes, even if it hasn't
+  // made any progress
+  // * whenever a compaction made any progress
+  // * whenever bg_flush_scheduled_ value decreases (i.e. whenever a flush is
+  // done, even if it didn't make any progress)
+  // * whenever there is an error in background flush or compaction
+  // * whenever bg_logstats_scheduled_ turns to false
+  port::CondVar bg_cv_;
+  uint64_t logfile_number_;
+  unique_ptr<log::Writer> log_;
+  bool log_empty_;
+  ColumnFamilyHandleImpl* default_cf_handle_;
+  unique_ptr<ColumnFamilyMemTablesImpl> column_family_memtables_;
+  struct LogFileNumberSize {
+    explicit LogFileNumberSize(uint64_t _number)
+        : number(_number), size(0), getting_flushed(false) {}
+    void AddSize(uint64_t new_size) { size += new_size; }
+    uint64_t number;
+    uint64_t size;
+    bool getting_flushed;
+  };
+  std::deque<LogFileNumberSize> alive_log_files_;
+  uint64_t total_log_size_;
+  // only used for dynamically adjusting max_total_wal_size. it is a sum of
+  // [write_buffer_size * max_write_buffer_number] over all column families
+  uint64_t max_total_in_memory_state_;
+
+  std::string host_name_;
+
+  std::unique_ptr<Directory> db_directory_;
+
+  // Queue of writers.
+  std::deque<Writer*> writers_;
+  WriteBatch tmp_batch_;
+
+  SnapshotList snapshots_;
+
+  // cache for ReadFirstRecord() calls
+  std::unordered_map<uint64_t, SequenceNumber> read_first_record_cache_;
+  port::Mutex read_first_record_cache_mutex_;
+
+  // Set of table files to protect from deletion because they are
+  // part of ongoing compactions.
+  std::set<uint64_t> pending_outputs_;
+
+  // At least one compaction or flush job is pending but not yet scheduled
+  // because of the max background thread limit.
+  bool bg_schedule_needed_;
+
+  // count how many background compactions are running or have been scheduled
+  int bg_compaction_scheduled_;
+
+  // If non-zero, MaybeScheduleFlushOrCompaction() will only schedule manual
+  // compactions (if manual_compaction_ is not null). This mechanism enables
+  // manual compactions to wait until all other compactions are finished.
+  int bg_manual_only_;
+
+  // number of background memtable flush jobs, submitted to the HIGH pool
+  int bg_flush_scheduled_;
+
+  // Has a background stats log thread scheduled?
+  bool bg_logstats_scheduled_;
+
+  // Information for a manual compaction
+  struct ManualCompaction {
+    ColumnFamilyData* cfd;
+    int input_level;
+    int output_level;
+    bool done;
+    Status status;
+    bool in_progress;           // compaction request being processed?
+    const InternalKey* begin;   // nullptr means beginning of key range
+    const InternalKey* end;     // nullptr means end of key range
+    InternalKey tmp_storage;    // Used to keep track of compaction progress
+  };
+  ManualCompaction* manual_compaction_;
+
+  // Have we encountered a background error in paranoid mode?
+  Status bg_error_;
+
+  std::unique_ptr<StatsLogger> logger_;
+
+  int64_t volatile last_log_ts;
+
+  // shall we disable deletion of obsolete files
+  // if 0 the deletion is enabled.
+  // if non-zero, files will not be getting deleted
+  // This enables two different threads to call
+  // EnableFileDeletions() and DisableFileDeletions()
+  // without any synchronization
+  int disable_delete_obsolete_files_;
+
+  // last time when DeleteObsoleteFiles was invoked
+  uint64_t delete_obsolete_files_last_run_;
+
+  // last time when PurgeObsoleteWALFiles ran.
+  uint64_t purge_wal_files_last_run_;
+
+  // last time stats were dumped to LOG
+  std::atomic<uint64_t> last_stats_dump_time_microsec_;
+
+  // obsolete files will be deleted every this seconds if ttl deletion is
+  // enabled and archive size_limit is disabled.
+  uint64_t default_interval_to_delete_obsolete_WAL_;
+
+  bool flush_on_destroy_; // Used when disableWAL is true.
+
+  static const int KEEP_LOG_FILE_NUM = 1000;
+  std::string db_absolute_path_;
+
+  // count of the number of contiguous delaying writes
+  int delayed_writes_;
+
+  // The options to access storage files
+  const EnvOptions storage_options_;
+
+  // A value of true temporarily disables scheduling of background work
+  bool bg_work_gate_closed_;
+
+  // Guard against multiple concurrent refitting
+  bool refitting_level_;
+
+  // Indicate DB was opened successfully
+  bool opened_successfully_;
+
+  // No copying allowed
+  DBImpl(const DBImpl&);
+  void operator=(const DBImpl&);
+
+  // dump the delayed_writes_ to the log file and reset counter.
+  void DelayLoggingAndReset();
+
+  // Return the earliest snapshot where seqno is visible.
+  // Store the snapshot right before that, if any, in prev_snapshot
+  inline SequenceNumber findEarliestVisibleSnapshot(
+    SequenceNumber in,
+    std::vector<SequenceNumber>& snapshots,
+    SequenceNumber* prev_snapshot);
+
+  // Background threads call this function, which is just a wrapper around
+  // the cfd->InstallSuperVersion() function. Background threads carry
+  // deletion_state which can have new_superversion already allocated.
+  void InstallSuperVersion(ColumnFamilyData* cfd,
+                           DeletionState& deletion_state);
+
+#ifndef ROCKSDB_LITE
+  using DB::GetPropertiesOfAllTables;
+  virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
+                                          TablePropertiesCollection* props)
+      override;
+#endif  // ROCKSDB_LITE
+
+  // Function that Get and KeyMayExist call with no_io true or false
+  // Note: 'value_found' from KeyMayExist propagates here
+  Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family,
+                 const Slice& key, std::string* value,
+                 bool* value_found = nullptr);
+};
+
+// Sanitize db options.  The caller should delete result.info_log if
+// it is not equal to src.info_log.
+extern Options SanitizeOptions(const std::string& db,
+                               const InternalKeyComparator* icmp,
+                               const InternalFilterPolicy* ipolicy,
+                               const Options& src);
+extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src);
+
+// Determine compression type, based on user options, level of the output
+// file and whether compression is disabled.
+// If enable_compression is false, then compression is always disabled no
+// matter what the values of the other two parameters are.
+// Otherwise, the compression type is determined based on options and level.
+CompressionType GetCompressionType(const Options& options, int level,
+                                   const bool enable_compression);
+
+// Determine compression type for L0 file written by memtable flush.
+CompressionType GetCompressionFlush(const Options& options);
+
+}  // namespace rocksdb
--- a/db/db_impl_debug.cc
+++ b/db/db_impl_debug.cc
@@ -0,0 +1,133 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include "db/db_impl.h"
+
+namespace rocksdb {
+
+void DBImpl::TEST_PurgeObsoleteteWAL() { PurgeObsoleteWALFiles(); }
+
+uint64_t DBImpl::TEST_GetLevel0TotalSize() {
+  MutexLock l(&mutex_);
+  return default_cf_handle_->cfd()->current()->NumLevelBytes(0);
+}
+
+Iterator* DBImpl::TEST_NewInternalIterator(ColumnFamilyHandle* column_family) {
+  ColumnFamilyData* cfd;
+  if (column_family == nullptr) {
+    cfd = default_cf_handle_->cfd();
+  } else {
+    auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+    cfd = cfh->cfd();
+  }
+
+  mutex_.Lock();
+  SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
+  mutex_.Unlock();
+  ReadOptions roptions;
+  return NewInternalIterator(roptions, cfd, super_version);
+}
+
+int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes(
+    ColumnFamilyHandle* column_family) {
+  ColumnFamilyData* cfd;
+  if (column_family == nullptr) {
+    cfd = default_cf_handle_->cfd();
+  } else {
+    auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+    cfd = cfh->cfd();
+  }
+  MutexLock l(&mutex_);
+  return cfd->current()->MaxNextLevelOverlappingBytes();
+}
+
+void DBImpl::TEST_GetFilesMetaData(
+    ColumnFamilyHandle* column_family,
+    std::vector<std::vector<FileMetaData>>* metadata) {
+  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+  auto cfd = cfh->cfd();
+  MutexLock l(&mutex_);
+  metadata->resize(NumberLevels());
+  for (int level = 0; level < NumberLevels(); level++) {
+    const std::vector<FileMetaData*>& files = cfd->current()->files_[level];
+
+    (*metadata)[level].clear();
+    for (const auto& f : files) {
+      (*metadata)[level].push_back(*f);
+    }
+  }
+}
+
+uint64_t DBImpl::TEST_Current_Manifest_FileNo() {
+  return versions_->ManifestFileNumber();
+}
+
+Status DBImpl::TEST_CompactRange(int level, const Slice* begin,
+                                 const Slice* end,
+                                 ColumnFamilyHandle* column_family) {
+  ColumnFamilyData* cfd;
+  if (column_family == nullptr) {
+    cfd = default_cf_handle_->cfd();
+  } else {
+    auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+    cfd = cfh->cfd();
+  }
+  int output_level =
+      (cfd->options()->compaction_style == kCompactionStyleUniversal ||
+       cfd->options()->compaction_style == kCompactionStyleFIFO)
+          ? level
+          : level + 1;
+  return RunManualCompaction(cfd, level, output_level, begin, end);
+}
+
+Status DBImpl::TEST_FlushMemTable(bool wait) {
+  FlushOptions fo;
+  fo.wait = wait;
+  return FlushMemTable(default_cf_handle_->cfd(), fo);
+}
+
+Status DBImpl::TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family) {
+  ColumnFamilyData* cfd;
+  if (column_family == nullptr) {
+    cfd = default_cf_handle_->cfd();
+  } else {
+    auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+    cfd = cfh->cfd();
+  }
+  return WaitForFlushMemTable(cfd);
+}
+
+Status DBImpl::TEST_WaitForCompact() {
+  // Wait until the compaction completes
+
+  // TODO: a bug here. This function actually does not necessarily
+  // wait for compact. It actually waits for scheduled compaction
+  // OR flush to finish.
+
+  MutexLock l(&mutex_);
+  while ((bg_compaction_scheduled_ || bg_flush_scheduled_) && bg_error_.ok()) {
+    bg_cv_.Wait();
+  }
+  return bg_error_;
+}
+
+Status DBImpl::TEST_ReadFirstRecord(const WalFileType type,
+                                    const uint64_t number,
+                                    SequenceNumber* sequence) {
+  return ReadFirstRecord(type, number, sequence);
+}
+
+Status DBImpl::TEST_ReadFirstLine(const std::string& fname,
+                                  SequenceNumber* sequence) {
+  return ReadFirstLine(fname, sequence);
+}
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
--- a/db/db_impl_readonly.cc
+++ b/db/db_impl_readonly.cc
@@ -0,0 +1,154 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 Facebook. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "db/db_impl_readonly.h"
+#include "db/db_impl.h"
+
+#include <algorithm>
+#include <set>
+#include <string>
+#include <stdint.h>
+#include <stdio.h>
+#include <vector>
+#include <algorithm>
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/filename.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/merge_context.h"
+#include "db/table_cache.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "rocksdb/merge_operator.h"
+#include "port/port.h"
+#include "table/block.h"
+#include "table/merger.h"
+#include "table/two_level_iterator.h"
+#include "util/coding.h"
+#include "util/logging.h"
+#include "util/build_version.h"
+
+namespace rocksdb {
+
+DBImplReadOnly::DBImplReadOnly(const DBOptions& options,
+                               const std::string& dbname)
+    : DBImpl(options, dbname) {
+  Log(options_.info_log, "Opening the db in read only mode");
+}
+
+DBImplReadOnly::~DBImplReadOnly() {
+}
+
+// Implementations of the DB interface
+Status DBImplReadOnly::Get(const ReadOptions& options,
+                           ColumnFamilyHandle* column_family, const Slice& key,
+                           std::string* value) {
+  Status s;
+  SequenceNumber snapshot = versions_->LastSequence();
+  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+  auto cfd = cfh->cfd();
+  SuperVersion* super_version = cfd->GetSuperVersion();
+  MergeContext merge_context;
+  LookupKey lkey(key, snapshot);
+  if (super_version->mem->Get(lkey, value, &s, merge_context,
+                              *cfd->options())) {
+  } else {
+    Version::GetStats stats;
+    super_version->current->Get(options, lkey, value, &s, &merge_context,
+                                &stats);
+  }
+  return s;
+}
+
+Iterator* DBImplReadOnly::NewIterator(const ReadOptions& options,
+                                      ColumnFamilyHandle* column_family) {
+  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+  auto cfd = cfh->cfd();
+  SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
+  SequenceNumber latest_snapshot = versions_->LastSequence();
+  Iterator* internal_iter = NewInternalIterator(options, cfd, super_version);
+  return NewDBIterator(
+      env_, *cfd->options(), cfd->user_comparator(), internal_iter,
+      (options.snapshot != nullptr
+           ? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
+           : latest_snapshot));
+}
+
+Status DB::OpenForReadOnly(const Options& options, const std::string& dbname,
+                           DB** dbptr, bool error_if_log_file_exist) {
+  *dbptr = nullptr;
+
+  DBOptions db_options(options);
+  ColumnFamilyOptions cf_options(options);
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.push_back(
+      ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+  std::vector<ColumnFamilyHandle*> handles;
+
+  Status s =
+      DB::OpenForReadOnly(db_options, dbname, column_families, &handles, dbptr);
+  if (s.ok()) {
+    assert(handles.size() == 1);
+    // i can delete the handle since DBImpl is always holding a
+    // reference to default column family
+    delete handles[0];
+  }
+  return s;
+}
+
+Status DB::OpenForReadOnly(
+    const DBOptions& db_options, const std::string& dbname,
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+    bool error_if_log_file_exist) {
+  *dbptr = nullptr;
+  handles->clear();
+
+  DBImplReadOnly* impl = new DBImplReadOnly(db_options, dbname);
+  impl->mutex_.Lock();
+  Status s = impl->Recover(column_families, true /* read only */,
+                           error_if_log_file_exist);
+  if (s.ok()) {
+    // set column family handles
+    for (auto cf : column_families) {
+      auto cfd =
+          impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
+      if (cfd == nullptr) {
+        s = Status::InvalidArgument("Column family not found: ", cf.name);
+        break;
+      }
+      handles->push_back(new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
+    }
+  }
+  if (s.ok()) {
+    for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
+      delete cfd->InstallSuperVersion(new SuperVersion(), &impl->mutex_);
+    }
+  }
+  impl->mutex_.Unlock();
+  if (s.ok()) {
+    *dbptr = impl;
+  } else {
+    for (auto h : *handles) {
+      delete h;
+    }
+    handles->clear();
+    delete impl;
+  }
+  return s;
+}
+
+
+}   // namespace rocksdb
--- a/db/db_impl_readonly.h
+++ b/db/db_impl_readonly.h
@@ -0,0 +1,103 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 Facebook. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#pragma once
+#include "db/db_impl.h"
+
+#include <deque>
+#include <set>
+#include <vector>
+#include <string>
+#include "db/dbformat.h"
+#include "db/log_writer.h"
+#include "db/snapshot.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "port/port.h"
+#include "util/stats_logger.h"
+
+namespace rocksdb {
+
+class DBImplReadOnly : public DBImpl {
+ public:
+  DBImplReadOnly(const DBOptions& options, const std::string& dbname);
+  virtual ~DBImplReadOnly();
+
+  // Implementations of the DB interface
+  using DB::Get;
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     std::string* value);
+
+  // TODO: Implement ReadOnly MultiGet?
+
+  using DBImpl::NewIterator;
+  virtual Iterator* NewIterator(const ReadOptions&,
+                                ColumnFamilyHandle* column_family);
+
+  virtual Status NewIterators(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      std::vector<Iterator*>* iterators) {
+   // TODO
+    return Status::NotSupported("Not supported yet.");
+  }
+
+  using DBImpl::Put;
+  virtual Status Put(const WriteOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& value) {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+  using DBImpl::Merge;
+  virtual Status Merge(const WriteOptions& options,
+                       ColumnFamilyHandle* column_family, const Slice& key,
+                       const Slice& value) {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+  using DBImpl::Delete;
+  virtual Status Delete(const WriteOptions& options,
+                        ColumnFamilyHandle* column_family, const Slice& key) {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+  virtual Status Write(const WriteOptions& options, WriteBatch* updates) {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+  using DBImpl::CompactRange;
+  virtual Status CompactRange(ColumnFamilyHandle* column_family,
+                              const Slice* begin, const Slice* end,
+                              bool reduce_level = false,
+                              int target_level = -1) {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+  virtual Status DisableFileDeletions() {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+  virtual Status EnableFileDeletions(bool force) {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+  virtual Status GetLiveFiles(std::vector<std::string>&,
+                              uint64_t* manifest_file_size,
+                              bool flush_memtable = true) {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+  using DBImpl::Flush;
+  virtual Status Flush(const FlushOptions& options,
+                       ColumnFamilyHandle* column_family) {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+
+ private:
+  friend class DB;
+
+  // No copying allowed
+  DBImplReadOnly(const DBImplReadOnly&);
+  void operator=(const DBImplReadOnly&);
+};
+}
--- a/db/db_iter.cc
+++ b/db/db_iter.cc
@@ -0,0 +1,517 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_iter.h"
+#include <stdexcept>
+#include <deque>
+
+#include "db/filename.h"
+#include "db/dbformat.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/merge_operator.h"
+#include "port/port.h"
+#include "util/arena.h"
+#include "util/logging.h"
+#include "util/mutexlock.h"
+#include "util/perf_context_imp.h"
+
+namespace rocksdb {
+
+#if 0
+static void DumpInternalIter(Iterator* iter) {
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ParsedInternalKey k;
+    if (!ParseInternalKey(iter->key(), &k)) {
+      fprintf(stderr, "Corrupt '%s'\n", EscapeString(iter->key()).c_str());
+    } else {
+      fprintf(stderr, "@ '%s'\n", k.DebugString().c_str());
+    }
+  }
+}
+#endif
+
+// Memtables and sstables that make the DB representation contain
+// (userkey,seq,type) => uservalue entries.  DBIter
+// combines multiple entries for the same userkey found in the DB
+// representation into a single entry while accounting for sequence
+// numbers, deletion markers, overwrites, etc.
+class DBIter: public Iterator {
+ public:
+  // The following is grossly complicated. TODO: clean it up
+  // Which direction is the iterator currently moving?
+  // (1) When moving forward, the internal iterator is positioned at
+  //     the exact entry that yields this->key(), this->value()
+  // (2) When moving backwards, the internal iterator is positioned
+  //     just before all entries whose user key == this->key().
+  enum Direction {
+    kForward,
+    kReverse
+  };
+
+  DBIter(Env* env, const Options& options, const Comparator* cmp,
+         Iterator* iter, SequenceNumber s, bool arena_mode)
+      : arena_mode_(arena_mode),
+        env_(env),
+        logger_(options.info_log.get()),
+        user_comparator_(cmp),
+        user_merge_operator_(options.merge_operator.get()),
+        iter_(iter),
+        sequence_(s),
+        direction_(kForward),
+        valid_(false),
+        current_entry_is_merged_(false),
+        statistics_(options.statistics.get()) {
+    RecordTick(statistics_, NO_ITERATORS, 1);
+    max_skip_ = options.max_sequential_skip_in_iterations;
+  }
+  virtual ~DBIter() {
+    RecordTick(statistics_, NO_ITERATORS, -1);
+    if (!arena_mode_) {
+      delete iter_;
+    } else {
+      iter_->~Iterator();
+    }
+  }
+  virtual void SetIter(Iterator* iter) {
+    assert(iter_ == nullptr);
+    iter_ = iter;
+  }
+  virtual bool Valid() const { return valid_; }
+  virtual Slice key() const {
+    assert(valid_);
+    return saved_key_.GetKey();
+  }
+  virtual Slice value() const {
+    assert(valid_);
+    return (direction_ == kForward && !current_entry_is_merged_) ?
+      iter_->value() : saved_value_;
+  }
+  virtual Status status() const {
+    if (status_.ok()) {
+      return iter_->status();
+    } else {
+      return status_;
+    }
+  }
+
+  virtual void Next();
+  virtual void Prev();
+  virtual void Seek(const Slice& target);
+  virtual void SeekToFirst();
+  virtual void SeekToLast();
+
+ private:
+  inline void FindNextUserEntry(bool skipping);
+  void FindNextUserEntryInternal(bool skipping);
+  void FindPrevUserEntry();
+  bool ParseKey(ParsedInternalKey* key);
+  void MergeValuesNewToOld();
+
+  inline void ClearSavedValue() {
+    if (saved_value_.capacity() > 1048576) {
+      std::string empty;
+      swap(empty, saved_value_);
+    } else {
+      saved_value_.clear();
+    }
+  }
+
+  bool arena_mode_;
+  Env* const env_;
+  Logger* logger_;
+  const Comparator* const user_comparator_;
+  const MergeOperator* const user_merge_operator_;
+  Iterator* iter_;
+  SequenceNumber const sequence_;
+
+  Status status_;
+  IterKey saved_key_;   // == current key when direction_==kReverse
+  std::string saved_value_;   // == current raw value when direction_==kReverse
+  Direction direction_;
+  bool valid_;
+  bool current_entry_is_merged_;
+  Statistics* statistics_;
+  uint64_t max_skip_;
+
+  // No copying allowed
+  DBIter(const DBIter&);
+  void operator=(const DBIter&);
+};
+
+inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
+  if (!ParseInternalKey(iter_->key(), ikey)) {
+    status_ = Status::Corruption("corrupted internal key in DBIter");
+    Log(logger_, "corrupted internal key in DBIter: %s",
+        iter_->key().ToString(true).c_str());
+    return false;
+  } else {
+    return true;
+  }
+}
+
+void DBIter::Next() {
+  assert(valid_);
+
+  if (direction_ == kReverse) {  // Switch directions?
+    direction_ = kForward;
+    // iter_ is pointing just before the entries for this->key(),
+    // so advance into the range of entries for this->key() and then
+    // use the normal skipping code below.
+    if (!iter_->Valid()) {
+      iter_->SeekToFirst();
+    } else {
+      iter_->Next();
+    }
+    if (!iter_->Valid()) {
+      valid_ = false;
+      saved_key_.Clear();
+      return;
+    }
+  }
+
+  // If the current value is merged, we might already hit end of iter_
+  if (!iter_->Valid()) {
+    valid_ = false;
+    return;
+  }
+  FindNextUserEntry(true /* skipping the current user key */);
+}
+
+
+// PRE: saved_key_ has the current user key if skipping
+// POST: saved_key_ should have the next user key if valid_,
+//       if the current entry is a result of merge
+//           current_entry_is_merged_ => true
+//           saved_value_             => the merged value
+//
+// NOTE: In between, saved_key_ can point to a user key that has
+//       a delete marker
+inline void DBIter::FindNextUserEntry(bool skipping) {
+  PERF_TIMER_AUTO(find_next_user_entry_time);
+  FindNextUserEntryInternal(skipping);
+  PERF_TIMER_STOP(find_next_user_entry_time);
+}
+
+// Actual implementation of DBIter::FindNextUserEntry()
+void DBIter::FindNextUserEntryInternal(bool skipping) {
+  // Loop until we hit an acceptable entry to yield
+  assert(iter_->Valid());
+  assert(direction_ == kForward);
+  current_entry_is_merged_ = false;
+  uint64_t num_skipped = 0;
+  do {
+    ParsedInternalKey ikey;
+    if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
+      if (skipping &&
+          user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) <= 0) {
+        num_skipped++; // skip this entry
+        PERF_COUNTER_ADD(internal_key_skipped_count, 1);
+      } else {
+        skipping = false;
+        switch (ikey.type) {
+          case kTypeDeletion:
+            // Arrange to skip all upcoming entries for this key since
+            // they are hidden by this deletion.
+            saved_key_.SetKey(ikey.user_key);
+            skipping = true;
+            num_skipped = 0;
+            PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
+            break;
+          case kTypeValue:
+            valid_ = true;
+            saved_key_.SetKey(ikey.user_key);
+            return;
+          case kTypeMerge:
+            // By now, we are sure the current ikey is going to yield a value
+            saved_key_.SetKey(ikey.user_key);
+            current_entry_is_merged_ = true;
+            valid_ = true;
+            MergeValuesNewToOld();  // Go to a different state machine
+            return;
+          default:
+            assert(false);
+            break;
+        }
+      }
+    }
+    // If we have sequentially iterated via numerous keys and still not
+    // found the next user-key, then it is better to seek so that we can
+    // avoid too many key comparisons. We seek to the last occurence of
+    // our current key by looking for sequence number 0.
+    if (skipping && num_skipped > max_skip_) {
+      num_skipped = 0;
+      std::string last_key;
+      AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetKey(), 0,
+                                                     kValueTypeForSeek));
+      iter_->Seek(last_key);
+      RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+    } else {
+      iter_->Next();
+    }
+  } while (iter_->Valid());
+  valid_ = false;
+}
+
+// Merge values of the same user key starting from the current iter_ position
+// Scan from the newer entries to older entries.
+// PRE: iter_->key() points to the first merge type entry
+//      saved_key_ stores the user key
+// POST: saved_value_ has the merged value for the user key
+//       iter_ points to the next entry (or invalid)
+void DBIter::MergeValuesNewToOld() {
+  if (!user_merge_operator_) {
+    Log(logger_, "Options::merge_operator is null.");
+    throw std::logic_error("DBIter::MergeValuesNewToOld() with"
+                           " Options::merge_operator null");
+  }
+
+  // Start the merge process by pushing the first operand
+  std::deque<std::string> operands;
+  operands.push_front(iter_->value().ToString());
+
+  std::string merge_result;   // Temporary string to hold merge result later
+  ParsedInternalKey ikey;
+  for (iter_->Next(); iter_->Valid(); iter_->Next()) {
+    if (!ParseKey(&ikey)) {
+      // skip corrupted key
+      continue;
+    }
+
+    if (user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) != 0) {
+      // hit the next user key, stop right here
+      break;
+    }
+
+    if (kTypeDeletion == ikey.type) {
+      // hit a delete with the same user key, stop right here
+      // iter_ is positioned after delete
+      iter_->Next();
+      break;
+    }
+
+    if (kTypeValue == ikey.type) {
+      // hit a put, merge the put value with operands and store the
+      // final result in saved_value_. We are done!
+      // ignore corruption if there is any.
+      const Slice value = iter_->value();
+      user_merge_operator_->FullMerge(ikey.user_key, &value, operands,
+                                      &saved_value_, logger_);
+      // iter_ is positioned after put
+      iter_->Next();
+      return;
+    }
+
+    if (kTypeMerge == ikey.type) {
+      // hit a merge, add the value as an operand and run associative merge.
+      // when complete, add result to operands and continue.
+      const Slice& value = iter_->value();
+      operands.push_front(value.ToString());
+    }
+  }
+
+  // we either exhausted all internal keys under this user key, or hit
+  // a deletion marker.
+  // feed null as the existing value to the merge operator, such that
+  // client can differentiate this scenario and do things accordingly.
+  user_merge_operator_->FullMerge(saved_key_.GetKey(), nullptr, operands,
+                                  &saved_value_, logger_);
+}
+
+void DBIter::Prev() {
+  assert(valid_);
+
+  // Throw an exception now if merge_operator is provided
+  // TODO: support backward iteration
+  if (user_merge_operator_) {
+    Log(logger_, "Prev not supported yet if merge_operator is provided");
+    throw std::logic_error("DBIter::Prev backward iteration not supported"
+                           " if merge_operator is provided");
+  }
+
+  if (direction_ == kForward) {  // Switch directions?
+    // iter_ is pointing at the current entry.  Scan backwards until
+    // the key changes so we can use the normal reverse scanning code.
+    assert(iter_->Valid());  // Otherwise valid_ would have been false
+    saved_key_.SetKey(ExtractUserKey(iter_->key()));
+    while (true) {
+      iter_->Prev();
+      if (!iter_->Valid()) {
+        valid_ = false;
+        saved_key_.Clear();
+        ClearSavedValue();
+        return;
+      }
+      if (user_comparator_->Compare(ExtractUserKey(iter_->key()),
+                                    saved_key_.GetKey()) < 0) {
+        break;
+      }
+    }
+    direction_ = kReverse;
+  }
+
+  FindPrevUserEntry();
+}
+
+void DBIter::FindPrevUserEntry() {
+  assert(direction_ == kReverse);
+  uint64_t num_skipped = 0;
+
+  ValueType value_type = kTypeDeletion;
+  bool saved_key_valid = true;
+  if (iter_->Valid()) {
+    do {
+      ParsedInternalKey ikey;
+      if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
+        if ((value_type != kTypeDeletion) &&
+            user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) < 0) {
+          // We encountered a non-deleted value in entries for previous keys,
+          break;
+        }
+        value_type = ikey.type;
+        if (value_type == kTypeDeletion) {
+          saved_key_.Clear();
+          ClearSavedValue();
+          saved_key_valid = false;
+        } else {
+          Slice raw_value = iter_->value();
+          if (saved_value_.capacity() > raw_value.size() + 1048576) {
+            std::string empty;
+            swap(empty, saved_value_);
+          }
+          saved_key_.SetKey(ExtractUserKey(iter_->key()));
+          saved_value_.assign(raw_value.data(), raw_value.size());
+        }
+      } else {
+        // In the case of ikey.sequence > sequence_, we might have already
+        // iterated to a different user key.
+        saved_key_valid = false;
+      }
+      num_skipped++;
+      // If we have sequentially iterated via numerous keys and still not
+      // found the prev user-key, then it is better to seek so that we can
+      // avoid too many key comparisons. We seek to the first occurence of
+      // our current key by looking for max sequence number.
+      if (saved_key_valid && num_skipped > max_skip_) {
+        num_skipped = 0;
+        std::string last_key;
+        AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetKey(),
+                                                       kMaxSequenceNumber,
+                                                       kValueTypeForSeek));
+        iter_->Seek(last_key);
+        RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+      } else {
+        iter_->Prev();
+      }
+    } while (iter_->Valid());
+  }
+
+  if (value_type == kTypeDeletion) {
+    // End
+    valid_ = false;
+    saved_key_.Clear();
+    ClearSavedValue();
+    direction_ = kForward;
+  } else {
+    valid_ = true;
+  }
+}
+
+void DBIter::Seek(const Slice& target) {
+  saved_key_.Clear();
+  // now savved_key is used to store internal key.
+  saved_key_.SetInternalKey(target, sequence_);
+  PERF_TIMER_AUTO(seek_internal_seek_time);
+  iter_->Seek(saved_key_.GetKey());
+  PERF_TIMER_STOP(seek_internal_seek_time);
+  if (iter_->Valid()) {
+    direction_ = kForward;
+    ClearSavedValue();
+    FindNextUserEntry(false /*not skipping */);
+  } else {
+    valid_ = false;
+  }
+}
+
+void DBIter::SeekToFirst() {
+  direction_ = kForward;
+  ClearSavedValue();
+  PERF_TIMER_AUTO(seek_internal_seek_time);
+  iter_->SeekToFirst();
+  PERF_TIMER_STOP(seek_internal_seek_time);
+  if (iter_->Valid()) {
+    FindNextUserEntry(false /* not skipping */);
+  } else {
+    valid_ = false;
+  }
+}
+
+void DBIter::SeekToLast() {
+  // Throw an exception for now if merge_operator is provided
+  // TODO: support backward iteration
+  if (user_merge_operator_) {
+    Log(logger_, "SeekToLast not supported yet if merge_operator is provided");
+    throw std::logic_error("DBIter::SeekToLast: backward iteration not"
+                           " supported if merge_operator is provided");
+  }
+
+  direction_ = kReverse;
+  ClearSavedValue();
+  PERF_TIMER_AUTO(seek_internal_seek_time);
+  iter_->SeekToLast();
+  PERF_TIMER_STOP(seek_internal_seek_time);
+  FindPrevUserEntry();
+}
+
+Iterator* NewDBIterator(Env* env, const Options& options,
+                        const Comparator* user_key_comparator,
+                        Iterator* internal_iter,
+                        const SequenceNumber& sequence) {
+  return new DBIter(env, options, user_key_comparator, internal_iter, sequence,
+                    false);
+}
+
+ArenaWrappedDBIter::~ArenaWrappedDBIter() { db_iter_->~DBIter(); }
+
+void ArenaWrappedDBIter::SetDBIter(DBIter* iter) { db_iter_ = iter; }
+
+void ArenaWrappedDBIter::SetIterUnderDBIter(Iterator* iter) {
+  static_cast<DBIter*>(db_iter_)->SetIter(iter);
+}
+
+inline bool ArenaWrappedDBIter::Valid() const { return db_iter_->Valid(); }
+inline void ArenaWrappedDBIter::SeekToFirst() { db_iter_->SeekToFirst(); }
+inline void ArenaWrappedDBIter::SeekToLast() { db_iter_->SeekToLast(); }
+inline void ArenaWrappedDBIter::Seek(const Slice& target) {
+  db_iter_->Seek(target);
+}
+inline void ArenaWrappedDBIter::Next() { db_iter_->Next(); }
+inline void ArenaWrappedDBIter::Prev() { db_iter_->Prev(); }
+inline Slice ArenaWrappedDBIter::key() const { return db_iter_->key(); }
+inline Slice ArenaWrappedDBIter::value() const { return db_iter_->value(); }
+inline Status ArenaWrappedDBIter::status() const { return db_iter_->status(); }
+void ArenaWrappedDBIter::RegisterCleanup(CleanupFunction function, void* arg1,
+                                         void* arg2) {
+  db_iter_->RegisterCleanup(function, arg1, arg2);
+}
+
+ArenaWrappedDBIter* NewArenaWrappedDbIterator(
+    Env* env, const Options& options, const Comparator* user_key_comparator,
+    const SequenceNumber& sequence) {
+  ArenaWrappedDBIter* iter = new ArenaWrappedDBIter();
+  Arena* arena = iter->GetArena();
+  auto mem = arena->AllocateAligned(sizeof(DBIter));
+  DBIter* db_iter = new (mem)
+      DBIter(env, options, user_key_comparator, nullptr, sequence, true);
+  iter->SetDBIter(db_iter);
+  return iter;
+}
+
+}  // namespace rocksdb
--- a/db/db_iter.h
+++ b/db/db_iter.h
@@ -0,0 +1,73 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdint.h>
+#include "rocksdb/db.h"
+#include "db/dbformat.h"
+#include "util/arena.h"
+#include "util/autovector.h"
+
+namespace rocksdb {
+
+class Arena;
+class DBIter;
+
+// Return a new iterator that converts internal keys (yielded by
+// "*internal_iter") that were live at the specified "sequence" number
+// into appropriate user keys.
+extern Iterator* NewDBIterator(
+    Env* env,
+    const Options& options,
+    const Comparator *user_key_comparator,
+    Iterator* internal_iter,
+    const SequenceNumber& sequence);
+
+// A wrapper iterator which wraps DB Iterator and the arena, with which the DB
+// iterator is supposed be allocated. This class is used as an entry point of
+// a iterator hierarchy whose memory can be allocated inline. In that way,
+// accessing the iterator tree can be more cache friendly. It is also faster
+// to allocate.
+class ArenaWrappedDBIter : public Iterator {
+ public:
+  virtual ~ArenaWrappedDBIter();
+
+  // Get the arena to be used to allocate memory for DBIter to be wrapped,
+  // as well as child iterators in it.
+  virtual Arena* GetArena() { return &arena_; }
+
+  // Set the DB Iterator to be wrapped
+
+  virtual void SetDBIter(DBIter* iter);
+
+  // Set the internal iterator wrapped inside the DB Iterator. Usually it is
+  // a merging iterator.
+  virtual void SetIterUnderDBIter(Iterator* iter);
+  virtual bool Valid() const override;
+  virtual void SeekToFirst() override;
+  virtual void SeekToLast() override;
+  virtual void Seek(const Slice& target) override;
+  virtual void Next() override;
+  virtual void Prev() override;
+  virtual Slice key() const override;
+  virtual Slice value() const override;
+  virtual Status status() const override;
+  void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2);
+
+ private:
+  DBIter* db_iter_;
+  Arena arena_;
+};
+
+// Generate the arena wrapped iterator class.
+extern ArenaWrappedDBIter* NewArenaWrappedDbIterator(
+    Env* env, const Options& options, const Comparator* user_key_comparator,
+    const SequenceNumber& sequence);
+
+}  // namespace rocksdb
--- a/db/db_stats_logger.cc
+++ b/db/db_stats_logger.cc
@@ -0,0 +1,95 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_impl.h"
+#include <string>
+#include <stdint.h>
+#include <stdio.h>
+#include "db/version_set.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "port/port.h"
+#include "util/mutexlock.h"
+
+namespace rocksdb {
+
+void DBImpl::MaybeScheduleLogDBDeployStats() {
+// we did say maybe
+#ifndef ROCKSDB_LITE
+  // There is a lock in the actual logger.
+  if (!logger_ || options_.db_stats_log_interval < 0
+      || host_name_.empty()) {
+    return;
+  }
+
+  if(bg_logstats_scheduled_ || shutting_down_.Acquire_Load()) {
+    // Already scheduled
+  } else {
+    int64_t current_ts = 0;
+    Status st = env_->GetCurrentTime(&current_ts);
+    if (!st.ok()) {
+      return;
+    }
+    if ((current_ts - last_log_ts) < options_.db_stats_log_interval) {
+      return;
+    }
+    last_log_ts = current_ts;
+    bg_logstats_scheduled_ = true;
+    env_->Schedule(&DBImpl::BGLogDBDeployStats, this);
+  }
+}
+
+void DBImpl::BGLogDBDeployStats(void* db) {
+  DBImpl* db_inst = reinterpret_cast<DBImpl*>(db);
+  db_inst->LogDBDeployStats();
+}
+
+void DBImpl::LogDBDeployStats() {
+  mutex_.Lock();
+
+  if (shutting_down_.Acquire_Load()) {
+    bg_logstats_scheduled_ = false;
+    bg_cv_.SignalAll();
+    mutex_.Unlock();
+    return;
+  }
+
+  char tmp_ver[100];
+  sprintf(tmp_ver, "%d.%d", kMajorVersion, kMinorVersion);
+  std::string version_info(tmp_ver);
+
+  uint64_t file_total_size = 0;
+  uint32_t file_total_num = 0;
+  Version* current = default_cf_handle_->cfd()->current();
+  for (int i = 0; i < current->NumberLevels(); i++) {
+    file_total_num += current->NumLevelFiles(i);
+    file_total_size += current->NumLevelBytes(i);
+  }
+
+  Version::LevelSummaryStorage scratch;
+  const char* file_num_summary = current->LevelSummary(&scratch);
+  std::string file_num_per_level(file_num_summary);
+  std::string data_size_per_level(file_num_summary);
+
+  mutex_.Unlock();
+
+  int64_t unix_ts;
+  env_->GetCurrentTime(&unix_ts);
+
+  logger_->Log_Deploy_Stats(version_info, host_name_,
+      db_absolute_path_, file_total_size, file_total_num, file_num_per_level,
+      data_size_per_level, unix_ts);
+
+  mutex_.Lock();
+  bg_logstats_scheduled_ = false;
+  bg_cv_.SignalAll();
+  mutex_.Unlock();
+#endif
+}
+}
--- a/db/db_test.cc
+++ b/db/db_test.cc
--- a/db/dbformat.cc
+++ b/db/dbformat.cc
@@ -0,0 +1,169 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/dbformat.h"
+
+#include <stdio.h>
+#include "port/port.h"
+#include "util/coding.h"
+#include "util/perf_context_imp.h"
+
+namespace rocksdb {
+
+uint64_t PackSequenceAndType(uint64_t seq, ValueType t) {
+  assert(seq <= kMaxSequenceNumber);
+  assert(t <= kValueTypeForSeek);
+  return (seq << 8) | t;
+}
+
+void AppendInternalKey(std::string* result, const ParsedInternalKey& key) {
+  result->append(key.user_key.data(), key.user_key.size());
+  PutFixed64(result, PackSequenceAndType(key.sequence, key.type));
+}
+
+std::string ParsedInternalKey::DebugString(bool hex) const {
+  char buf[50];
+  snprintf(buf, sizeof(buf), "' @ %llu : %d",
+           (unsigned long long) sequence,
+           int(type));
+  std::string result = "'";
+  result += user_key.ToString(hex);
+  result += buf;
+  return result;
+}
+
+std::string InternalKey::DebugString(bool hex) const {
+  std::string result;
+  ParsedInternalKey parsed;
+  if (ParseInternalKey(rep_, &parsed)) {
+    result = parsed.DebugString(hex);
+  } else {
+    result = "(bad)";
+    result.append(EscapeString(rep_));
+  }
+  return result;
+}
+
+const char* InternalKeyComparator::Name() const {
+  return name_.c_str();
+}
+
+int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const {
+  // Order by:
+  //    increasing user key (according to user-supplied comparator)
+  //    decreasing sequence number
+  //    decreasing type (though sequence# should be enough to disambiguate)
+  int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
+  PERF_COUNTER_ADD(user_key_comparison_count, 1);
+  if (r == 0) {
+    const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8);
+    const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8);
+    if (anum > bnum) {
+      r = -1;
+    } else if (anum < bnum) {
+      r = +1;
+    }
+  }
+  return r;
+}
+
+int InternalKeyComparator::Compare(const ParsedInternalKey& a,
+                                   const ParsedInternalKey& b) const {
+  // Order by:
+  //    increasing user key (according to user-supplied comparator)
+  //    decreasing sequence number
+  //    decreasing type (though sequence# should be enough to disambiguate)
+  int r = user_comparator_->Compare(a.user_key, b.user_key);
+  PERF_COUNTER_ADD(user_key_comparison_count, 1);
+  if (r == 0) {
+    if (a.sequence > b.sequence) {
+      r = -1;
+    } else if (a.sequence < b.sequence) {
+      r = +1;
+    } else if (a.type > b.type) {
+      r = -1;
+    } else if (a.type < b.type) {
+      r = +1;
+    }
+  }
+  return r;
+}
+
+void InternalKeyComparator::FindShortestSeparator(
+      std::string* start,
+      const Slice& limit) const {
+  // Attempt to shorten the user portion of the key
+  Slice user_start = ExtractUserKey(*start);
+  Slice user_limit = ExtractUserKey(limit);
+  std::string tmp(user_start.data(), user_start.size());
+  user_comparator_->FindShortestSeparator(&tmp, user_limit);
+  if (tmp.size() < user_start.size() &&
+      user_comparator_->Compare(user_start, tmp) < 0) {
+    // User key has become shorter physically, but larger logically.
+    // Tack on the earliest possible number to the shortened user key.
+    PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek));
+    assert(this->Compare(*start, tmp) < 0);
+    assert(this->Compare(tmp, limit) < 0);
+    start->swap(tmp);
+  }
+}
+
+void InternalKeyComparator::FindShortSuccessor(std::string* key) const {
+  Slice user_key = ExtractUserKey(*key);
+  std::string tmp(user_key.data(), user_key.size());
+  user_comparator_->FindShortSuccessor(&tmp);
+  if (tmp.size() < user_key.size() &&
+      user_comparator_->Compare(user_key, tmp) < 0) {
+    // User key has become shorter physically, but larger logically.
+    // Tack on the earliest possible number to the shortened user key.
+    PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek));
+    assert(this->Compare(*key, tmp) < 0);
+    key->swap(tmp);
+  }
+}
+
+const char* InternalFilterPolicy::Name() const {
+  return user_policy_->Name();
+}
+
+void InternalFilterPolicy::CreateFilter(const Slice* keys, int n,
+                                        std::string* dst) const {
+  // We rely on the fact that the code in table.cc does not mind us
+  // adjusting keys[].
+  Slice* mkey = const_cast<Slice*>(keys);
+  for (int i = 0; i < n; i++) {
+    mkey[i] = ExtractUserKey(keys[i]);
+    // TODO(sanjay): Suppress dups?
+  }
+  user_policy_->CreateFilter(keys, n, dst);
+}
+
+bool InternalFilterPolicy::KeyMayMatch(const Slice& key, const Slice& f) const {
+  return user_policy_->KeyMayMatch(ExtractUserKey(key), f);
+}
+
+LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) {
+  size_t usize = user_key.size();
+  size_t needed = usize + 13;  // A conservative estimate
+  char* dst;
+  if (needed <= sizeof(space_)) {
+    dst = space_;
+  } else {
+    dst = new char[needed];
+  }
+  start_ = dst;
+  dst = EncodeVarint32(dst, usize + 8);
+  kstart_ = dst;
+  memcpy(dst, user_key.data(), usize);
+  dst += usize;
+  EncodeFixed64(dst, PackSequenceAndType(s, kValueTypeForSeek));
+  dst += 8;
+  end_ = dst;
+}
+
+}  // namespace rocksdb
--- a/db/dbformat.h
+++ b/db/dbformat.h
@@ -0,0 +1,345 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdio.h>
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "rocksdb/types.h"
+#include "util/coding.h"
+#include "util/logging.h"
+
+namespace rocksdb {
+
+class InternalKey;
+
+// Value types encoded as the last component of internal keys.
+// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
+// data structures.
+// The highest bit of the value type needs to be reserved to SST tables
+// for them to do more flexible encoding.
+enum ValueType : unsigned char {
+  kTypeDeletion = 0x0,
+  kTypeValue = 0x1,
+  kTypeMerge = 0x2,
+  // Following types are used only in write ahead logs. They are not used in
+  // memtables or sst files:
+  kTypeLogData = 0x3,
+  kTypeColumnFamilyDeletion = 0x4,
+  kTypeColumnFamilyValue = 0x5,
+  kTypeColumnFamilyMerge = 0x6,
+  kMaxValue = 0x7F
+};
+
+// kValueTypeForSeek defines the ValueType that should be passed when
+// constructing a ParsedInternalKey object for seeking to a particular
+// sequence number (since we sort sequence numbers in decreasing order
+// and the value type is embedded as the low 8 bits in the sequence
+// number in internal keys, we need to use the highest-numbered
+// ValueType, not the lowest).
+static const ValueType kValueTypeForSeek = kTypeMerge;
+
+// We leave eight bits empty at the bottom so a type and sequence#
+// can be packed together into 64-bits.
+static const SequenceNumber kMaxSequenceNumber =
+    ((0x1ull << 56) - 1);
+
+struct ParsedInternalKey {
+  Slice user_key;
+  SequenceNumber sequence;
+  ValueType type;
+
+  ParsedInternalKey() { }  // Intentionally left uninitialized (for speed)
+  ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t)
+      : user_key(u), sequence(seq), type(t) { }
+  std::string DebugString(bool hex = false) const;
+};
+
+// Return the length of the encoding of "key".
+inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
+  return key.user_key.size() + 8;
+}
+
+extern uint64_t PackSequenceAndType(uint64_t seq, ValueType t);
+
+// Append the serialization of "key" to *result.
+extern void AppendInternalKey(std::string* result,
+                              const ParsedInternalKey& key);
+
+// Attempt to parse an internal key from "internal_key".  On success,
+// stores the parsed data in "*result", and returns true.
+//
+// On error, returns false, leaves "*result" in an undefined state.
+extern bool ParseInternalKey(const Slice& internal_key,
+                             ParsedInternalKey* result);
+
+// Returns the user key portion of an internal key.
+inline Slice ExtractUserKey(const Slice& internal_key) {
+  assert(internal_key.size() >= 8);
+  return Slice(internal_key.data(), internal_key.size() - 8);
+}
+
+inline ValueType ExtractValueType(const Slice& internal_key) {
+  assert(internal_key.size() >= 8);
+  const size_t n = internal_key.size();
+  uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
+  unsigned char c = num & 0xff;
+  return static_cast<ValueType>(c);
+}
+
+// A comparator for internal keys that uses a specified comparator for
+// the user key portion and breaks ties by decreasing sequence number.
+class InternalKeyComparator : public Comparator {
+ private:
+  const Comparator* user_comparator_;
+  std::string name_;
+ public:
+  explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c),
+    name_("rocksdb.InternalKeyComparator:" +
+          std::string(user_comparator_->Name())) {
+  }
+  virtual ~InternalKeyComparator() {}
+
+  virtual const char* Name() const;
+  virtual int Compare(const Slice& a, const Slice& b) const;
+  virtual void FindShortestSeparator(
+      std::string* start,
+      const Slice& limit) const;
+  virtual void FindShortSuccessor(std::string* key) const;
+
+  const Comparator* user_comparator() const { return user_comparator_; }
+
+  int Compare(const InternalKey& a, const InternalKey& b) const;
+  int Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const;
+};
+
+// Filter policy wrapper that converts from internal keys to user keys
+class InternalFilterPolicy : public FilterPolicy {
+ private:
+  const FilterPolicy* const user_policy_;
+ public:
+  explicit InternalFilterPolicy(const FilterPolicy* p) : user_policy_(p) { }
+  virtual const char* Name() const;
+  virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const;
+  virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const;
+};
+
+// Modules in this directory should keep internal keys wrapped inside
+// the following class instead of plain strings so that we do not
+// incorrectly use string comparisons instead of an InternalKeyComparator.
+class InternalKey {
+ private:
+  std::string rep_;
+ public:
+  InternalKey() { }   // Leave rep_ as empty to indicate it is invalid
+  InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) {
+    AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t));
+  }
+
+  bool Valid() const {
+    ParsedInternalKey parsed;
+    return ParseInternalKey(Slice(rep_), &parsed);
+  }
+
+  void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); }
+  Slice Encode() const {
+    assert(!rep_.empty());
+    return rep_;
+  }
+
+  Slice user_key() const { return ExtractUserKey(rep_); }
+
+  void SetFrom(const ParsedInternalKey& p) {
+    rep_.clear();
+    AppendInternalKey(&rep_, p);
+  }
+
+  void Clear() { rep_.clear(); }
+
+  std::string DebugString(bool hex = false) const;
+};
+
+inline int InternalKeyComparator::Compare(
+    const InternalKey& a, const InternalKey& b) const {
+  return Compare(a.Encode(), b.Encode());
+}
+
+inline bool ParseInternalKey(const Slice& internal_key,
+                             ParsedInternalKey* result) {
+  const size_t n = internal_key.size();
+  if (n < 8) return false;
+  uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
+  unsigned char c = num & 0xff;
+  result->sequence = num >> 8;
+  result->type = static_cast<ValueType>(c);
+  assert(result->type <= ValueType::kMaxValue);
+  result->user_key = Slice(internal_key.data(), n - 8);
+  return (c <= static_cast<unsigned char>(kValueTypeForSeek));
+}
+
+// Update the sequence number in the internal key
+inline void UpdateInternalKey(char* internal_key,
+                              const size_t internal_key_size,
+                              uint64_t seq, ValueType t) {
+  assert(internal_key_size >= 8);
+  char* seqtype = internal_key + internal_key_size - 8;
+  uint64_t newval = (seq << 8) | t;
+  EncodeFixed64(seqtype, newval);
+}
+
+// Get the sequence number from the internal key
+inline uint64_t GetInternalKeySeqno(const Slice& internal_key) {
+  const size_t n = internal_key.size();
+  assert(n >= 8);
+  uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
+  return num >> 8;
+}
+
+
+// A helper class useful for DBImpl::Get()
+class LookupKey {
+ public:
+  // Initialize *this for looking up user_key at a snapshot with
+  // the specified sequence number.
+  LookupKey(const Slice& user_key, SequenceNumber sequence);
+
+  ~LookupKey();
+
+  // Return a key suitable for lookup in a MemTable.
+  Slice memtable_key() const { return Slice(start_, end_ - start_); }
+
+  // Return an internal key (suitable for passing to an internal iterator)
+  Slice internal_key() const { return Slice(kstart_, end_ - kstart_); }
+
+  // Return the user key
+  Slice user_key() const { return Slice(kstart_, end_ - kstart_ - 8); }
+
+ private:
+  // We construct a char array of the form:
+  //    klength  varint32               <-- start_
+  //    userkey  char[klength]          <-- kstart_
+  //    tag      uint64
+  //                                    <-- end_
+  // The array is a suitable MemTable key.
+  // The suffix starting with "userkey" can be used as an InternalKey.
+  const char* start_;
+  const char* kstart_;
+  const char* end_;
+  char space_[200];      // Avoid allocation for short keys
+
+  // No copying allowed
+  LookupKey(const LookupKey&);
+  void operator=(const LookupKey&);
+};
+
+inline LookupKey::~LookupKey() {
+  if (start_ != space_) delete[] start_;
+}
+
+class IterKey {
+ public:
+  IterKey() : key_(space_), buf_size_(sizeof(space_)), key_size_(0) {}
+
+  ~IterKey() { ResetBuffer(); }
+
+  Slice GetKey() const { return Slice(key_, key_size_); }
+
+  void Clear() { key_size_ = 0; }
+
+  void SetKey(const Slice& key) {
+    size_t size = key.size();
+    EnlargeBufferIfNeeded(size);
+    memcpy(key_, key.data(), size);
+    key_size_ = size;
+  }
+
+  void SetInternalKey(const Slice& user_key, SequenceNumber s,
+                      ValueType value_type = kValueTypeForSeek) {
+    size_t usize = user_key.size();
+    EnlargeBufferIfNeeded(usize + sizeof(uint64_t));
+    memcpy(key_, user_key.data(), usize);
+    EncodeFixed64(key_ + usize, PackSequenceAndType(s, value_type));
+    key_size_ = usize + sizeof(uint64_t);
+  }
+
+  void SetInternalKey(const ParsedInternalKey& parsed_key) {
+    SetInternalKey(parsed_key.user_key, parsed_key.sequence, parsed_key.type);
+  }
+
+ private:
+  char* key_;
+  size_t buf_size_;
+  size_t key_size_;
+  char space_[32];  // Avoid allocation for short keys
+
+  void ResetBuffer() {
+    if (key_ != nullptr && key_ != space_) {
+      delete[] key_;
+    }
+    key_ = space_;
+    buf_size_ = sizeof(space_);
+    key_size_ = 0;
+  }
+
+  // Enlarge the buffer size if needed based on key_size.
+  // By default, static allocated buffer is used. Once there is a key
+  // larger than the static allocated buffer, another buffer is dynamically
+  // allocated, until a larger key buffer is requested. In that case, we
+  // reallocate buffer and delete the old one.
+  void EnlargeBufferIfNeeded(size_t key_size) {
+    // If size is smaller than buffer size, continue using current buffer,
+    // or the static allocated one, as default
+    if (key_size > buf_size_) {
+      // Need to enlarge the buffer.
+      ResetBuffer();
+      key_ = new char[key_size];
+      buf_size_ = key_size;
+    }
+  }
+
+  // No copying allowed
+  IterKey(const IterKey&) = delete;
+  void operator=(const IterKey&) = delete;
+};
+
+class InternalKeySliceTransform : public SliceTransform {
+ public:
+  explicit InternalKeySliceTransform(const SliceTransform* transform)
+      : transform_(transform) {}
+
+  virtual const char* Name() const { return transform_->Name(); }
+
+  virtual Slice Transform(const Slice& src) const {
+    auto user_key = ExtractUserKey(src);
+    return transform_->Transform(user_key);
+  }
+
+  virtual bool InDomain(const Slice& src) const {
+    auto user_key = ExtractUserKey(src);
+    return transform_->InDomain(user_key);
+  }
+
+  virtual bool InRange(const Slice& dst) const {
+    auto user_key = ExtractUserKey(dst);
+    return transform_->InRange(user_key);
+  }
+
+  const SliceTransform* user_prefix_extractor() const { return transform_; }
+
+ private:
+  // Like comparator, InternalKeySliceTransform will not take care of the
+  // deletion of transform_
+  const SliceTransform* const transform_;
+};
+
+}  // namespace rocksdb
--- a/db/dbformat_test.cc
+++ b/db/dbformat_test.cc
@@ -0,0 +1,117 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/dbformat.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+static std::string IKey(const std::string& user_key,
+                        uint64_t seq,
+                        ValueType vt) {
+  std::string encoded;
+  AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt));
+  return encoded;
+}
+
+static std::string Shorten(const std::string& s, const std::string& l) {
+  std::string result = s;
+  InternalKeyComparator(BytewiseComparator()).FindShortestSeparator(&result, l);
+  return result;
+}
+
+static std::string ShortSuccessor(const std::string& s) {
+  std::string result = s;
+  InternalKeyComparator(BytewiseComparator()).FindShortSuccessor(&result);
+  return result;
+}
+
+static void TestKey(const std::string& key,
+                    uint64_t seq,
+                    ValueType vt) {
+  std::string encoded = IKey(key, seq, vt);
+
+  Slice in(encoded);
+  ParsedInternalKey decoded("", 0, kTypeValue);
+
+  ASSERT_TRUE(ParseInternalKey(in, &decoded));
+  ASSERT_EQ(key, decoded.user_key.ToString());
+  ASSERT_EQ(seq, decoded.sequence);
+  ASSERT_EQ(vt, decoded.type);
+
+  ASSERT_TRUE(!ParseInternalKey(Slice("bar"), &decoded));
+}
+
+class FormatTest { };
+
+TEST(FormatTest, InternalKey_EncodeDecode) {
+  const char* keys[] = { "", "k", "hello", "longggggggggggggggggggggg" };
+  const uint64_t seq[] = {
+    1, 2, 3,
+    (1ull << 8) - 1, 1ull << 8, (1ull << 8) + 1,
+    (1ull << 16) - 1, 1ull << 16, (1ull << 16) + 1,
+    (1ull << 32) - 1, 1ull << 32, (1ull << 32) + 1
+  };
+  for (unsigned int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) {
+    for (unsigned int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) {
+      TestKey(keys[k], seq[s], kTypeValue);
+      TestKey("hello", 1, kTypeDeletion);
+    }
+  }
+}
+
+TEST(FormatTest, InternalKeyShortSeparator) {
+  // When user keys are same
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("foo", 99, kTypeValue)));
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("foo", 101, kTypeValue)));
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("foo", 100, kTypeValue)));
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("foo", 100, kTypeDeletion)));
+
+  // When user keys are misordered
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("bar", 99, kTypeValue)));
+
+  // When user keys are different, but correctly ordered
+  ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("hello", 200, kTypeValue)));
+
+  // When start user key is prefix of limit user key
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("foobar", 200, kTypeValue)));
+
+  // When limit user key is prefix of start user key
+  ASSERT_EQ(IKey("foobar", 100, kTypeValue),
+            Shorten(IKey("foobar", 100, kTypeValue),
+                    IKey("foo", 200, kTypeValue)));
+}
+
+TEST(FormatTest, InternalKeyShortestSuccessor) {
+  ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
+            ShortSuccessor(IKey("foo", 100, kTypeValue)));
+  ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue),
+            ShortSuccessor(IKey("\xff\xff", 100, kTypeValue)));
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
--- a/db/deletefile_test.cc
+++ b/db/deletefile_test.cc
@@ -0,0 +1,295 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/db.h"
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+#include "rocksdb/env.h"
+#include "rocksdb/transaction_log.h"
+#include <vector>
+#include <stdlib.h>
+#include <map>
+#include <string>
+
+namespace rocksdb {
+
+class DeleteFileTest {
+ public:
+  std::string dbname_;
+  Options options_;
+  DB* db_;
+  Env* env_;
+  int numlevels_;
+
+  DeleteFileTest() {
+    db_ = nullptr;
+    env_ = Env::Default();
+    options_.write_buffer_size = 1024*1024*1000;
+    options_.target_file_size_base = 1024*1024*1000;
+    options_.max_bytes_for_level_base = 1024*1024*1000;
+    options_.WAL_ttl_seconds = 300; // Used to test log files
+    options_.WAL_size_limit_MB = 1024; // Used to test log files
+    dbname_ = test::TmpDir() + "/deletefile_test";
+    options_.wal_dir = dbname_ + "/wal_files";
+
+    // clean up all the files that might have been there before
+    std::vector<std::string> old_files;
+    env_->GetChildren(dbname_, &old_files);
+    for (auto file : old_files) {
+      env_->DeleteFile(dbname_ + "/" + file);
+    }
+    env_->GetChildren(options_.wal_dir, &old_files);
+    for (auto file : old_files) {
+      env_->DeleteFile(options_.wal_dir + "/" + file);
+    }
+
+    DestroyDB(dbname_, options_);
+    numlevels_ = 7;
+    ASSERT_OK(ReopenDB(true));
+  }
+
+  Status ReopenDB(bool create) {
+    delete db_;
+    if (create) {
+      DestroyDB(dbname_, options_);
+    }
+    db_ = nullptr;
+    options_.create_if_missing = create;
+    return DB::Open(options_, dbname_, &db_);
+  }
+
+  void CloseDB() {
+    delete db_;
+  }
+
+  void AddKeys(int numkeys, int startkey = 0) {
+    WriteOptions options;
+    options.sync = false;
+    ReadOptions roptions;
+    for (int i = startkey; i < (numkeys + startkey) ; i++) {
+      std::string temp = std::to_string(i);
+      Slice key(temp);
+      Slice value(temp);
+      ASSERT_OK(db_->Put(options, key, value));
+    }
+  }
+
+  int numKeysInLevels(
+    std::vector<LiveFileMetaData> &metadata,
+    std::vector<int> *keysperlevel = nullptr) {
+
+    if (keysperlevel != nullptr) {
+      keysperlevel->resize(numlevels_);
+    }
+
+    int numKeys = 0;
+    for (size_t i = 0; i < metadata.size(); i++) {
+      int startkey = atoi(metadata[i].smallestkey.c_str());
+      int endkey = atoi(metadata[i].largestkey.c_str());
+      int numkeysinfile = (endkey - startkey + 1);
+      numKeys += numkeysinfile;
+      if (keysperlevel != nullptr) {
+        (*keysperlevel)[(int)metadata[i].level] += numkeysinfile;
+      }
+      fprintf(stderr, "level %d name %s smallest %s largest %s\n",
+              metadata[i].level, metadata[i].name.c_str(),
+              metadata[i].smallestkey.c_str(),
+              metadata[i].largestkey.c_str());
+    }
+    return numKeys;
+  }
+
+  void CreateTwoLevels() {
+    AddKeys(50000, 10000);
+    DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+    ASSERT_OK(dbi->TEST_FlushMemTable());
+    ASSERT_OK(dbi->TEST_WaitForFlushMemTable());
+
+    AddKeys(50000, 10000);
+    ASSERT_OK(dbi->TEST_FlushMemTable());
+    ASSERT_OK(dbi->TEST_WaitForFlushMemTable());
+  }
+
+  void CheckFileTypeCounts(std::string& dir,
+                            int required_log,
+                            int required_sst,
+                            int required_manifest) {
+    std::vector<std::string> filenames;
+    env_->GetChildren(dir, &filenames);
+
+    int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0;
+    for (auto file : filenames) {
+      uint64_t number;
+      FileType type;
+      if (ParseFileName(file, &number, &type)) {
+        log_cnt += (type == kLogFile);
+        sst_cnt += (type == kTableFile);
+        manifest_cnt += (type == kDescriptorFile);
+      }
+    }
+    ASSERT_EQ(required_log, log_cnt);
+    ASSERT_EQ(required_sst, sst_cnt);
+    ASSERT_EQ(required_manifest, manifest_cnt);
+  }
+
+};
+
+TEST(DeleteFileTest, AddKeysAndQueryLevels) {
+  CreateTwoLevels();
+  std::vector<LiveFileMetaData> metadata;
+  std::vector<int> keysinlevel;
+  db_->GetLiveFilesMetaData(&metadata);
+
+  std::string level1file = "";
+  int level1keycount = 0;
+  std::string level2file = "";
+  int level2keycount = 0;
+  int level1index = 0;
+  int level2index = 1;
+
+  ASSERT_EQ((int)metadata.size(), 2);
+  if (metadata[0].level == 2) {
+    level1index = 1;
+    level2index = 0;
+  }
+
+  level1file = metadata[level1index].name;
+  int startkey = atoi(metadata[level1index].smallestkey.c_str());
+  int endkey = atoi(metadata[level1index].largestkey.c_str());
+  level1keycount = (endkey - startkey + 1);
+  level2file = metadata[level2index].name;
+  startkey = atoi(metadata[level2index].smallestkey.c_str());
+  endkey = atoi(metadata[level2index].largestkey.c_str());
+  level2keycount = (endkey - startkey + 1);
+
+  // COntrolled setup. Levels 1 and 2 should both have 50K files.
+  // This is a little fragile as it depends on the current
+  // compaction heuristics.
+  ASSERT_EQ(level1keycount, 50000);
+  ASSERT_EQ(level2keycount, 50000);
+
+  Status status = db_->DeleteFile("0.sst");
+  ASSERT_TRUE(status.IsInvalidArgument());
+
+  // intermediate level files cannot be deleted.
+  status = db_->DeleteFile(level1file);
+  ASSERT_TRUE(status.IsInvalidArgument());
+
+  // Lowest level file deletion should succeed.
+  ASSERT_OK(db_->DeleteFile(level2file));
+
+  CloseDB();
+}
+
+TEST(DeleteFileTest, PurgeObsoleteFilesTest) {
+  CreateTwoLevels();
+  // there should be only one (empty) log file because CreateTwoLevels()
+  // flushes the memtables to disk
+  CheckFileTypeCounts(options_.wal_dir, 1, 0, 0);
+  // 2 ssts, 1 manifest
+  CheckFileTypeCounts(dbname_, 0, 2, 1);
+  std::string first("0"), last("999999");
+  Slice first_slice(first), last_slice(last);
+  db_->CompactRange(&first_slice, &last_slice, true, 2);
+  // 1 sst after compaction
+  CheckFileTypeCounts(dbname_, 0, 1, 1);
+
+  // this time, we keep an iterator alive
+  ReopenDB(true);
+  Iterator *itr = 0;
+  CreateTwoLevels();
+  itr = db_->NewIterator(ReadOptions());
+  db_->CompactRange(&first_slice, &last_slice, true, 2);
+  // 3 sst after compaction with live iterator
+  CheckFileTypeCounts(dbname_, 0, 3, 1);
+  delete itr;
+  // 1 sst after iterator deletion
+  CheckFileTypeCounts(dbname_, 0, 1, 1);
+
+  CloseDB();
+}
+
+TEST(DeleteFileTest, DeleteFileWithIterator) {
+  CreateTwoLevels();
+  ReadOptions options;
+  Iterator* it = db_->NewIterator(options);
+  std::vector<LiveFileMetaData> metadata;
+  db_->GetLiveFilesMetaData(&metadata);
+
+  std::string level2file = "";
+
+  ASSERT_EQ((int)metadata.size(), 2);
+  if (metadata[0].level == 1) {
+    level2file = metadata[1].name;
+  } else {
+    level2file = metadata[0].name;
+  }
+
+  Status status = db_->DeleteFile(level2file);
+  fprintf(stdout, "Deletion status %s: %s\n",
+          level2file.c_str(), status.ToString().c_str());
+  ASSERT_TRUE(status.ok());
+  it->SeekToFirst();
+  int numKeysIterated = 0;
+  while(it->Valid()) {
+    numKeysIterated++;
+    it->Next();
+  }
+  ASSERT_EQ(numKeysIterated, 50000);
+  delete it;
+  CloseDB();
+}
+
+TEST(DeleteFileTest, DeleteLogFiles) {
+  AddKeys(10, 0);
+  VectorLogPtr logfiles;
+  db_->GetSortedWalFiles(logfiles);
+  ASSERT_GT(logfiles.size(), 0UL);
+  // Take the last log file which is expected to be alive and try to delete it
+  // Should not succeed because live logs are not allowed to be deleted
+  std::unique_ptr<LogFile> alive_log = std::move(logfiles.back());
+  ASSERT_EQ(alive_log->Type(), kAliveLogFile);
+  ASSERT_TRUE(env_->FileExists(options_.wal_dir + "/" + alive_log->PathName()));
+  fprintf(stdout, "Deleting alive log file %s\n",
+          alive_log->PathName().c_str());
+  ASSERT_TRUE(!db_->DeleteFile(alive_log->PathName()).ok());
+  ASSERT_TRUE(env_->FileExists(options_.wal_dir + "/" + alive_log->PathName()));
+  logfiles.clear();
+
+  // Call Flush to bring about a new working log file and add more keys
+  // Call Flush again to flush out memtable and move alive log to archived log
+  // and try to delete the archived log file
+  FlushOptions fopts;
+  db_->Flush(fopts);
+  AddKeys(10, 0);
+  db_->Flush(fopts);
+  db_->GetSortedWalFiles(logfiles);
+  ASSERT_GT(logfiles.size(), 0UL);
+  std::unique_ptr<LogFile> archived_log = std::move(logfiles.front());
+  ASSERT_EQ(archived_log->Type(), kArchivedLogFile);
+  ASSERT_TRUE(env_->FileExists(options_.wal_dir + "/" +
+        archived_log->PathName()));
+  fprintf(stdout, "Deleting archived log file %s\n",
+          archived_log->PathName().c_str());
+  ASSERT_OK(db_->DeleteFile(archived_log->PathName()));
+  ASSERT_TRUE(!env_->FileExists(options_.wal_dir + "/" +
+        archived_log->PathName()));
+  CloseDB();
+}
+
+} //namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
+
--- a/db/file_indexer.cc
+++ b/db/file_indexer.cc
@@ -0,0 +1,202 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/file_indexer.h"
+#include <algorithm>
+#include "rocksdb/comparator.h"
+#include "db/version_edit.h"
+
+namespace rocksdb {
+
+FileIndexer::FileIndexer(const uint32_t num_levels,
+                         const Comparator* ucmp)
+  : num_levels_(num_levels),
+    ucmp_(ucmp),
+    next_level_index_(num_levels),
+    level_rb_(num_levels, -1) {
+}
+
+
+uint32_t FileIndexer::NumLevelIndex() {
+  return next_level_index_.size();
+}
+
+uint32_t FileIndexer::LevelIndexSize(uint32_t level) {
+  return next_level_index_[level].size();
+}
+
+void FileIndexer::GetNextLevelIndex(
+    const uint32_t level, const uint32_t file_index, const int cmp_smallest,
+    const int cmp_largest, int32_t* left_bound, int32_t* right_bound) {
+  assert(level > 0);
+
+  // Last level, no hint
+  if (level == num_levels_ - 1) {
+    *left_bound = 0;
+    *right_bound = -1;
+    return;
+  }
+
+  assert(level < num_levels_ - 1);
+  assert(static_cast<int32_t>(file_index) <= level_rb_[level]);
+
+  const auto& index = next_level_index_[level][file_index];
+
+  if (cmp_smallest < 0) {
+    *left_bound = (level > 0 && file_index > 0) ?
+      next_level_index_[level][file_index - 1].largest_lb : 0;
+    *right_bound = index.smallest_rb;
+  } else if (cmp_smallest == 0) {
+    *left_bound = index.smallest_lb;
+    *right_bound = index.smallest_rb;
+  } else if (cmp_smallest > 0 && cmp_largest < 0) {
+    *left_bound = index.smallest_lb;
+    *right_bound = index.largest_rb;
+  } else if (cmp_largest == 0) {
+    *left_bound = index.largest_lb;
+    *right_bound = index.largest_rb;
+  } else if (cmp_largest > 0) {
+    *left_bound = index.largest_lb;
+    *right_bound = level_rb_[level + 1];
+  } else {
+    assert(false);
+  }
+
+  assert(*left_bound >= 0);
+  assert(*left_bound <= *right_bound + 1);
+  assert(*right_bound <= level_rb_[level + 1]);
+}
+
+void FileIndexer::ClearIndex() {
+  for (uint32_t level = 1; level < num_levels_; ++level) {
+    next_level_index_[level].clear();
+  }
+}
+
+void FileIndexer::UpdateIndex(std::vector<FileMetaData*>* const files) {
+  if (files == nullptr) {
+    return;
+  }
+
+  // L1 - Ln-1
+  for (uint32_t level = 1; level < num_levels_ - 1; ++level) {
+    const auto& upper_files = files[level];
+    const int32_t upper_size = upper_files.size();
+    const auto& lower_files = files[level + 1];
+    level_rb_[level] = upper_files.size() - 1;
+    if (upper_size == 0) {
+      continue;
+    }
+    auto& index = next_level_index_[level];
+    index.resize(upper_size);
+
+    CalculateLB(upper_files, lower_files, &index,
+        [this](const FileMetaData* a, const FileMetaData* b) -> int {
+          return ucmp_->Compare(a->smallest.user_key(), b->largest.user_key());
+        },
+        [](IndexUnit* index, int32_t f_idx) {
+          index->smallest_lb = f_idx;
+        });
+    CalculateLB(upper_files, lower_files, &index,
+        [this](const FileMetaData* a, const FileMetaData* b) -> int {
+          return ucmp_->Compare(a->largest.user_key(), b->largest.user_key());
+        },
+        [](IndexUnit* index, int32_t f_idx) {
+          index->largest_lb = f_idx;
+        });
+    CalculateRB(upper_files, lower_files, &index,
+        [this](const FileMetaData* a, const FileMetaData* b) -> int {
+          return ucmp_->Compare(a->smallest.user_key(), b->smallest.user_key());
+        },
+        [](IndexUnit* index, int32_t f_idx) {
+          index->smallest_rb = f_idx;
+        });
+    CalculateRB(upper_files, lower_files, &index,
+        [this](const FileMetaData* a, const FileMetaData* b) -> int {
+          return ucmp_->Compare(a->largest.user_key(), b->smallest.user_key());
+        },
+        [](IndexUnit* index, int32_t f_idx) {
+          index->largest_rb = f_idx;
+        });
+  }
+  level_rb_[num_levels_ - 1] = files[num_levels_ - 1].size() - 1;
+}
+
+void FileIndexer::CalculateLB(const std::vector<FileMetaData*>& upper_files,
+    const std::vector<FileMetaData*>& lower_files,
+    std::vector<IndexUnit>* index,
+    std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
+    std::function<void(IndexUnit*, int32_t)> set_index) {
+  const int32_t upper_size = upper_files.size();
+  const int32_t lower_size = lower_files.size();
+  int32_t upper_idx = 0;
+  int32_t lower_idx = 0;
+  while (upper_idx < upper_size && lower_idx < lower_size) {
+    int cmp = cmp_op(upper_files[upper_idx], lower_files[lower_idx]);
+
+    if (cmp == 0) {
+      set_index(&(*index)[upper_idx], lower_idx);
+      ++upper_idx;
+      ++lower_idx;
+    } else if (cmp > 0) {
+      // Lower level's file (largest) is smaller, a key won't hit in that
+      // file. Move to next lower file
+      ++lower_idx;
+    } else {
+      // Lower level's file becomes larger, update the index, and
+      // move to the next upper file
+      set_index(&(*index)[upper_idx], lower_idx);
+      ++upper_idx;
+    }
+  }
+
+  while (upper_idx < upper_size) {
+    // Lower files are exhausted, that means the remaining upper files are
+    // greater than any lower files. Set the index to be the lower level size.
+    set_index(&(*index)[upper_idx], lower_size);
+    ++upper_idx;
+  }
+}
+
+void FileIndexer::CalculateRB(const std::vector<FileMetaData*>& upper_files,
+    const std::vector<FileMetaData*>& lower_files,
+    std::vector<IndexUnit>* index,
+    std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
+    std::function<void(IndexUnit*, int32_t)> set_index) {
+  const int32_t upper_size = upper_files.size();
+  const int32_t lower_size = lower_files.size();
+  int32_t upper_idx = upper_size - 1;
+  int32_t lower_idx = lower_size - 1;
+  while (upper_idx >= 0 && lower_idx >= 0) {
+    int cmp = cmp_op(upper_files[upper_idx], lower_files[lower_idx]);
+
+    if (cmp == 0) {
+      set_index(&(*index)[upper_idx], lower_idx);
+      --upper_idx;
+      --lower_idx;
+    } else if (cmp < 0) {
+      // Lower level's file (smallest) is larger, a key won't hit in that
+      // file. Move to next lower file.
+      --lower_idx;
+    } else {
+      // Lower level's file becomes smaller, update the index, and move to
+      // the next the upper file
+      set_index(&(*index)[upper_idx], lower_idx);
+      --upper_idx;
+    }
+  }
+  while (upper_idx >= 0) {
+    // Lower files are exhausted, that means the remaining upper files are
+    // smaller than any lower files. Set it to -1.
+    set_index(&(*index)[upper_idx], -1);
+    --upper_idx;
+  }
+}
+
+}  // namespace rocksdb
--- a/db/file_indexer.h
+++ b/db/file_indexer.h
@@ -0,0 +1,129 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <cstdint>
+#include <functional>
+#include <limits>
+#include <vector>
+
+namespace rocksdb {
+
+class Comparator;
+struct FileMetaData;
+
+// The file tree structure in Version is prebuilt and the range of each file
+// is known. On Version::Get(), it uses binary search to find a potential file
+// and then check if a target key can be found in the file by comparing the key
+// to each file's smallest and largest key. The results of these comparisions
+// can be reused beyond checking if a key falls into a file's range.
+// With some pre-calculated knowledge, each key comparision that has been done
+// can serve as a hint to narrow down further searches: if a key compared to
+// be smaller than a file's smallest or largest, that comparison can be used
+// to find out the right bound of next binary search. Similarly, if a key
+// compared to be larger than a file's smallest or largest, it can be utilized
+// to find out the left bound of next binary search.
+// With these hints: it can greatly reduce the range of binary search,
+// especially for bottom levels, given that one file most likely overlaps with
+// only N files from level below (where N is max_bytes_for_level_multiplier).
+// So on level L, we will only look at ~N files instead of N^L files on the
+// naive approach.
+class FileIndexer {
+ public:
+  FileIndexer(const uint32_t num_levels, const Comparator* ucmp);
+
+  uint32_t NumLevelIndex();
+
+  uint32_t LevelIndexSize(uint32_t level);
+
+  // Return a file index range in the next level to search for a key based on
+  // smallest and largest key comparision for the current file specified by
+  // level and file_index. When *left_index < *right_index, both index should
+  // be valid and fit in the vector size.
+  void GetNextLevelIndex(
+    const uint32_t level, const uint32_t file_index, const int cmp_smallest,
+    const int cmp_largest, int32_t* left_bound, int32_t* right_bound);
+
+  void ClearIndex();
+
+  void UpdateIndex(std::vector<FileMetaData*>* const files);
+
+  enum {
+    kLevelMaxIndex = std::numeric_limits<int32_t>::max()
+  };
+
+ private:
+  const uint32_t num_levels_;
+  const Comparator* ucmp_;
+
+  struct IndexUnit {
+    IndexUnit()
+      : smallest_lb(0), largest_lb(0), smallest_rb(-1), largest_rb(-1) {}
+    // During file search, a key is compared against smallest and largest
+    // from a FileMetaData. It can have 3 possible outcomes:
+    // (1) key is smaller than smallest, implying it is also smaller than
+    //     larger. Precalculated index based on "smallest < smallest" can
+    //     be used to provide right bound.
+    // (2) key is in between smallest and largest.
+    //     Precalculated index based on "smallest > greatest" can be used to
+    //     provide left bound.
+    //     Precalculated index based on "largest < smallest" can be used to
+    //     provide right bound.
+    // (3) key is larger than largest, implying it is also larger than smallest.
+    //     Precalculated index based on "largest > largest" can be used to
+    //     provide left bound.
+    //
+    // As a result, we will need to do:
+    // Compare smallest (<=) and largest keys from upper level file with
+    // smallest key from lower level to get a right bound.
+    // Compare smallest (>=) and largest keys from upper level file with
+    // largest key from lower level to get a left bound.
+    //
+    // Example:
+    //    level 1:              [50 - 60]
+    //    level 2:        [1 - 40], [45 - 55], [58 - 80]
+    // A key 35, compared to be less than 50, 3rd file on level 2 can be
+    // skipped according to rule (1). LB = 0, RB = 1.
+    // A key 53, sits in the middle 50 and 60. 1st file on level 2 can be
+    // skipped according to rule (2)-a, but the 3rd file cannot be skipped
+    // because 60 is greater than 58. LB = 1, RB = 2.
+    // A key 70, compared to be larger than 60. 1st and 2nd file can be skipped
+    // according to rule (3). LB = 2, RB = 2.
+    //
+    // Point to a left most file in a lower level that may contain a key,
+    // which compares greater than smallest of a FileMetaData (upper level)
+    int32_t smallest_lb;
+    // Point to a left most file in a lower level that may contain a key,
+    // which compares greater than largest of a FileMetaData (upper level)
+    int32_t largest_lb;
+    // Point to a right most file in a lower level that may contain a key,
+    // which compares smaller than smallest of a FileMetaData (upper level)
+    int32_t smallest_rb;
+    // Point to a right most file in a lower level that may contain a key,
+    // which compares smaller than largest of a FileMetaData (upper level)
+    int32_t largest_rb;
+  };
+
+  void CalculateLB(const std::vector<FileMetaData*>& upper_files,
+    const std::vector<FileMetaData*>& lower_files,
+    std::vector<IndexUnit>* index,
+    std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
+    std::function<void(IndexUnit*, int32_t)> set_index);
+
+  void CalculateRB(const std::vector<FileMetaData*>& upper_files,
+    const std::vector<FileMetaData*>& lower_files,
+    std::vector<IndexUnit>* index,
+    std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
+    std::function<void(IndexUnit*, int32_t)> set_index);
+
+  std::vector<std::vector<IndexUnit>> next_level_index_;
+  std::vector<int32_t> level_rb_;
+};
+
+}  // namespace rocksdb
--- a/db/file_indexer_test.cc
+++ b/db/file_indexer_test.cc
@@ -0,0 +1,330 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <string>
+#include "db/file_indexer.h"
+#include "db/dbformat.h"
+#include "db/version_edit.h"
+#include "rocksdb/comparator.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+class IntComparator : public Comparator {
+ public:
+  int Compare(const Slice& a, const Slice& b) const {
+    assert(a.size() == 8);
+    assert(b.size() == 8);
+    return *reinterpret_cast<const int64_t*>(a.data()) -
+      *reinterpret_cast<const int64_t*>(b.data());
+  }
+
+  const char* Name() const {
+    return "IntComparator";
+  }
+
+  void FindShortestSeparator(std::string* start, const Slice& limit) const {}
+
+  void FindShortSuccessor(std::string* key) const {}
+};
+
+
+struct FileIndexerTest {
+ public:
+  FileIndexerTest() :
+    kNumLevels(4), indexer(kNumLevels, &ucmp),
+    files(new std::vector<FileMetaData*>[kNumLevels]) {
+  }
+
+  ~FileIndexerTest() {
+    Reset();
+    delete[] files;
+  }
+
+  void AddFile(int level, int64_t smallest, int64_t largest) {
+    auto* f = new FileMetaData();
+    f->smallest = IntKey(smallest);
+    f->largest = IntKey(largest);
+    files[level].push_back(f);
+  }
+
+  InternalKey IntKey(int64_t v) {
+    return InternalKey(Slice(reinterpret_cast<char*>(&v), 8), 0, kTypeValue);
+  }
+
+  void Reset() {
+    for (uint32_t i = 0; i < kNumLevels; ++i) {
+      for (auto* f : files[i]) {
+        delete f;
+      }
+      files[i].clear();
+    }
+    indexer.ClearIndex();
+  }
+
+  void GetNextLevelIndex(const uint32_t level, const uint32_t file_index,
+      const int cmp_smallest, const int cmp_largest, int32_t* left_index,
+      int32_t* right_index) {
+    *left_index = 100;
+    *right_index = 100;
+    indexer.GetNextLevelIndex(level, file_index, cmp_smallest, cmp_largest,
+        left_index, right_index);
+  }
+
+  const uint32_t kNumLevels;
+  IntComparator ucmp;
+  FileIndexer indexer;
+
+  std::vector<FileMetaData*>* files;
+};
+
+TEST(FileIndexerTest, next_level_hint) {
+  for (uint32_t i = 0; i < kNumLevels; ++i) {
+    ASSERT_EQ(0U, indexer.LevelIndexSize(i));
+  }
+
+  // Case 1: no overlap, files are on the left of next level files
+  // level 1
+  AddFile(1, 100, 200);
+  AddFile(1, 300, 400);
+  AddFile(1, 500, 600);
+  // level 2
+  AddFile(2, 1500, 1600);
+  AddFile(2, 1601, 1699);
+  AddFile(2, 1700, 1800);
+  // level 3
+  AddFile(3, 2500, 2600);
+  AddFile(3, 2601, 2699);
+  AddFile(3, 2700, 2800);
+  indexer.UpdateIndex(files);
+  int32_t left = 100;
+  int32_t right = 100;
+  for (uint32_t level = 1; level < 3; ++level) {
+    for (uint32_t f = 0; f < 3; ++f) {
+      GetNextLevelIndex(level, f, -1, -1, &left, &right);
+      ASSERT_EQ(0, left);
+      ASSERT_EQ(-1, right);
+      GetNextLevelIndex(level, f, 0, -1, &left, &right);
+      ASSERT_EQ(0, left);
+      ASSERT_EQ(-1, right);
+      GetNextLevelIndex(level, f, 1, -1, &left, &right);
+      ASSERT_EQ(0, left);
+      ASSERT_EQ(-1, right);
+      GetNextLevelIndex(level, f, 1, 0, &left, &right);
+      ASSERT_EQ(0, left);
+      ASSERT_EQ(-1, right);
+      GetNextLevelIndex(level, f, 1, 1, &left, &right);
+      ASSERT_EQ(0, left);
+      ASSERT_EQ(2, right);
+    }
+  }
+
+  // Case 2: no overlap, files are on the right of next level files
+  Reset();
+  for (uint32_t i = 1; i < kNumLevels; ++i) {
+    ASSERT_EQ(0U, indexer.LevelIndexSize(i));
+  }
+  // level 1
+  AddFile(1, 2100, 2200);
+  AddFile(1, 2300, 2400);
+  AddFile(1, 2500, 2600);
+  // level 2
+  AddFile(2, 1500, 1600);
+  AddFile(2, 1501, 1699);
+  AddFile(2, 1700, 1800);
+  // level 3
+  AddFile(3, 500, 600);
+  AddFile(3, 501, 699);
+  AddFile(3, 700, 800);
+  indexer.UpdateIndex(files);
+  for (uint32_t level = 1; level < 3; ++level) {
+    for (uint32_t f = 0; f < 3; ++f) {
+      GetNextLevelIndex(level, f, -1, -1, &left, &right);
+      ASSERT_EQ(f == 0 ? 0 : 3, left);
+      ASSERT_EQ(2, right);
+      GetNextLevelIndex(level, f, 0, -1, &left, &right);
+      ASSERT_EQ(3, left);
+      ASSERT_EQ(2, right);
+      GetNextLevelIndex(level, f, 1, -1, &left, &right);
+      ASSERT_EQ(3, left);
+      ASSERT_EQ(2, right);
+      GetNextLevelIndex(level, f, 1, -1, &left, &right);
+      ASSERT_EQ(3, left);
+      ASSERT_EQ(2, right);
+      GetNextLevelIndex(level, f, 1, 0, &left, &right);
+      ASSERT_EQ(3, left);
+      ASSERT_EQ(2, right);
+      GetNextLevelIndex(level, f, 1, 1, &left, &right);
+      ASSERT_EQ(3, left);
+      ASSERT_EQ(2, right);
+    }
+  }
+
+  // Case 3: empty L2
+  Reset();
+  for (uint32_t i = 1; i < kNumLevels; ++i) {
+    ASSERT_EQ(0U, indexer.LevelIndexSize(i));
+  }
+  // level 1
+  AddFile(1, 2100, 2200);
+  AddFile(1, 2300, 2400);
+  AddFile(1, 2500, 2600);
+  // level 3
+  AddFile(3, 500, 600);
+  AddFile(3, 501, 699);
+  AddFile(3, 700, 800);
+  indexer.UpdateIndex(files);
+  for (uint32_t f = 0; f < 3; ++f) {
+    GetNextLevelIndex(1, f, -1, -1, &left, &right);
+    ASSERT_EQ(0, left);
+    ASSERT_EQ(-1, right);
+    GetNextLevelIndex(1, f, 0, -1, &left, &right);
+    ASSERT_EQ(0, left);
+    ASSERT_EQ(-1, right);
+    GetNextLevelIndex(1, f, 1, -1, &left, &right);
+    ASSERT_EQ(0, left);
+    ASSERT_EQ(-1, right);
+    GetNextLevelIndex(1, f, 1, -1, &left, &right);
+    ASSERT_EQ(0, left);
+    ASSERT_EQ(-1, right);
+    GetNextLevelIndex(1, f, 1, 0, &left, &right);
+    ASSERT_EQ(0, left);
+    ASSERT_EQ(-1, right);
+    GetNextLevelIndex(1, f, 1, 1, &left, &right);
+    ASSERT_EQ(0, left);
+    ASSERT_EQ(-1, right);
+  }
+
+
+  // Case 4: mixed
+  Reset();
+  for (uint32_t i = 1; i < kNumLevels; ++i) {
+    ASSERT_EQ(0U, indexer.LevelIndexSize(i));
+  }
+  // level 1
+  AddFile(1, 100, 200);
+  AddFile(1, 250, 400);
+  AddFile(1, 450, 500);
+  // level 2
+  AddFile(2, 100, 150);  // 0
+  AddFile(2, 200, 250);  // 1
+  AddFile(2, 251, 300);  // 2
+  AddFile(2, 301, 350);  // 3
+  AddFile(2, 500, 600);  // 4
+  // level 3
+  AddFile(3, 0, 50);
+  AddFile(3, 100, 200);
+  AddFile(3, 201, 250);
+  indexer.UpdateIndex(files);
+  // level 1, 0
+  GetNextLevelIndex(1, 0, -1, -1, &left, &right);
+  ASSERT_EQ(0, left);
+  ASSERT_EQ(0, right);
+  GetNextLevelIndex(1, 0, 0, -1, &left, &right);
+  ASSERT_EQ(0, left);
+  ASSERT_EQ(0, right);
+  GetNextLevelIndex(1, 0, 1, -1, &left, &right);
+  ASSERT_EQ(0, left);
+  ASSERT_EQ(1, right);
+  GetNextLevelIndex(1, 0, 1, 0, &left, &right);
+  ASSERT_EQ(1, left);
+  ASSERT_EQ(1, right);
+  GetNextLevelIndex(1, 0, 1, 1, &left, &right);
+  ASSERT_EQ(1, left);
+  ASSERT_EQ(4, right);
+  // level 1, 1
+  GetNextLevelIndex(1, 1, -1, -1, &left, &right);
+  ASSERT_EQ(1, left);
+  ASSERT_EQ(1, right);
+  GetNextLevelIndex(1, 1, 0, -1, &left, &right);
+  ASSERT_EQ(1, left);
+  ASSERT_EQ(1, right);
+  GetNextLevelIndex(1, 1, 1, -1, &left, &right);
+  ASSERT_EQ(1, left);
+  ASSERT_EQ(3, right);
+  GetNextLevelIndex(1, 1, 1, 0, &left, &right);
+  ASSERT_EQ(4, left);
+  ASSERT_EQ(3, right);
+  GetNextLevelIndex(1, 1, 1, 1, &left, &right);
+  ASSERT_EQ(4, left);
+  ASSERT_EQ(4, right);
+  // level 1, 2
+  GetNextLevelIndex(1, 2, -1, -1, &left, &right);
+  ASSERT_EQ(4, left);
+  ASSERT_EQ(3, right);
+  GetNextLevelIndex(1, 2, 0, -1, &left, &right);
+  ASSERT_EQ(4, left);
+  ASSERT_EQ(3, right);
+  GetNextLevelIndex(1, 2, 1, -1, &left, &right);
+  ASSERT_EQ(4, left);
+  ASSERT_EQ(4, right);
+  GetNextLevelIndex(1, 2, 1, 0, &left, &right);
+  ASSERT_EQ(4, left);
+  ASSERT_EQ(4, right);
+  GetNextLevelIndex(1, 2, 1, 1, &left, &right);
+  ASSERT_EQ(4, left);
+  ASSERT_EQ(4, right);
+  // level 2, 0
+  GetNextLevelIndex(2, 0, -1, -1, &left, &right);
+  ASSERT_EQ(0, left);
+  ASSERT_EQ(1, right);
+  GetNextLevelIndex(2, 0, 0, -1, &left, &right);
+  ASSERT_EQ(1, left);
+  ASSERT_EQ(1, right);
+  GetNextLevelIndex(2, 0, 1, -1, &left, &right);
+  ASSERT_EQ(1, left);
+  ASSERT_EQ(1, right);
+  GetNextLevelIndex(2, 0, 1, 0, &left, &right);
+  ASSERT_EQ(1, left);
+  ASSERT_EQ(1, right);
+  GetNextLevelIndex(2, 0, 1, 1, &left, &right);
+  ASSERT_EQ(1, left);
+  ASSERT_EQ(2, right);
+  // level 2, 1
+  GetNextLevelIndex(2, 1, -1, -1, &left, &right);
+  ASSERT_EQ(1, left);
+  ASSERT_EQ(1, right);
+  GetNextLevelIndex(2, 1, 0, -1, &left, &right);
+  ASSERT_EQ(1, left);
+  ASSERT_EQ(1, right);
+  GetNextLevelIndex(2, 1, 1, -1, &left, &right);
+  ASSERT_EQ(1, left);
+  ASSERT_EQ(2, right);
+  GetNextLevelIndex(2, 1, 1, 0, &left, &right);
+  ASSERT_EQ(2, left);
+  ASSERT_EQ(2, right);
+  GetNextLevelIndex(2, 1, 1, 1, &left, &right);
+  ASSERT_EQ(2, left);
+  ASSERT_EQ(2, right);
+  // level 2, [2 - 4], no overlap
+  for (uint32_t f = 2; f <= 4; ++f) {
+    GetNextLevelIndex(2, f, -1, -1, &left, &right);
+    ASSERT_EQ(f == 2 ? 2 : 3, left);
+    ASSERT_EQ(2, right);
+    GetNextLevelIndex(2, f, 0, -1, &left, &right);
+    ASSERT_EQ(3, left);
+    ASSERT_EQ(2, right);
+    GetNextLevelIndex(2, f, 1, -1, &left, &right);
+    ASSERT_EQ(3, left);
+    ASSERT_EQ(2, right);
+    GetNextLevelIndex(2, f, 1, 0, &left, &right);
+    ASSERT_EQ(3, left);
+    ASSERT_EQ(2, right);
+    GetNextLevelIndex(2, f, 1, 1, &left, &right);
+    ASSERT_EQ(3, left);
+    ASSERT_EQ(2, right);
+  }
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
--- a/db/filename.cc
+++ b/db/filename.cc
@@ -0,0 +1,266 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/filename.h"
+
+#include <ctype.h>
+#include <stdio.h>
+#include "db/dbformat.h"
+#include "rocksdb/env.h"
+#include "util/logging.h"
+
+namespace rocksdb {
+
+// Given a path, flatten the path name by replacing all chars not in
+// {[0-9,a-z,A-Z,-,_,.]} with _. And append '\0' at the end.
+// Return the number of chars stored in dest not including the trailing '\0'.
+static int FlattenPath(const std::string& path, char* dest, int len) {
+  int write_idx = 0;
+  int i = 0;
+  int src_len = path.size();
+
+  while (i < src_len && write_idx < len - 1) {
+    if ((path[i] >= 'a' && path[i] <= 'z') ||
+        (path[i] >= '0' && path[i] <= '9') ||
+        (path[i] >= 'A' && path[i] <= 'Z') ||
+        path[i] == '-' ||
+        path[i] == '.' ||
+        path[i] == '_'){
+      dest[write_idx++] = path[i];
+    } else {
+      if (i > 0)
+        dest[write_idx++] = '_';
+    }
+    i++;
+  }
+
+  dest[write_idx] = '\0';
+  return write_idx;
+}
+
+static std::string MakeFileName(const std::string& name, uint64_t number,
+                                const char* suffix) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "/%06llu.%s",
+           static_cast<unsigned long long>(number),
+           suffix);
+  return name + buf;
+}
+
+std::string LogFileName(const std::string& name, uint64_t number) {
+  assert(number > 0);
+  return MakeFileName(name, number, "log");
+}
+
+std::string ArchivalDirectory(const std::string& dir) {
+  return dir + "/" + ARCHIVAL_DIR;
+}
+std::string ArchivedLogFileName(const std::string& name, uint64_t number) {
+  assert(number > 0);
+  return MakeFileName(name + "/" + ARCHIVAL_DIR, number, "log");
+}
+
+std::string TableFileName(const std::string& name, uint64_t number) {
+  assert(number > 0);
+  return MakeFileName(name, number, "sst");
+}
+
+std::string DescriptorFileName(const std::string& dbname, uint64_t number) {
+  assert(number > 0);
+  char buf[100];
+  snprintf(buf, sizeof(buf), "/MANIFEST-%06llu",
+           static_cast<unsigned long long>(number));
+  return dbname + buf;
+}
+
+std::string CurrentFileName(const std::string& dbname) {
+  return dbname + "/CURRENT";
+}
+
+std::string LockFileName(const std::string& dbname) {
+  return dbname + "/LOCK";
+}
+
+std::string TempFileName(const std::string& dbname, uint64_t number) {
+  return MakeFileName(dbname, number, "dbtmp");
+}
+
+std::string InfoLogFileName(const std::string& dbname,
+    const std::string& db_path, const std::string& log_dir) {
+  if (log_dir.empty())
+    return dbname + "/LOG";
+
+  char flatten_db_path[256];
+  FlattenPath(db_path, flatten_db_path, 256);
+  return log_dir + "/" + flatten_db_path + "_LOG";
+}
+
+// Return the name of the old info log file for "dbname".
+std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts,
+    const std::string& db_path, const std::string& log_dir) {
+  char buf[50];
+  snprintf(buf, sizeof(buf), "%llu", static_cast<unsigned long long>(ts));
+
+  if (log_dir.empty())
+    return dbname + "/LOG.old." + buf;
+
+  char flatten_db_path[256];
+  FlattenPath(db_path, flatten_db_path, 256);
+  return log_dir + "/" + flatten_db_path + "_LOG.old." + buf;
+}
+
+std::string MetaDatabaseName(const std::string& dbname, uint64_t number) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "/METADB-%llu",
+           static_cast<unsigned long long>(number));
+  return dbname + buf;
+}
+
+std::string IdentityFileName(const std::string& dbname) {
+  return dbname + "/IDENTITY";
+}
+
+// Owned filenames have the form:
+//    dbname/IDENTITY
+//    dbname/CURRENT
+//    dbname/LOCK
+//    dbname/LOG
+//    dbname/LOG.old.[0-9]+
+//    dbname/MANIFEST-[0-9]+
+//    dbname/[0-9]+.(log|sst)
+//    dbname/METADB-[0-9]+
+//    Disregards / at the beginning
+bool ParseFileName(const std::string& fname,
+                   uint64_t* number,
+                   FileType* type,
+                   WalFileType* log_type) {
+  Slice rest(fname);
+  if (fname.length() > 1 && fname[0] == '/') {
+    rest.remove_prefix(1);
+  }
+  if (rest == "IDENTITY") {
+    *number = 0;
+    *type = kIdentityFile;
+  } else if (rest == "CURRENT") {
+    *number = 0;
+    *type = kCurrentFile;
+  } else if (rest == "LOCK") {
+    *number = 0;
+    *type = kDBLockFile;
+  } else if (rest == "LOG" || rest == "LOG.old") {
+    *number = 0;
+    *type = kInfoLogFile;
+  } else if (rest.starts_with("LOG.old.")) {
+    uint64_t ts_suffix;
+    // sizeof also counts the trailing '\0'.
+    rest.remove_prefix(sizeof("LOG.old.") - 1);
+    if (!ConsumeDecimalNumber(&rest, &ts_suffix)) {
+      return false;
+    }
+    *number = ts_suffix;
+    *type = kInfoLogFile;
+  } else if (rest.starts_with("MANIFEST-")) {
+    rest.remove_prefix(strlen("MANIFEST-"));
+    uint64_t num;
+    if (!ConsumeDecimalNumber(&rest, &num)) {
+      return false;
+    }
+    if (!rest.empty()) {
+      return false;
+    }
+    *type = kDescriptorFile;
+    *number = num;
+  } else if (rest.starts_with("METADB-")) {
+    rest.remove_prefix(strlen("METADB-"));
+    uint64_t num;
+    if (!ConsumeDecimalNumber(&rest, &num)) {
+      return false;
+    }
+    if (!rest.empty()) {
+      return false;
+    }
+    *type = kMetaDatabase;
+    *number = num;
+  } else {
+    // Avoid strtoull() to keep filename format independent of the
+    // current locale
+    bool archive_dir_found = false;
+    if (rest.starts_with(ARCHIVAL_DIR)) {
+      if (rest.size() <= ARCHIVAL_DIR.size()) {
+        return false;
+      }
+      rest.remove_prefix(ARCHIVAL_DIR.size() + 1); // Add 1 to remove / also
+      if (log_type) {
+        *log_type = kArchivedLogFile;
+      }
+      archive_dir_found = true;
+    }
+    uint64_t num;
+    if (!ConsumeDecimalNumber(&rest, &num)) {
+      return false;
+    }
+    Slice suffix = rest;
+    if (suffix == Slice(".log")) {
+      *type = kLogFile;
+      if (log_type && !archive_dir_found) {
+        *log_type = kAliveLogFile;
+      }
+    } else if (archive_dir_found) {
+      return false; // Archive dir can contain only log files
+    } else if (suffix == Slice(".sst")) {
+      *type = kTableFile;
+    } else if (suffix == Slice(".dbtmp")) {
+      *type = kTempFile;
+    } else {
+      return false;
+    }
+    *number = num;
+  }
+  return true;
+}
+
+Status SetCurrentFile(Env* env, const std::string& dbname,
+                      uint64_t descriptor_number,
+                      Directory* directory_to_fsync) {
+  // Remove leading "dbname/" and add newline to manifest file name
+  std::string manifest = DescriptorFileName(dbname, descriptor_number);
+  Slice contents = manifest;
+  assert(contents.starts_with(dbname + "/"));
+  contents.remove_prefix(dbname.size() + 1);
+  std::string tmp = TempFileName(dbname, descriptor_number);
+  Status s = WriteStringToFile(env, contents.ToString() + "\n", tmp, true);
+  if (s.ok()) {
+    s = env->RenameFile(tmp, CurrentFileName(dbname));
+  }
+  if (s.ok()) {
+    if (directory_to_fsync != nullptr) {
+      directory_to_fsync->Fsync();
+    }
+  } else {
+    env->DeleteFile(tmp);
+  }
+  return s;
+}
+
+Status SetIdentityFile(Env* env, const std::string& dbname) {
+  std::string id = env->GenerateUniqueId();
+  assert(!id.empty());
+  // Reserve the filename dbname/000000.dbtmp for the temporary identity file
+  std::string tmp = TempFileName(dbname, 0);
+  Status s = WriteStringToFile(env, id, tmp, true);
+  if (s.ok()) {
+    s = env->RenameFile(tmp, IdentityFileName(dbname));
+  }
+  if (!s.ok()) {
+    env->DeleteFile(tmp);
+  }
+  return s;
+}
+
+}  // namespace rocksdb
--- a/db/filename.h
+++ b/db/filename.h
@@ -0,0 +1,110 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// File names used by DB code
+
+#pragma once
+#include <stdint.h>
+#include <string>
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/transaction_log.h"
+#include "port/port.h"
+
+namespace rocksdb {
+
+class Env;
+class Directory;
+
+enum FileType {
+  kLogFile,
+  kDBLockFile,
+  kTableFile,
+  kDescriptorFile,
+  kCurrentFile,
+  kTempFile,
+  kInfoLogFile,  // Either the current one, or an old one
+  kMetaDatabase,
+  kIdentityFile
+};
+
+// Return the name of the log file with the specified number
+// in the db named by "dbname".  The result will be prefixed with
+// "dbname".
+extern std::string LogFileName(const std::string& dbname, uint64_t number);
+
+static const std::string ARCHIVAL_DIR = "archive";
+
+extern std::string ArchivalDirectory(const std::string& dbname);
+
+//  Return the name of the archived log file with the specified number
+//  in the db named by "dbname". The result will be prefixed with "dbname".
+extern std::string ArchivedLogFileName(const std::string& dbname,
+                                       uint64_t num);
+
+// Return the name of the sstable with the specified number
+// in the db named by "dbname".  The result will be prefixed with
+// "dbname".
+extern std::string TableFileName(const std::string& dbname, uint64_t number);
+
+// Return the name of the descriptor file for the db named by
+// "dbname" and the specified incarnation number.  The result will be
+// prefixed with "dbname".
+extern std::string DescriptorFileName(const std::string& dbname,
+                                      uint64_t number);
+
+// Return the name of the current file.  This file contains the name
+// of the current manifest file.  The result will be prefixed with
+// "dbname".
+extern std::string CurrentFileName(const std::string& dbname);
+
+// Return the name of the lock file for the db named by
+// "dbname".  The result will be prefixed with "dbname".
+extern std::string LockFileName(const std::string& dbname);
+
+// Return the name of a temporary file owned by the db named "dbname".
+// The result will be prefixed with "dbname".
+extern std::string TempFileName(const std::string& dbname, uint64_t number);
+
+// Return the name of the info log file for "dbname".
+extern std::string InfoLogFileName(const std::string& dbname,
+    const std::string& db_path="", const std::string& log_dir="");
+
+// Return the name of the old info log file for "dbname".
+extern std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts,
+    const std::string& db_path="", const std::string& log_dir="");
+
+// Return the name to use for a metadatabase. The result will be prefixed with
+// "dbname".
+extern std::string MetaDatabaseName(const std::string& dbname,
+                                    uint64_t number);
+
+// Return the name of the Identity file which stores a unique number for the db
+// that will get regenerated if the db loses all its data and is recreated fresh
+// either from a backup-image or empty
+extern std::string IdentityFileName(const std::string& dbname);
+
+// If filename is a rocksdb file, store the type of the file in *type.
+// The number encoded in the filename is stored in *number.  If the
+// filename was successfully parsed, returns true.  Else return false.
+extern bool ParseFileName(const std::string& filename,
+                          uint64_t* number,
+                          FileType* type,
+                          WalFileType* log_type = nullptr);
+
+// Make the CURRENT file point to the descriptor file with the
+// specified number.
+extern Status SetCurrentFile(Env* env, const std::string& dbname,
+                             uint64_t descriptor_number,
+                             Directory* directory_to_fsync);
+
+// Make the IDENTITY file for the db
+extern Status SetIdentityFile(Env* env, const std::string& dbname);
+
+}  // namespace rocksdb
--- a/db/filename_test.cc
+++ b/db/filename_test.cc
@@ -0,0 +1,140 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/filename.h"
+
+#include "db/dbformat.h"
+#include "port/port.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+class FileNameTest { };
+
+TEST(FileNameTest, Parse) {
+  Slice db;
+  FileType type;
+  uint64_t number;
+
+  // Successful parses
+  static struct {
+    const char* fname;
+    uint64_t number;
+    FileType type;
+  } cases[] = {
+    { "100.log",            100,   kLogFile },
+    { "0.log",              0,     kLogFile },
+    { "0.sst",              0,     kTableFile },
+    { "CURRENT",            0,     kCurrentFile },
+    { "LOCK",               0,     kDBLockFile },
+    { "MANIFEST-2",         2,     kDescriptorFile },
+    { "MANIFEST-7",         7,     kDescriptorFile },
+    { "METADB-2",           2,     kMetaDatabase },
+    { "METADB-7",           7,     kMetaDatabase },
+    { "LOG",                0,     kInfoLogFile },
+    { "LOG.old",            0,     kInfoLogFile },
+    { "18446744073709551615.log", 18446744073709551615ull, kLogFile },
+  };
+  for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
+    std::string f = cases[i].fname;
+    ASSERT_TRUE(ParseFileName(f, &number, &type)) << f;
+    ASSERT_EQ(cases[i].type, type) << f;
+    ASSERT_EQ(cases[i].number, number) << f;
+  }
+
+  // Errors
+  static const char* errors[] = {
+    "",
+    "foo",
+    "foo-dx-100.log",
+    ".log",
+    "",
+    "manifest",
+    "CURREN",
+    "CURRENTX",
+    "MANIFES",
+    "MANIFEST",
+    "MANIFEST-",
+    "XMANIFEST-3",
+    "MANIFEST-3x",
+    "META",
+    "METADB",
+    "METADB-",
+    "XMETADB-3",
+    "METADB-3x",
+    "LOC",
+    "LOCKx",
+    "LO",
+    "LOGx",
+    "18446744073709551616.log",
+    "184467440737095516150.log",
+    "100",
+    "100.",
+    "100.lop"
+  };
+  for (unsigned int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) {
+    std::string f = errors[i];
+    ASSERT_TRUE(!ParseFileName(f, &number, &type)) << f;
+  };
+}
+
+TEST(FileNameTest, Construction) {
+  uint64_t number;
+  FileType type;
+  std::string fname;
+
+  fname = CurrentFileName("foo");
+  ASSERT_EQ("foo/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(0U, number);
+  ASSERT_EQ(kCurrentFile, type);
+
+  fname = LockFileName("foo");
+  ASSERT_EQ("foo/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(0U, number);
+  ASSERT_EQ(kDBLockFile, type);
+
+  fname = LogFileName("foo", 192);
+  ASSERT_EQ("foo/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(192U, number);
+  ASSERT_EQ(kLogFile, type);
+
+  fname = TableFileName("bar", 200);
+  ASSERT_EQ("bar/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(200U, number);
+  ASSERT_EQ(kTableFile, type);
+
+  fname = DescriptorFileName("bar", 100);
+  ASSERT_EQ("bar/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(100U, number);
+  ASSERT_EQ(kDescriptorFile, type);
+
+  fname = TempFileName("tmp", 999);
+  ASSERT_EQ("tmp/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(999U, number);
+  ASSERT_EQ(kTempFile, type);
+
+  fname = MetaDatabaseName("met", 100);
+  ASSERT_EQ("met/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(100U, number);
+  ASSERT_EQ(kMetaDatabase, type);
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
--- a/db/forward_iterator.cc
+++ b/db/forward_iterator.cc
@@ -0,0 +1,383 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+#include "db/forward_iterator.h"
+
+#include <string>
+#include <utility>
+#include <limits>
+#include "db/db_impl.h"
+#include "db/db_iter.h"
+#include "db/column_family.h"
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "table/merger.h"
+#include "db/dbformat.h"
+
+namespace rocksdb {
+
+// Usage:
+//     LevelIterator iter;
+//     iter.SetFileIndex(file_index);
+//     iter.Seek(target);
+//     iter.Next()
+class LevelIterator : public Iterator {
+ public:
+  LevelIterator(const ColumnFamilyData* const cfd,
+      const ReadOptions& read_options,
+      const std::vector<FileMetaData*>& files)
+    : cfd_(cfd), read_options_(read_options), files_(files), valid_(false),
+      file_index_(std::numeric_limits<uint32_t>::max()) {}
+
+  void SetFileIndex(uint32_t file_index) {
+    assert(file_index < files_.size());
+    if (file_index != file_index_) {
+      file_index_ = file_index;
+      file_iter_.reset(cfd_->table_cache()->NewIterator(
+          read_options_, *(cfd_->soptions()), cfd_->internal_comparator(),
+          *(files_[file_index_]), nullptr /* table_reader_ptr */, false));
+    }
+    valid_ = false;
+  }
+  void SeekToLast() override {
+    status_ = Status::NotSupported("LevelIterator::SeekToLast()");
+    valid_ = false;
+  }
+  void Prev() {
+    status_ = Status::NotSupported("LevelIterator::Prev()");
+    valid_ = false;
+  }
+  bool Valid() const override {
+    return valid_;
+  }
+  void SeekToFirst() override {
+    SetFileIndex(0);
+    file_iter_->SeekToFirst();
+    valid_ = file_iter_->Valid();
+  }
+  void Seek(const Slice& internal_key) override {
+    assert(file_iter_ != nullptr);
+    file_iter_->Seek(internal_key);
+    valid_ = file_iter_->Valid();
+    assert(valid_);
+  }
+  void Next() override {
+    assert(valid_);
+    file_iter_->Next();
+    while (!file_iter_->Valid()) {
+      if (file_index_ + 1 >= files_.size()) {
+        valid_ = false;
+        return;
+      }
+      SetFileIndex(file_index_ + 1);
+      file_iter_->SeekToFirst();
+    }
+    valid_ = file_iter_->Valid();
+  }
+  Slice key() const override {
+    assert(valid_);
+    return file_iter_->key();
+  }
+  Slice value() const override {
+    assert(valid_);
+    return file_iter_->value();
+  }
+  Status status() const override {
+    return status_;
+  }
+
+ private:
+  const ColumnFamilyData* const cfd_;
+  const ReadOptions& read_options_;
+  const std::vector<FileMetaData*>& files_;
+
+  bool valid_;
+  uint32_t file_index_;
+  Status status_;
+  std::unique_ptr<Iterator> file_iter_;
+};
+
+ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options,
+                                 ColumnFamilyData* cfd)
+    : db_(db),
+      read_options_(read_options),
+      cfd_(cfd),
+      prefix_extractor_(cfd->options()->prefix_extractor.get()),
+      user_comparator_(cfd->user_comparator()),
+      immutable_min_heap_(MinIterComparator(&cfd_->internal_comparator())),
+      sv_(nullptr),
+      mutable_iter_(nullptr),
+      current_(nullptr),
+      valid_(false),
+      is_prev_set_(false) {}
+
+ForwardIterator::~ForwardIterator() {
+  Cleanup();
+}
+
+void ForwardIterator::Cleanup() {
+  delete mutable_iter_;
+  for (auto* m : imm_iters_) {
+    delete m;
+  }
+  imm_iters_.clear();
+  for (auto* f : l0_iters_) {
+    delete f;
+  }
+  l0_iters_.clear();
+  for (auto* l : level_iters_) {
+    delete l;
+  }
+  level_iters_.clear();
+
+  if (sv_ != nullptr && sv_->Unref()) {
+    DBImpl::DeletionState deletion_state;
+    db_->mutex_.Lock();
+    sv_->Cleanup();
+    db_->FindObsoleteFiles(deletion_state, false, true);
+    db_->mutex_.Unlock();
+    delete sv_;
+    if (deletion_state.HaveSomethingToDelete()) {
+      db_->PurgeObsoleteFiles(deletion_state);
+    }
+  }
+}
+
+bool ForwardIterator::Valid() const {
+  return valid_;
+}
+
+void ForwardIterator::SeekToFirst() {
+  if (sv_ == nullptr ||
+      sv_ ->version_number != cfd_->GetSuperVersionNumber()) {
+    RebuildIterators();
+  }
+  SeekInternal(Slice(), true);
+}
+
+void ForwardIterator::Seek(const Slice& internal_key) {
+  if (sv_ == nullptr ||
+      sv_ ->version_number != cfd_->GetSuperVersionNumber()) {
+    RebuildIterators();
+  }
+  SeekInternal(internal_key, false);
+}
+
+void ForwardIterator::SeekInternal(const Slice& internal_key,
+                                   bool seek_to_first) {
+  // mutable
+  seek_to_first ? mutable_iter_->SeekToFirst() :
+                  mutable_iter_->Seek(internal_key);
+
+  // immutable
+  // TODO(ljin): NeedToSeekImmutable has negative impact on performance
+  // if it turns to need to seek immutable often. We probably want to have
+  // an option to turn it off.
+  if (seek_to_first || NeedToSeekImmutable(internal_key)) {
+    {
+      auto tmp = MinIterHeap(MinIterComparator(&cfd_->internal_comparator()));
+      immutable_min_heap_.swap(tmp);
+    }
+    for (auto* m : imm_iters_) {
+      seek_to_first ? m->SeekToFirst() : m->Seek(internal_key);
+      if (m->Valid()) {
+        immutable_min_heap_.push(m);
+      }
+    }
+
+    auto* files = sv_->current->files_;
+    for (uint32_t i = 0; i < files[0].size(); ++i) {
+      if (seek_to_first) {
+        l0_iters_[i]->SeekToFirst();
+      } else {
+        // If the target key passes over the larget key, we are sure Next()
+        // won't go over this file.
+        if (user_comparator_->Compare(ExtractUserKey(internal_key),
+              files[0][i]->largest.user_key()) > 0) {
+          continue;
+        }
+        l0_iters_[i]->Seek(internal_key);
+      }
+      if (l0_iters_[i]->Valid()) {
+        immutable_min_heap_.push(l0_iters_[i]);
+      }
+    }
+    for (int32_t level = 1; level < sv_->current->NumberLevels(); ++level) {
+      if (files[level].empty()) {
+        continue;
+      }
+      assert(level_iters_[level - 1] != nullptr);
+      uint32_t f_idx = 0;
+      if (!seek_to_first) {
+        f_idx = FindFileInRange(
+            files[level], internal_key, 0, files[level].size());
+      }
+      if (f_idx < files[level].size()) {
+        level_iters_[level - 1]->SetFileIndex(f_idx);
+        seek_to_first ? level_iters_[level - 1]->SeekToFirst() :
+                        level_iters_[level - 1]->Seek(internal_key);
+        if (level_iters_[level - 1]->Valid()) {
+          immutable_min_heap_.push(level_iters_[level - 1]);
+        }
+      }
+    }
+
+    if (seek_to_first || immutable_min_heap_.empty()) {
+      is_prev_set_ = false;
+    } else {
+      prev_key_.SetKey(internal_key);
+      is_prev_set_ = true;
+    }
+  }
+
+  UpdateCurrent();
+}
+
+void ForwardIterator::Next() {
+  assert(valid_);
+
+  if (sv_ == nullptr ||
+      sv_ ->version_number != cfd_->GetSuperVersionNumber()) {
+    std::string current_key = key().ToString();
+    Slice old_key(current_key.data(), current_key.size());
+
+    RebuildIterators();
+    SeekInternal(old_key, false);
+    if (!valid_ || key().compare(old_key) != 0) {
+      return;
+    }
+  } else if (current_ != mutable_iter_) {
+    // It is going to advance immutable iterator
+    prev_key_.SetKey(current_->key());
+    is_prev_set_ = true;
+  }
+
+  current_->Next();
+  if (current_->Valid() && current_ != mutable_iter_) {
+    immutable_min_heap_.push(current_);
+  }
+  UpdateCurrent();
+}
+
+Slice ForwardIterator::key() const {
+  assert(valid_);
+  return current_->key();
+}
+
+Slice ForwardIterator::value() const {
+  assert(valid_);
+  return current_->value();
+}
+
+Status ForwardIterator::status() const {
+  if (!status_.ok()) {
+    return status_;
+  } else if (!mutable_iter_->status().ok()) {
+    return mutable_iter_->status();
+  }
+  return Status::OK();
+}
+
+void ForwardIterator::RebuildIterators() {
+  // Clean up
+  Cleanup();
+  // New
+  sv_ = cfd_->GetReferencedSuperVersion(&(db_->mutex_));
+  mutable_iter_ = sv_->mem->NewIterator(read_options_);
+  sv_->imm->AddIterators(read_options_, &imm_iters_);
+  const auto& l0_files = sv_->current->files_[0];
+  l0_iters_.reserve(l0_files.size());
+  for (const auto* l0 : l0_files) {
+    l0_iters_.push_back(cfd_->table_cache()->NewIterator(
+        read_options_, *cfd_->soptions(), cfd_->internal_comparator(), *l0));
+  }
+  level_iters_.reserve(sv_->current->NumberLevels() - 1);
+  for (int32_t level = 1; level < sv_->current->NumberLevels(); ++level) {
+    if (sv_->current->files_[level].empty()) {
+      level_iters_.push_back(nullptr);
+    } else {
+      level_iters_.push_back(new LevelIterator(cfd_, read_options_,
+          sv_->current->files_[level]));
+    }
+  }
+
+  current_ = nullptr;
+  is_prev_set_ = false;
+}
+
+void ForwardIterator::UpdateCurrent() {
+  if (immutable_min_heap_.empty() && !mutable_iter_->Valid()) {
+    current_ = nullptr;
+  } else if (immutable_min_heap_.empty()) {
+    current_ = mutable_iter_;
+  } else if (!mutable_iter_->Valid()) {
+    current_ = immutable_min_heap_.top();
+    immutable_min_heap_.pop();
+  } else {
+    current_ = immutable_min_heap_.top();
+    assert(current_ != nullptr);
+    assert(current_->Valid());
+    int cmp = cfd_->internal_comparator().InternalKeyComparator::Compare(
+        mutable_iter_->key(), current_->key()) > 0;
+    assert(cmp != 0);
+    if (cmp > 0) {
+      immutable_min_heap_.pop();
+    } else {
+      current_ = mutable_iter_;
+    }
+  }
+  valid_ = (current_ != nullptr);
+  if (!status_.ok()) {
+    status_ = Status::OK();
+  }
+}
+
+bool ForwardIterator::NeedToSeekImmutable(const Slice& target) {
+  if (!is_prev_set_) {
+    return true;
+  }
+  Slice prev_key = prev_key_.GetKey();
+  if (prefix_extractor_ && prefix_extractor_->Transform(target).compare(
+    prefix_extractor_->Transform(prev_key)) != 0) {
+    return true;
+  }
+  if (cfd_->internal_comparator().InternalKeyComparator::Compare(
+        prev_key, target) >= 0) {
+    return true;
+  }
+  if (immutable_min_heap_.empty() ||
+      cfd_->internal_comparator().InternalKeyComparator::Compare(
+          target, current_ == mutable_iter_ ? immutable_min_heap_.top()->key()
+                                            : current_->key()) > 0) {
+    return true;
+  }
+  return false;
+}
+
+uint32_t ForwardIterator::FindFileInRange(
+    const std::vector<FileMetaData*>& files, const Slice& internal_key,
+    uint32_t left, uint32_t right) {
+  while (left < right) {
+    uint32_t mid = (left + right) / 2;
+    const FileMetaData* f = files[mid];
+    if (cfd_->internal_comparator().InternalKeyComparator::Compare(
+          f->largest.Encode(), internal_key) < 0) {
+      // Key at "mid.largest" is < "target".  Therefore all
+      // files at or before "mid" are uninteresting.
+      left = mid + 1;
+    } else {
+      // Key at "mid.largest" is >= "target".  Therefore all files
+      // after "mid" are uninteresting.
+      right = mid;
+    }
+  }
+  return right;
+}
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
--- a/db/forward_iterator.h
+++ b/db/forward_iterator.h
@@ -0,0 +1,105 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+#include <queue>
+
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "db/dbformat.h"
+
+namespace rocksdb {
+
+class DBImpl;
+class Env;
+struct SuperVersion;
+class ColumnFamilyData;
+class LevelIterator;
+struct FileMetaData;
+
+class MinIterComparator {
+ public:
+  explicit MinIterComparator(const Comparator* comparator) :
+    comparator_(comparator) {}
+
+  bool operator()(Iterator* a, Iterator* b) {
+    return comparator_->Compare(a->key(), b->key()) > 0;
+  }
+ private:
+  const Comparator* comparator_;
+};
+
+typedef std::priority_queue<Iterator*,
+          std::vector<Iterator*>,
+          MinIterComparator> MinIterHeap;
+
+/**
+ * ForwardIterator is a special type of iterator that only supports Seek()
+ * and Next(). It is expected to perform better than TailingIterator by
+ * removing the encapsulation and making all information accessible within
+ * the iterator. At the current implementation, snapshot is taken at the
+ * time Seek() is called. The Next() followed do not see new values after.
+ */
+class ForwardIterator : public Iterator {
+ public:
+  ForwardIterator(DBImpl* db, const ReadOptions& read_options,
+                  ColumnFamilyData* cfd);
+  virtual ~ForwardIterator();
+
+  void SeekToLast() override {
+    status_ = Status::NotSupported("ForwardIterator::SeekToLast()");
+    valid_ = false;
+  }
+  void Prev() {
+    status_ = Status::NotSupported("ForwardIterator::Prev");
+    valid_ = false;
+  }
+
+  virtual bool Valid() const override;
+  void SeekToFirst() override;
+  virtual void Seek(const Slice& target) override;
+  virtual void Next() override;
+  virtual Slice key() const override;
+  virtual Slice value() const override;
+  virtual Status status() const override;
+
+ private:
+  void Cleanup();
+  void RebuildIterators();
+  void SeekInternal(const Slice& internal_key, bool seek_to_first);
+  void UpdateCurrent();
+  bool NeedToSeekImmutable(const Slice& internal_key);
+  uint32_t FindFileInRange(
+    const std::vector<FileMetaData*>& files, const Slice& internal_key,
+    uint32_t left, uint32_t right);
+
+  DBImpl* const db_;
+  const ReadOptions read_options_;
+  ColumnFamilyData* const cfd_;
+  const SliceTransform* const prefix_extractor_;
+  const Comparator* user_comparator_;
+  MinIterHeap immutable_min_heap_;
+
+  SuperVersion* sv_;
+  Iterator* mutable_iter_;
+  std::vector<Iterator*> imm_iters_;
+  std::vector<Iterator*> l0_iters_;
+  std::vector<LevelIterator*> level_iters_;
+  Iterator* current_;
+  // internal iterator status
+  Status status_;
+  bool valid_;
+
+  IterKey prev_key_;
+  bool is_prev_set_;
+};
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
--- a/db/internal_stats.cc
+++ b/db/internal_stats.cc
@@ -0,0 +1,369 @@
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/internal_stats.h"
+#include "db/column_family.h"
+
+#include <vector>
+
+namespace rocksdb {
+
+DBPropertyType GetPropertyType(const Slice& property) {
+  Slice in = property;
+  Slice prefix("rocksdb.");
+  if (!in.starts_with(prefix)) return kUnknown;
+  in.remove_prefix(prefix.size());
+
+  if (in.starts_with("num-files-at-level")) {
+    return kNumFilesAtLevel;
+  } else if (in == "levelstats") {
+    return kLevelStats;
+  } else if (in == "stats") {
+    return kStats;
+  } else if (in == "sstables") {
+    return kSsTables;
+  } else if (in == "num-immutable-mem-table") {
+    return kNumImmutableMemTable;
+  } else if (in == "mem-table-flush-pending") {
+    return kMemtableFlushPending;
+  } else if (in == "compaction-pending") {
+    return kCompactionPending;
+  } else if (in == "background-errors") {
+    return kBackgroundErrors;
+  } else if (in == "cur-size-active-mem-table") {
+    return kCurSizeActiveMemTable;
+  } else if (in == "num-entries-active-mem-table") {
+    return kNumEntriesInMutableMemtable;
+  } else if (in == "num-entries-imm-mem-tables") {
+    return kNumEntriesInImmutableMemtable;
+  }
+  return kUnknown;
+}
+
+bool InternalStats::GetProperty(DBPropertyType property_type,
+                                const Slice& property, std::string* value,
+                                ColumnFamilyData* cfd) {
+  Version* current = cfd->current();
+  Slice in = property;
+
+  switch (property_type) {
+    case kNumFilesAtLevel: {
+      in.remove_prefix(strlen("rocksdb.num-files-at-level"));
+      uint64_t level;
+      bool ok = ConsumeDecimalNumber(&in, &level) && in.empty();
+      if (!ok || (int)level >= number_levels_) {
+        return false;
+      } else {
+        char buf[100];
+        snprintf(buf, sizeof(buf), "%d",
+                 current->NumLevelFiles(static_cast<int>(level)));
+        *value = buf;
+        return true;
+      }
+    }
+    case kLevelStats: {
+      char buf[1000];
+      snprintf(buf, sizeof(buf),
+               "Level Files Size(MB)\n"
+               "--------------------\n");
+      value->append(buf);
+
+      for (int level = 0; level < number_levels_; level++) {
+        snprintf(buf, sizeof(buf), "%3d %8d %8.0f\n", level,
+                 current->NumLevelFiles(level),
+                 current->NumLevelBytes(level) / 1048576.0);
+        value->append(buf);
+      }
+      return true;
+    }
+    case kStats: {
+      char buf[1000];
+
+      uint64_t wal_bytes = 0;
+      uint64_t wal_synced = 0;
+      uint64_t user_bytes_written = 0;
+      uint64_t write_other = 0;
+      uint64_t write_self = 0;
+      uint64_t write_with_wal = 0;
+      uint64_t total_bytes_written = 0;
+      uint64_t total_bytes_read = 0;
+      uint64_t micros_up = env_->NowMicros() - started_at_;
+      // Add "+1" to make sure seconds_up is > 0 and avoid NaN later
+      double seconds_up = (micros_up + 1) / 1000000.0;
+      uint64_t total_slowdown = 0;
+      uint64_t total_slowdown_count = 0;
+      uint64_t interval_bytes_written = 0;
+      uint64_t interval_bytes_read = 0;
+      uint64_t interval_bytes_new = 0;
+      double interval_seconds_up = 0;
+
+      if (statistics_) {
+        wal_bytes = statistics_->getTickerCount(WAL_FILE_BYTES);
+        wal_synced = statistics_->getTickerCount(WAL_FILE_SYNCED);
+        user_bytes_written = statistics_->getTickerCount(BYTES_WRITTEN);
+        write_other = statistics_->getTickerCount(WRITE_DONE_BY_OTHER);
+        write_self = statistics_->getTickerCount(WRITE_DONE_BY_SELF);
+        write_with_wal = statistics_->getTickerCount(WRITE_WITH_WAL);
+      }
+
+      snprintf(
+          buf, sizeof(buf),
+          "                               Compactions\n"
+          "Level  Files Size(MB) Score Time(sec)  Read(MB) Write(MB)    Rn(MB) "
+          " "
+          "Rnp1(MB)  Wnew(MB) RW-Amplify Read(MB/s) Write(MB/s)      Rn     "
+          "Rnp1 "
+          "    Wnp1     NewW    Count   msComp   msStall  Ln-stall Stall-cnt\n"
+          "--------------------------------------------------------------------"
+          "--"
+          "--------------------------------------------------------------------"
+          "--"
+          "----------------------------------------------------------------\n");
+      value->append(buf);
+      for (int level = 0; level < number_levels_; level++) {
+        int files = current->NumLevelFiles(level);
+        if (compaction_stats_[level].micros > 0 || files > 0) {
+          int64_t bytes_read = compaction_stats_[level].bytes_readn +
+                               compaction_stats_[level].bytes_readnp1;
+          int64_t bytes_new = compaction_stats_[level].bytes_written -
+                              compaction_stats_[level].bytes_readnp1;
+          double amplify =
+              (compaction_stats_[level].bytes_readn == 0)
+                  ? 0.0
+                  : (compaction_stats_[level].bytes_written +
+                     compaction_stats_[level].bytes_readnp1 +
+                     compaction_stats_[level].bytes_readn) /
+                        (double)compaction_stats_[level].bytes_readn;
+
+          total_bytes_read += bytes_read;
+          total_bytes_written += compaction_stats_[level].bytes_written;
+
+          uint64_t stalls = level == 0 ? (stall_counts_[LEVEL0_SLOWDOWN] +
+                                          stall_counts_[LEVEL0_NUM_FILES] +
+                                          stall_counts_[MEMTABLE_COMPACTION])
+                                       : stall_leveln_slowdown_count_[level];
+
+          double stall_us = level == 0 ? (stall_micros_[LEVEL0_SLOWDOWN] +
+                                          stall_micros_[LEVEL0_NUM_FILES] +
+                                          stall_micros_[MEMTABLE_COMPACTION])
+                                       : stall_leveln_slowdown_[level];
+
+          snprintf(buf, sizeof(buf),
+                   "%3d %8d %8.0f %5.1f %9.0f %9.0f %9.0f %9.0f %9.0f %9.0f "
+                   "%10.1f %9.1f %11.1f %8d %8d %8d %8d %8d %8d %9.1f %9.1f "
+                   "%9lu\n",
+                   level, files, current->NumLevelBytes(level) / 1048576.0,
+                   current->NumLevelBytes(level) /
+                       cfd->compaction_picker()->MaxBytesForLevel(level),
+                   compaction_stats_[level].micros / 1e6,
+                   bytes_read / 1048576.0,
+                   compaction_stats_[level].bytes_written / 1048576.0,
+                   compaction_stats_[level].bytes_readn / 1048576.0,
+                   compaction_stats_[level].bytes_readnp1 / 1048576.0,
+                   bytes_new / 1048576.0, amplify,
+                   // +1 to avoid division by 0
+                   (bytes_read / 1048576.0) /
+                       ((compaction_stats_[level].micros + 1) / 1000000.0),
+                   (compaction_stats_[level].bytes_written / 1048576.0) /
+                       ((compaction_stats_[level].micros + 1) / 1000000.0),
+                   compaction_stats_[level].files_in_leveln,
+                   compaction_stats_[level].files_in_levelnp1,
+                   compaction_stats_[level].files_out_levelnp1,
+                   compaction_stats_[level].files_out_levelnp1 -
+                       compaction_stats_[level].files_in_levelnp1,
+                   compaction_stats_[level].count,
+                   (int)((double)compaction_stats_[level].micros / 1000.0 /
+                         (compaction_stats_[level].count + 1)),
+                   (double)stall_us / 1000.0 / (stalls + 1),
+                   stall_us / 1000000.0, (unsigned long)stalls);
+          total_slowdown += stall_leveln_slowdown_[level];
+          total_slowdown_count += stall_leveln_slowdown_count_[level];
+          value->append(buf);
+        }
+      }
+
+      interval_bytes_new = user_bytes_written - last_stats_.ingest_bytes_;
+      interval_bytes_read =
+          total_bytes_read - last_stats_.compaction_bytes_read_;
+      interval_bytes_written =
+          total_bytes_written - last_stats_.compaction_bytes_written_;
+      interval_seconds_up = seconds_up - last_stats_.seconds_up_;
+
+      snprintf(buf, sizeof(buf), "Uptime(secs): %.1f total, %.1f interval\n",
+               seconds_up, interval_seconds_up);
+      value->append(buf);
+
+      snprintf(buf, sizeof(buf),
+               "Writes cumulative: %llu total, %llu batches, "
+               "%.1f per batch, %.2f ingest GB\n",
+               (unsigned long long)(write_other + write_self),
+               (unsigned long long)write_self,
+               (write_other + write_self) / (double)(write_self + 1),
+               user_bytes_written / (1048576.0 * 1024));
+      value->append(buf);
+
+      snprintf(buf, sizeof(buf),
+               "WAL cumulative: %llu WAL writes, %llu WAL syncs, "
+               "%.2f writes per sync, %.2f GB written\n",
+               (unsigned long long)write_with_wal,
+               (unsigned long long)wal_synced,
+               write_with_wal / (double)(wal_synced + 1),
+               wal_bytes / (1048576.0 * 1024));
+      value->append(buf);
+
+      snprintf(buf, sizeof(buf),
+               "Compaction IO cumulative (GB): "
+               "%.2f new, %.2f read, %.2f write, %.2f read+write\n",
+               user_bytes_written / (1048576.0 * 1024),
+               total_bytes_read / (1048576.0 * 1024),
+               total_bytes_written / (1048576.0 * 1024),
+               (total_bytes_read + total_bytes_written) / (1048576.0 * 1024));
+      value->append(buf);
+
+      snprintf(
+          buf, sizeof(buf),
+          "Compaction IO cumulative (MB/sec): "
+          "%.1f new, %.1f read, %.1f write, %.1f read+write\n",
+          user_bytes_written / 1048576.0 / seconds_up,
+          total_bytes_read / 1048576.0 / seconds_up,
+          total_bytes_written / 1048576.0 / seconds_up,
+          (total_bytes_read + total_bytes_written) / 1048576.0 / seconds_up);
+      value->append(buf);
+
+      // +1 to avoid divide by 0 and NaN
+      snprintf(
+          buf, sizeof(buf),
+          "Amplification cumulative: %.1f write, %.1f compaction\n",
+          (double)(total_bytes_written + wal_bytes) / (user_bytes_written + 1),
+          (double)(total_bytes_written + total_bytes_read + wal_bytes) /
+              (user_bytes_written + 1));
+      value->append(buf);
+
+      uint64_t interval_write_other = write_other - last_stats_.write_other_;
+      uint64_t interval_write_self = write_self - last_stats_.write_self_;
+
+      snprintf(buf, sizeof(buf),
+               "Writes interval: %llu total, %llu batches, "
+               "%.1f per batch, %.1f ingest MB\n",
+               (unsigned long long)(interval_write_other + interval_write_self),
+               (unsigned long long)interval_write_self,
+               (double)(interval_write_other + interval_write_self) /
+                   (interval_write_self + 1),
+               (user_bytes_written - last_stats_.ingest_bytes_) / 1048576.0);
+      value->append(buf);
+
+      uint64_t interval_write_with_wal =
+          write_with_wal - last_stats_.write_with_wal_;
+
+      uint64_t interval_wal_synced = wal_synced - last_stats_.wal_synced_;
+      uint64_t interval_wal_bytes = wal_bytes - last_stats_.wal_bytes_;
+
+      snprintf(buf, sizeof(buf),
+               "WAL interval: %llu WAL writes, %llu WAL syncs, "
+               "%.2f writes per sync, %.2f MB written\n",
+               (unsigned long long)interval_write_with_wal,
+               (unsigned long long)interval_wal_synced,
+               interval_write_with_wal / (double)(interval_wal_synced + 1),
+               interval_wal_bytes / (1048576.0 * 1024));
+      value->append(buf);
+
+      snprintf(buf, sizeof(buf),
+               "Compaction IO interval (MB): "
+               "%.2f new, %.2f read, %.2f write, %.2f read+write\n",
+               interval_bytes_new / 1048576.0, interval_bytes_read / 1048576.0,
+               interval_bytes_written / 1048576.0,
+               (interval_bytes_read + interval_bytes_written) / 1048576.0);
+      value->append(buf);
+
+      snprintf(buf, sizeof(buf),
+               "Compaction IO interval (MB/sec): "
+               "%.1f new, %.1f read, %.1f write, %.1f read+write\n",
+               interval_bytes_new / 1048576.0 / interval_seconds_up,
+               interval_bytes_read / 1048576.0 / interval_seconds_up,
+               interval_bytes_written / 1048576.0 / interval_seconds_up,
+               (interval_bytes_read + interval_bytes_written) / 1048576.0 /
+                   interval_seconds_up);
+      value->append(buf);
+
+      // +1 to avoid divide by 0 and NaN
+      snprintf(
+          buf, sizeof(buf),
+          "Amplification interval: %.1f write, %.1f compaction\n",
+          (double)(interval_bytes_written + wal_bytes) /
+              (interval_bytes_new + 1),
+          (double)(interval_bytes_written + interval_bytes_read + wal_bytes) /
+              (interval_bytes_new + 1));
+      value->append(buf);
+
+      snprintf(buf, sizeof(buf),
+               "Stalls(secs): %.3f level0_slowdown, %.3f level0_numfiles, "
+               "%.3f memtable_compaction, %.3f leveln_slowdown\n",
+               stall_micros_[LEVEL0_SLOWDOWN] / 1000000.0,
+               stall_micros_[LEVEL0_NUM_FILES] / 1000000.0,
+               stall_micros_[MEMTABLE_COMPACTION] / 1000000.0,
+               total_slowdown / 1000000.0);
+      value->append(buf);
+
+      snprintf(buf, sizeof(buf),
+               "Stalls(count): %lu level0_slowdown, %lu level0_numfiles, "
+               "%lu memtable_compaction, %lu leveln_slowdown\n",
+               (unsigned long)stall_counts_[LEVEL0_SLOWDOWN],
+               (unsigned long)stall_counts_[LEVEL0_NUM_FILES],
+               (unsigned long)stall_counts_[MEMTABLE_COMPACTION],
+               (unsigned long)total_slowdown_count);
+      value->append(buf);
+
+      last_stats_.compaction_bytes_read_ = total_bytes_read;
+      last_stats_.compaction_bytes_written_ = total_bytes_written;
+      last_stats_.ingest_bytes_ = user_bytes_written;
+      last_stats_.seconds_up_ = seconds_up;
+      last_stats_.wal_bytes_ = wal_bytes;
+      last_stats_.wal_synced_ = wal_synced;
+      last_stats_.write_with_wal_ = write_with_wal;
+      last_stats_.write_other_ = write_other;
+      last_stats_.write_self_ = write_self;
+
+      return true;
+    }
+    case kSsTables:
+      *value = current->DebugString();
+      return true;
+    case kNumImmutableMemTable:
+      *value = std::to_string(cfd->imm()->size());
+      return true;
+    case kMemtableFlushPending:
+      // Return number of mem tables that are ready to flush (made immutable)
+      *value = std::to_string(cfd->imm()->IsFlushPending() ? 1 : 0);
+      return true;
+    case kCompactionPending:
+      // 1 if the system already determines at least one compacdtion is needed.
+      // 0 otherwise,
+      *value = std::to_string(current->NeedsCompaction() ? 1 : 0);
+      return true;
+    case kBackgroundErrors:
+      // Accumulated number of  errors in background flushes or compactions.
+      *value = std::to_string(GetBackgroundErrorCount());
+      return true;
+    case kCurSizeActiveMemTable:
+      // Current size of the active memtable
+      *value = std::to_string(cfd->mem()->ApproximateMemoryUsage());
+      return true;
+    case kNumEntriesInMutableMemtable:
+      // Current size of the active memtable
+      *value = std::to_string(cfd->mem()->GetNumEntries());
+      return true;
+    case kNumEntriesInImmutableMemtable:
+      // Current size of the active memtable
+      *value = std::to_string(cfd->imm()->current()->GetTotalNumEntries());
+      return true;
+    default:
+      return false;
+  }
+}
+
+}  // namespace rocksdb
--- a/db/internal_stats.h
+++ b/db/internal_stats.h
@@ -0,0 +1,187 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+
+#pragma once
+#include "rocksdb/statistics.h"
+#include "util/statistics.h"
+#include "db/version_set.h"
+
+#include <vector>
+#include <string>
+
+class ColumnFamilyData;
+
+namespace rocksdb {
+
+class MemTableList;
+class DBImpl;
+
+enum DBPropertyType {
+  kNumFilesAtLevel,  // Number of files at a specific level
+  kLevelStats,       // Return number of files and total sizes of each level
+  kStats,            // Return general statitistics of DB
+  kSsTables,         // Return a human readable string of current SST files
+  kNumImmutableMemTable,   // Return number of immutable mem tables
+  kMemtableFlushPending,   // Return 1 if mem table flushing is pending,
+                           // otherwise 0.
+  kCompactionPending,      // Return 1 if a compaction is pending. Otherwise 0.
+  kBackgroundErrors,       // Return accumulated background errors encountered.
+  kCurSizeActiveMemTable,  // Return current size of the active memtable
+  kNumEntriesInMutableMemtable,    // Return number of entries in the mutable
+                                   // memtable.
+  kNumEntriesInImmutableMemtable,  // Return sum of number of entries in all
+                                   // the immutable mem tables.
+  kUnknown,
+};
+
+extern DBPropertyType GetPropertyType(const Slice& property);
+
+class InternalStats {
+ public:
+  enum WriteStallType {
+    LEVEL0_SLOWDOWN,
+    MEMTABLE_COMPACTION,
+    LEVEL0_NUM_FILES,
+    WRITE_STALLS_ENUM_MAX,
+  };
+
+  InternalStats(int num_levels, Env* env, Statistics* statistics)
+      : compaction_stats_(num_levels),
+        stall_micros_(WRITE_STALLS_ENUM_MAX, 0),
+        stall_counts_(WRITE_STALLS_ENUM_MAX, 0),
+        stall_leveln_slowdown_(num_levels, 0),
+        stall_leveln_slowdown_count_(num_levels, 0),
+        bg_error_count_(0),
+        number_levels_(num_levels),
+        statistics_(statistics),
+        env_(env),
+        started_at_(env->NowMicros()) {}
+
+  // Per level compaction stats.  compaction_stats_[level] stores the stats for
+  // compactions that produced data for the specified "level".
+  struct CompactionStats {
+    uint64_t micros;
+
+    // Bytes read from level N during compaction between levels N and N+1
+    int64_t bytes_readn;
+
+    // Bytes read from level N+1 during compaction between levels N and N+1
+    int64_t bytes_readnp1;
+
+    // Total bytes written during compaction between levels N and N+1
+    int64_t bytes_written;
+
+    // Files read from level N during compaction between levels N and N+1
+    int files_in_leveln;
+
+    // Files read from level N+1 during compaction between levels N and N+1
+    int files_in_levelnp1;
+
+    // Files written during compaction between levels N and N+1
+    int files_out_levelnp1;
+
+    // Number of compactions done
+    int count;
+
+    CompactionStats()
+        : micros(0),
+          bytes_readn(0),
+          bytes_readnp1(0),
+          bytes_written(0),
+          files_in_leveln(0),
+          files_in_levelnp1(0),
+          files_out_levelnp1(0),
+          count(0) {}
+
+    void Add(const CompactionStats& c) {
+      this->micros += c.micros;
+      this->bytes_readn += c.bytes_readn;
+      this->bytes_readnp1 += c.bytes_readnp1;
+      this->bytes_written += c.bytes_written;
+      this->files_in_leveln += c.files_in_leveln;
+      this->files_in_levelnp1 += c.files_in_levelnp1;
+      this->files_out_levelnp1 += c.files_out_levelnp1;
+      this->count += 1;
+    }
+  };
+
+  void AddCompactionStats(int level, const CompactionStats& stats) {
+    compaction_stats_[level].Add(stats);
+  }
+
+  void RecordWriteStall(WriteStallType write_stall_type, uint64_t micros) {
+    stall_micros_[write_stall_type] += micros;
+    stall_counts_[write_stall_type]++;
+  }
+
+  void RecordLevelNSlowdown(int level, uint64_t micros) {
+    stall_leveln_slowdown_[level] += micros;
+    stall_leveln_slowdown_count_[level] += micros;
+  }
+
+  uint64_t GetBackgroundErrorCount() const { return bg_error_count_; }
+
+  uint64_t BumpAndGetBackgroundErrorCount() { return ++bg_error_count_; }
+
+  bool GetProperty(DBPropertyType property_type, const Slice& property,
+                   std::string* value, ColumnFamilyData* cfd);
+
+ private:
+  std::vector<CompactionStats> compaction_stats_;
+
+  // Used to compute per-interval statistics
+  struct StatsSnapshot {
+    uint64_t compaction_bytes_read_;     // Bytes read by compaction
+    uint64_t compaction_bytes_written_;  // Bytes written by compaction
+    uint64_t ingest_bytes_;              // Bytes written by user
+    uint64_t wal_bytes_;                 // Bytes written to WAL
+    uint64_t wal_synced_;                // Number of times WAL is synced
+    uint64_t write_with_wal_;            // Number of writes that request WAL
+    // These count the number of writes processed by the calling thread or
+    // another thread.
+    uint64_t write_other_;
+    uint64_t write_self_;
+    double seconds_up_;
+
+    StatsSnapshot()
+        : compaction_bytes_read_(0),
+          compaction_bytes_written_(0),
+          ingest_bytes_(0),
+          wal_bytes_(0),
+          wal_synced_(0),
+          write_with_wal_(0),
+          write_other_(0),
+          write_self_(0),
+          seconds_up_(0) {}
+  };
+
+  // Counters from the previous time per-interval stats were computed
+  StatsSnapshot last_stats_;
+
+  // These count the number of microseconds for which MakeRoomForWrite stalls.
+  std::vector<uint64_t> stall_micros_;
+  std::vector<uint64_t> stall_counts_;
+  std::vector<uint64_t> stall_leveln_slowdown_;
+  std::vector<uint64_t> stall_leveln_slowdown_count_;
+
+  // Total number of background errors encountered. Every time a flush task
+  // or compaction task fails, this counter is incremented. The failure can
+  // be caused by any possible reason, including file system errors, out of
+  // resources, or input file corruption. Failing when retrying the same flush
+  // or compaction will cause the counter to increase too.
+  uint64_t bg_error_count_;
+
+  int number_levels_;
+  Statistics* statistics_;
+  Env* env_;
+  uint64_t started_at_;
+};
+
+}  // namespace rocksdb
--- a/db/log_and_apply_bench.cc
+++ b/db/log_and_apply_bench.cc
@@ -0,0 +1,79 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+
+#include <vector>
+
+#include "util/testharness.h"
+#include "util/benchharness.h"
+#include "db/version_set.h"
+#include "util/mutexlock.h"
+
+namespace rocksdb {
+
+std::string MakeKey(unsigned int num) {
+  char buf[30];
+  snprintf(buf, sizeof(buf), "%016u", num);
+  return std::string(buf);
+}
+
+void BM_LogAndApply(int iters, int num_base_files) {
+  VersionSet* vset;
+  ColumnFamilyData* default_cfd;
+  uint64_t fnum = 1;
+  port::Mutex mu;
+  MutexLock l(&mu);
+
+  BENCHMARK_SUSPEND {
+    std::string dbname = test::TmpDir() + "/rocksdb_test_benchmark";
+    ASSERT_OK(DestroyDB(dbname, Options()));
+
+    DB* db = nullptr;
+    Options opts;
+    opts.create_if_missing = true;
+    Status s = DB::Open(opts, dbname, &db);
+    ASSERT_OK(s);
+    ASSERT_TRUE(db != nullptr);
+
+    delete db;
+    db = nullptr;
+
+    Options options;
+    EnvOptions sopt;
+    vset = new VersionSet(dbname, &options, sopt, nullptr);
+    std::vector<ColumnFamilyDescriptor> dummy;
+    dummy.push_back(ColumnFamilyDescriptor());
+    ASSERT_OK(vset->Recover(dummy));
+    default_cfd = vset->GetColumnFamilySet()->GetDefault();
+    VersionEdit vbase;
+    for (int i = 0; i < num_base_files; i++) {
+      InternalKey start(MakeKey(2 * fnum), 1, kTypeValue);
+      InternalKey limit(MakeKey(2 * fnum + 1), 1, kTypeDeletion);
+      vbase.AddFile(2, ++fnum, 1 /* file size */, start, limit, 1, 1);
+    }
+    ASSERT_OK(vset->LogAndApply(default_cfd, &vbase, &mu));
+  }
+
+  for (int i = 0; i < iters; i++) {
+    VersionEdit vedit;
+    vedit.DeleteFile(2, fnum);
+    InternalKey start(MakeKey(2 * fnum), 1, kTypeValue);
+    InternalKey limit(MakeKey(2 * fnum + 1), 1, kTypeDeletion);
+    vedit.AddFile(2, ++fnum, 1 /* file size */, start, limit, 1, 1);
+    vset->LogAndApply(default_cfd, &vedit, &mu);
+  }
+}
+
+BENCHMARK_NAMED_PARAM(BM_LogAndApply, 1000_iters_1_file, 1000, 1)
+BENCHMARK_NAMED_PARAM(BM_LogAndApply, 1000_iters_100_files, 1000, 100)
+BENCHMARK_NAMED_PARAM(BM_LogAndApply, 1000_iters_10000_files, 1000, 10000)
+BENCHMARK_NAMED_PARAM(BM_LogAndApply, 100_iters_100000_files, 100, 100000)
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  rocksdb::benchmark::RunBenchmarks();
+  return 0;
+}
--- a/db/log_format.h
+++ b/db/log_format.h
@@ -0,0 +1,35 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Log format information shared by reader and writer.
+// See ../doc/log_format.txt for more detail.
+
+#pragma once
+namespace rocksdb {
+namespace log {
+
+enum RecordType {
+  // Zero is reserved for preallocated files
+  kZeroType = 0,
+  kFullType = 1,
+
+  // For fragments
+  kFirstType = 2,
+  kMiddleType = 3,
+  kLastType = 4
+};
+static const int kMaxRecordType = kLastType;
+
+static const unsigned int kBlockSize = 32768;
+
+// Header is checksum (4 bytes), type (1 byte), length (2 bytes).
+static const int kHeaderSize = 4 + 1 + 2;
+
+}  // namespace log
+}  // namespace rocksdb
--- a/db/log_reader.cc
+++ b/db/log_reader.cc
@@ -0,0 +1,339 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_reader.h"
+
+#include <stdio.h>
+#include "rocksdb/env.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+
+namespace rocksdb {
+namespace log {
+
+Reader::Reporter::~Reporter() {
+}
+
+Reader::Reader(unique_ptr<SequentialFile>&& file, Reporter* reporter,
+               bool checksum, uint64_t initial_offset)
+    : file_(std::move(file)),
+      reporter_(reporter),
+      checksum_(checksum),
+      backing_store_(new char[kBlockSize]),
+      buffer_(),
+      eof_(false),
+      read_error_(false),
+      eof_offset_(0),
+      last_record_offset_(0),
+      end_of_buffer_offset_(0),
+      initial_offset_(initial_offset) {
+}
+
+Reader::~Reader() {
+  delete[] backing_store_;
+}
+
+bool Reader::SkipToInitialBlock() {
+  size_t offset_in_block = initial_offset_ % kBlockSize;
+  uint64_t block_start_location = initial_offset_ - offset_in_block;
+
+  // Don't search a block if we'd be in the trailer
+  if (offset_in_block > kBlockSize - 6) {
+    offset_in_block = 0;
+    block_start_location += kBlockSize;
+  }
+
+  end_of_buffer_offset_ = block_start_location;
+
+  // Skip to start of first block that can contain the initial record
+  if (block_start_location > 0) {
+    Status skip_status = file_->Skip(block_start_location);
+    if (!skip_status.ok()) {
+      ReportDrop(block_start_location, skip_status);
+      return false;
+    }
+  }
+
+  return true;
+}
+
+bool Reader::ReadRecord(Slice* record, std::string* scratch) {
+  if (last_record_offset_ < initial_offset_) {
+    if (!SkipToInitialBlock()) {
+      return false;
+    }
+  }
+
+  scratch->clear();
+  record->clear();
+  bool in_fragmented_record = false;
+  // Record offset of the logical record that we're reading
+  // 0 is a dummy value to make compilers happy
+  uint64_t prospective_record_offset = 0;
+
+  Slice fragment;
+  while (true) {
+    uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
+    const unsigned int record_type = ReadPhysicalRecord(&fragment);
+    switch (record_type) {
+      case kFullType:
+        if (in_fragmented_record) {
+          // Handle bug in earlier versions of log::Writer where
+          // it could emit an empty kFirstType record at the tail end
+          // of a block followed by a kFullType or kFirstType record
+          // at the beginning of the next block.
+          if (scratch->empty()) {
+            in_fragmented_record = false;
+          } else {
+            ReportCorruption(scratch->size(), "partial record without end(1)");
+          }
+        }
+        prospective_record_offset = physical_record_offset;
+        scratch->clear();
+        *record = fragment;
+        last_record_offset_ = prospective_record_offset;
+        return true;
+
+      case kFirstType:
+        if (in_fragmented_record) {
+          // Handle bug in earlier versions of log::Writer where
+          // it could emit an empty kFirstType record at the tail end
+          // of a block followed by a kFullType or kFirstType record
+          // at the beginning of the next block.
+          if (scratch->empty()) {
+            in_fragmented_record = false;
+          } else {
+            ReportCorruption(scratch->size(), "partial record without end(2)");
+          }
+        }
+        prospective_record_offset = physical_record_offset;
+        scratch->assign(fragment.data(), fragment.size());
+        in_fragmented_record = true;
+        break;
+
+      case kMiddleType:
+        if (!in_fragmented_record) {
+          ReportCorruption(fragment.size(),
+                           "missing start of fragmented record(1)");
+        } else {
+          scratch->append(fragment.data(), fragment.size());
+        }
+        break;
+
+      case kLastType:
+        if (!in_fragmented_record) {
+          ReportCorruption(fragment.size(),
+                           "missing start of fragmented record(2)");
+        } else {
+          scratch->append(fragment.data(), fragment.size());
+          *record = Slice(*scratch);
+          last_record_offset_ = prospective_record_offset;
+          return true;
+        }
+        break;
+
+      case kEof:
+        if (in_fragmented_record) {
+          // This can be caused by the writer dying immediately after
+          //  writing a physical record but before completing the next; don't
+          //  treat it as a corruption, just ignore the entire logical record.
+          scratch->clear();
+        }
+        return false;
+
+      case kBadRecord:
+        if (in_fragmented_record) {
+          ReportCorruption(scratch->size(), "error in middle of record");
+          in_fragmented_record = false;
+          scratch->clear();
+        }
+        break;
+
+      default: {
+        char buf[40];
+        snprintf(buf, sizeof(buf), "unknown record type %u", record_type);
+        ReportCorruption(
+            (fragment.size() + (in_fragmented_record ? scratch->size() : 0)),
+            buf);
+        in_fragmented_record = false;
+        scratch->clear();
+        break;
+      }
+    }
+  }
+  return false;
+}
+
+uint64_t Reader::LastRecordOffset() {
+  return last_record_offset_;
+}
+
+void Reader::UnmarkEOF() {
+  if (read_error_) {
+    return;
+  }
+
+  eof_ = false;
+
+  if (eof_offset_ == 0) {
+    return;
+  }
+
+  // If the EOF was in the middle of a block (a partial block was read) we have
+  // to read the rest of the block as ReadPhysicalRecord can only read full
+  // blocks and expects the file position indicator to be aligned to the start
+  // of a block.
+  //
+  //      consumed_bytes + buffer_size() + remaining == kBlockSize
+
+  size_t consumed_bytes = eof_offset_ - buffer_.size();
+  size_t remaining = kBlockSize - eof_offset_;
+
+  // backing_store_ is used to concatenate what is left in buffer_ and
+  // the remainder of the block. If buffer_ already uses backing_store_,
+  // we just append the new data.
+  if (buffer_.data() != backing_store_ + consumed_bytes) {
+    // Buffer_ does not use backing_store_ for storage.
+    // Copy what is left in buffer_ to backing_store.
+    memmove(backing_store_ + consumed_bytes, buffer_.data(), buffer_.size());
+  }
+
+  Slice read_buffer;
+  Status status = file_->Read(remaining, &read_buffer,
+    backing_store_ + eof_offset_);
+
+  size_t added = read_buffer.size();
+  end_of_buffer_offset_ += added;
+
+  if (!status.ok()) {
+    if (added > 0) {
+      ReportDrop(added, status);
+    }
+
+    read_error_ = true;
+    return;
+  }
+
+  if (read_buffer.data() != backing_store_ + eof_offset_) {
+    // Read did not write to backing_store_
+    memmove(backing_store_ + eof_offset_, read_buffer.data(),
+      read_buffer.size());
+  }
+
+  buffer_ = Slice(backing_store_ + consumed_bytes,
+    eof_offset_ + added - consumed_bytes);
+
+  if (added < remaining) {
+    eof_ = true;
+    eof_offset_ += added;
+  } else {
+    eof_offset_ = 0;
+  }
+}
+
+void Reader::ReportCorruption(size_t bytes, const char* reason) {
+  ReportDrop(bytes, Status::Corruption(reason));
+}
+
+void Reader::ReportDrop(size_t bytes, const Status& reason) {
+  if (reporter_ != nullptr &&
+      end_of_buffer_offset_ - buffer_.size() - bytes >= initial_offset_) {
+    reporter_->Corruption(bytes, reason);
+  }
+}
+
+unsigned int Reader::ReadPhysicalRecord(Slice* result) {
+  while (true) {
+    if (buffer_.size() < (size_t)kHeaderSize) {
+      if (!eof_ && !read_error_) {
+        // Last read was a full read, so this is a trailer to skip
+        buffer_.clear();
+        Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
+        end_of_buffer_offset_ += buffer_.size();
+        if (!status.ok()) {
+          buffer_.clear();
+          ReportDrop(kBlockSize, status);
+          read_error_ = true;
+          return kEof;
+        } else if (buffer_.size() < (size_t)kBlockSize) {
+          eof_ = true;
+          eof_offset_ = buffer_.size();
+        }
+        continue;
+      } else {
+        // Note that if buffer_ is non-empty, we have a truncated header at the
+        //  end of the file, which can be caused by the writer crashing in the
+        //  middle of writing the header. Instead of considering this an error,
+        //  just report EOF.
+        buffer_.clear();
+        return kEof;
+      }
+    }
+
+    // Parse the header
+    const char* header = buffer_.data();
+    const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
+    const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
+    const unsigned int type = header[6];
+    const uint32_t length = a | (b << 8);
+    if (kHeaderSize + length > buffer_.size()) {
+      size_t drop_size = buffer_.size();
+      buffer_.clear();
+      if (!eof_) {
+        ReportCorruption(drop_size, "bad record length");
+        return kBadRecord;
+      }
+      // If the end of the file has been reached without reading |length| bytes
+      // of payload, assume the writer died in the middle of writing the record.
+      // Don't report a corruption.
+      return kEof;
+    }
+
+    if (type == kZeroType && length == 0) {
+      // Skip zero length record without reporting any drops since
+      // such records are produced by the mmap based writing code in
+      // env_posix.cc that preallocates file regions.
+      // NOTE: this should never happen in DB written by new RocksDB versions,
+      // since we turn off mmap writes to manifest and log files
+      buffer_.clear();
+      return kBadRecord;
+    }
+
+    // Check crc
+    if (checksum_) {
+      uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
+      uint32_t actual_crc = crc32c::Value(header + 6, 1 + length);
+      if (actual_crc != expected_crc) {
+        // Drop the rest of the buffer since "length" itself may have
+        // been corrupted and if we trust it, we could find some
+        // fragment of a real log record that just happens to look
+        // like a valid log record.
+        size_t drop_size = buffer_.size();
+        buffer_.clear();
+        ReportCorruption(drop_size, "checksum mismatch");
+        return kBadRecord;
+      }
+    }
+
+    buffer_.remove_prefix(kHeaderSize + length);
+
+    // Skip physical record that started before initial_offset_
+    if (end_of_buffer_offset_ - buffer_.size() - kHeaderSize - length <
+        initial_offset_) {
+      result->clear();
+      return kBadRecord;
+    }
+
+    *result = Slice(header + kHeaderSize, length);
+    return type;
+  }
+}
+
+}  // namespace log
+}  // namespace rocksdb
--- a/db/log_reader.h
+++ b/db/log_reader.h
@@ -0,0 +1,130 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <memory>
+#include <stdint.h>
+
+#include "db/log_format.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+class SequentialFile;
+using std::unique_ptr;
+
+namespace log {
+
+class Reader {
+ public:
+  // Interface for reporting errors.
+  class Reporter {
+   public:
+    virtual ~Reporter();
+
+    // Some corruption was detected.  "size" is the approximate number
+    // of bytes dropped due to the corruption.
+    virtual void Corruption(size_t bytes, const Status& status) = 0;
+  };
+
+  // Create a reader that will return log records from "*file".
+  // "*file" must remain live while this Reader is in use.
+  //
+  // If "reporter" is non-nullptr, it is notified whenever some data is
+  // dropped due to a detected corruption.  "*reporter" must remain
+  // live while this Reader is in use.
+  //
+  // If "checksum" is true, verify checksums if available.
+  //
+  // The Reader will start reading at the first record located at physical
+  // position >= initial_offset within the file.
+  Reader(unique_ptr<SequentialFile>&& file, Reporter* reporter,
+         bool checksum, uint64_t initial_offset);
+
+  ~Reader();
+
+  // Read the next record into *record.  Returns true if read
+  // successfully, false if we hit end of the input.  May use
+  // "*scratch" as temporary storage.  The contents filled in *record
+  // will only be valid until the next mutating operation on this
+  // reader or the next mutation to *scratch.
+  bool ReadRecord(Slice* record, std::string* scratch);
+
+  // Returns the physical offset of the last record returned by ReadRecord.
+  //
+  // Undefined before the first call to ReadRecord.
+  uint64_t LastRecordOffset();
+
+  // returns true if the reader has encountered an eof condition.
+  bool IsEOF() {
+    return eof_;
+  }
+
+  // when we know more data has been written to the file. we can use this
+  // function to force the reader to look again in the file.
+  // Also aligns the file position indicator to the start of the next block
+  // by reading the rest of the data from the EOF position to the end of the
+  // block that was partially read.
+  void UnmarkEOF();
+
+  SequentialFile* file() { return file_.get(); }
+
+ private:
+  const unique_ptr<SequentialFile> file_;
+  Reporter* const reporter_;
+  bool const checksum_;
+  char* const backing_store_;
+  Slice buffer_;
+  bool eof_;   // Last Read() indicated EOF by returning < kBlockSize
+  bool read_error_;   // Error occurred while reading from file
+
+  // Offset of the file position indicator within the last block when an
+  // EOF was detected.
+  size_t eof_offset_;
+
+  // Offset of the last record returned by ReadRecord.
+  uint64_t last_record_offset_;
+  // Offset of the first location past the end of buffer_.
+  uint64_t end_of_buffer_offset_;
+
+  // Offset at which to start looking for the first record to return
+  uint64_t const initial_offset_;
+
+  // Extend record types with the following special values
+  enum {
+    kEof = kMaxRecordType + 1,
+    // Returned whenever we find an invalid physical record.
+    // Currently there are three situations in which this happens:
+    // * The record has an invalid CRC (ReadPhysicalRecord reports a drop)
+    // * The record is a 0-length record (No drop is reported)
+    // * The record is below constructor's initial_offset (No drop is reported)
+    kBadRecord = kMaxRecordType + 2
+  };
+
+  // Skips all blocks that are completely before "initial_offset_".
+  //
+  // Returns true on success. Handles reporting.
+  bool SkipToInitialBlock();
+
+  // Return type, or one of the preceding special values
+  unsigned int ReadPhysicalRecord(Slice* result);
+
+  // Reports dropped bytes to the reporter.
+  // buffer_ must be updated to remove the dropped bytes prior to invocation.
+  void ReportCorruption(size_t bytes, const char* reason);
+  void ReportDrop(size_t bytes, const Status& reason);
+
+  // No copying allowed
+  Reader(const Reader&);
+  void operator=(const Reader&);
+};
+
+}  // namespace log
+}  // namespace rocksdb
--- a/db/log_test.cc
+++ b/db/log_test.cc
@@ -0,0 +1,689 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "rocksdb/env.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/random.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+namespace log {
+
+// Construct a string of the specified length made out of the supplied
+// partial string.
+static std::string BigString(const std::string& partial_string, size_t n) {
+  std::string result;
+  while (result.size() < n) {
+    result.append(partial_string);
+  }
+  result.resize(n);
+  return result;
+}
+
+// Construct a string from a number
+static std::string NumberString(int n) {
+  char buf[50];
+  snprintf(buf, sizeof(buf), "%d.", n);
+  return std::string(buf);
+}
+
+// Return a skewed potentially long string
+static std::string RandomSkewedString(int i, Random* rnd) {
+  return BigString(NumberString(i), rnd->Skewed(17));
+}
+
+class LogTest {
+ private:
+  class StringDest : public WritableFile {
+   public:
+    std::string contents_;
+
+    explicit StringDest(Slice& reader_contents) :
+      WritableFile(),
+      contents_(""),
+      reader_contents_(reader_contents),
+      last_flush_(0) {
+      reader_contents_ = Slice(contents_.data(), 0);
+    };
+
+    virtual Status Close() { return Status::OK(); }
+    virtual Status Flush() {
+      ASSERT_TRUE(reader_contents_.size() <= last_flush_);
+      size_t offset = last_flush_ - reader_contents_.size();
+      reader_contents_ = Slice(
+          contents_.data() + offset,
+          contents_.size() - offset);
+      last_flush_ = contents_.size();
+
+      return Status::OK();
+    }
+    virtual Status Sync() { return Status::OK(); }
+    virtual Status Append(const Slice& slice) {
+      contents_.append(slice.data(), slice.size());
+      return Status::OK();
+    }
+    void Drop(size_t bytes) {
+      contents_.resize(contents_.size() - bytes);
+      reader_contents_ = Slice(
+          reader_contents_.data(), reader_contents_.size() - bytes);
+      last_flush_ = contents_.size();
+    }
+
+   private:
+    Slice& reader_contents_;
+    size_t last_flush_;
+  };
+
+  class StringSource : public SequentialFile {
+   public:
+    Slice& contents_;
+    bool force_error_;
+    size_t force_error_position_;
+    bool force_eof_;
+    size_t force_eof_position_;
+    bool returned_partial_;
+    explicit StringSource(Slice& contents) :
+      contents_(contents),
+      force_error_(false),
+      force_error_position_(0),
+      force_eof_(false),
+      force_eof_position_(0),
+      returned_partial_(false) { }
+
+    virtual Status Read(size_t n, Slice* result, char* scratch) {
+      ASSERT_TRUE(!returned_partial_) << "must not Read() after eof/error";
+
+      if (force_error_) {
+        if (force_error_position_ >= n) {
+          force_error_position_ -= n;
+        } else {
+          *result = Slice(contents_.data(), force_error_position_);
+          contents_.remove_prefix(force_error_position_);
+          force_error_ = false;
+          returned_partial_ = true;
+          return Status::Corruption("read error");
+        }
+      }
+
+      if (contents_.size() < n) {
+        n = contents_.size();
+        returned_partial_ = true;
+      }
+
+      if (force_eof_) {
+        if (force_eof_position_ >= n) {
+          force_eof_position_ -= n;
+        } else {
+          force_eof_ = false;
+          n = force_eof_position_;
+          returned_partial_ = true;
+        }
+      }
+
+      // By using scratch we ensure that caller has control over the
+      // lifetime of result.data()
+      memcpy(scratch, contents_.data(), n);
+      *result = Slice(scratch, n);
+
+      contents_.remove_prefix(n);
+      return Status::OK();
+    }
+
+    virtual Status Skip(uint64_t n) {
+      if (n > contents_.size()) {
+        contents_.clear();
+        return Status::NotFound("in-memory file skipepd past end");
+      }
+
+      contents_.remove_prefix(n);
+
+      return Status::OK();
+    }
+  };
+
+  class ReportCollector : public Reader::Reporter {
+   public:
+    size_t dropped_bytes_;
+    std::string message_;
+
+    ReportCollector() : dropped_bytes_(0) { }
+    virtual void Corruption(size_t bytes, const Status& status) {
+      dropped_bytes_ += bytes;
+      message_.append(status.ToString());
+    }
+  };
+
+  std::string& dest_contents() {
+    auto dest = dynamic_cast<StringDest*>(writer_.file());
+    assert(dest);
+    return dest->contents_;
+  }
+
+  const std::string& dest_contents() const {
+    auto dest = dynamic_cast<const StringDest*>(writer_.file());
+    assert(dest);
+    return dest->contents_;
+  }
+
+  void reset_source_contents() {
+    auto src = dynamic_cast<StringSource*>(reader_.file());
+    assert(src);
+    src->contents_ = dest_contents();
+  }
+
+  Slice reader_contents_;
+  unique_ptr<StringDest> dest_holder_;
+  unique_ptr<StringSource> source_holder_;
+  ReportCollector report_;
+  Writer writer_;
+  Reader reader_;
+
+  // Record metadata for testing initial offset functionality
+  static size_t initial_offset_record_sizes_[];
+  static uint64_t initial_offset_last_record_offsets_[];
+
+ public:
+  LogTest() : reader_contents_(),
+              dest_holder_(new StringDest(reader_contents_)),
+              source_holder_(new StringSource(reader_contents_)),
+              writer_(std::move(dest_holder_)),
+              reader_(std::move(source_holder_), &report_, true/*checksum*/,
+                      0/*initial_offset*/) {
+  }
+
+  void Write(const std::string& msg) {
+    writer_.AddRecord(Slice(msg));
+  }
+
+  size_t WrittenBytes() const {
+    return dest_contents().size();
+  }
+
+  std::string Read() {
+    std::string scratch;
+    Slice record;
+    if (reader_.ReadRecord(&record, &scratch)) {
+      return record.ToString();
+    } else {
+      return "EOF";
+    }
+  }
+
+  void IncrementByte(int offset, int delta) {
+    dest_contents()[offset] += delta;
+  }
+
+  void SetByte(int offset, char new_byte) {
+    dest_contents()[offset] = new_byte;
+  }
+
+  void ShrinkSize(int bytes) {
+    auto dest = dynamic_cast<StringDest*>(writer_.file());
+    assert(dest);
+    dest->Drop(bytes);
+  }
+
+  void FixChecksum(int header_offset, int len) {
+    // Compute crc of type/len/data
+    uint32_t crc = crc32c::Value(&dest_contents()[header_offset+6], 1 + len);
+    crc = crc32c::Mask(crc);
+    EncodeFixed32(&dest_contents()[header_offset], crc);
+  }
+
+  void ForceError(size_t position = 0) {
+    auto src = dynamic_cast<StringSource*>(reader_.file());
+    src->force_error_ = true;
+    src->force_error_position_ = position;
+  }
+
+  size_t DroppedBytes() const {
+    return report_.dropped_bytes_;
+  }
+
+  std::string ReportMessage() const {
+    return report_.message_;
+  }
+
+  void ForceEOF(size_t position = 0) {
+    auto src = dynamic_cast<StringSource*>(reader_.file());
+    src->force_eof_ = true;
+    src->force_eof_position_ = position;
+  }
+
+  void UnmarkEOF() {
+    auto src = dynamic_cast<StringSource*>(reader_.file());
+    src->returned_partial_ = false;
+    reader_.UnmarkEOF();
+  }
+
+  bool IsEOF() {
+    return reader_.IsEOF();
+  }
+
+  // Returns OK iff recorded error message contains "msg"
+  std::string MatchError(const std::string& msg) const {
+    if (report_.message_.find(msg) == std::string::npos) {
+      return report_.message_;
+    } else {
+      return "OK";
+    }
+  }
+
+  void WriteInitialOffsetLog() {
+    for (int i = 0; i < 4; i++) {
+      std::string record(initial_offset_record_sizes_[i],
+                         static_cast<char>('a' + i));
+      Write(record);
+    }
+  }
+
+  void CheckOffsetPastEndReturnsNoRecords(uint64_t offset_past_end) {
+    WriteInitialOffsetLog();
+    unique_ptr<StringSource> source(new StringSource(reader_contents_));
+    unique_ptr<Reader> offset_reader(
+      new Reader(std::move(source), &report_, true/*checksum*/,
+                 WrittenBytes() + offset_past_end));
+    Slice record;
+    std::string scratch;
+    ASSERT_TRUE(!offset_reader->ReadRecord(&record, &scratch));
+  }
+
+  void CheckInitialOffsetRecord(uint64_t initial_offset,
+                                int expected_record_offset) {
+    WriteInitialOffsetLog();
+    unique_ptr<StringSource> source(new StringSource(reader_contents_));
+    unique_ptr<Reader> offset_reader(
+      new Reader(std::move(source), &report_, true/*checksum*/,
+                 initial_offset));
+    Slice record;
+    std::string scratch;
+    ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch));
+    ASSERT_EQ(initial_offset_record_sizes_[expected_record_offset],
+              record.size());
+    ASSERT_EQ(initial_offset_last_record_offsets_[expected_record_offset],
+              offset_reader->LastRecordOffset());
+    ASSERT_EQ((char)('a' + expected_record_offset), record.data()[0]);
+  }
+
+};
+
+size_t LogTest::initial_offset_record_sizes_[] =
+    {10000,  // Two sizable records in first block
+     10000,
+     2 * log::kBlockSize - 1000,  // Span three blocks
+     1};
+
+uint64_t LogTest::initial_offset_last_record_offsets_[] =
+    {0,
+     kHeaderSize + 10000,
+     2 * (kHeaderSize + 10000),
+     2 * (kHeaderSize + 10000) +
+         (2 * log::kBlockSize - 1000) + 3 * kHeaderSize};
+
+
+TEST(LogTest, Empty) {
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, ReadWrite) {
+  Write("foo");
+  Write("bar");
+  Write("");
+  Write("xxxx");
+  ASSERT_EQ("foo", Read());
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("", Read());
+  ASSERT_EQ("xxxx", Read());
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ("EOF", Read());  // Make sure reads at eof work
+}
+
+TEST(LogTest, ManyBlocks) {
+  for (int i = 0; i < 100000; i++) {
+    Write(NumberString(i));
+  }
+  for (int i = 0; i < 100000; i++) {
+    ASSERT_EQ(NumberString(i), Read());
+  }
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, Fragmentation) {
+  Write("small");
+  Write(BigString("medium", 50000));
+  Write(BigString("large", 100000));
+  ASSERT_EQ("small", Read());
+  ASSERT_EQ(BigString("medium", 50000), Read());
+  ASSERT_EQ(BigString("large", 100000), Read());
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, MarginalTrailer) {
+  // Make a trailer that is exactly the same length as an empty record.
+  const int n = kBlockSize - 2*kHeaderSize;
+  Write(BigString("foo", n));
+  ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize), WrittenBytes());
+  Write("");
+  Write("bar");
+  ASSERT_EQ(BigString("foo", n), Read());
+  ASSERT_EQ("", Read());
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, MarginalTrailer2) {
+  // Make a trailer that is exactly the same length as an empty record.
+  const int n = kBlockSize - 2*kHeaderSize;
+  Write(BigString("foo", n));
+  ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize), WrittenBytes());
+  Write("bar");
+  ASSERT_EQ(BigString("foo", n), Read());
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(0U, DroppedBytes());
+  ASSERT_EQ("", ReportMessage());
+}
+
+TEST(LogTest, ShortTrailer) {
+  const int n = kBlockSize - 2*kHeaderSize + 4;
+  Write(BigString("foo", n));
+  ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize + 4), WrittenBytes());
+  Write("");
+  Write("bar");
+  ASSERT_EQ(BigString("foo", n), Read());
+  ASSERT_EQ("", Read());
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, AlignedEof) {
+  const int n = kBlockSize - 2*kHeaderSize + 4;
+  Write(BigString("foo", n));
+  ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize + 4), WrittenBytes());
+  ASSERT_EQ(BigString("foo", n), Read());
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, RandomRead) {
+  const int N = 500;
+  Random write_rnd(301);
+  for (int i = 0; i < N; i++) {
+    Write(RandomSkewedString(i, &write_rnd));
+  }
+  Random read_rnd(301);
+  for (int i = 0; i < N; i++) {
+    ASSERT_EQ(RandomSkewedString(i, &read_rnd), Read());
+  }
+  ASSERT_EQ("EOF", Read());
+}
+
+// Tests of all the error paths in log_reader.cc follow:
+
+TEST(LogTest, ReadError) {
+  Write("foo");
+  ForceError();
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ((unsigned int)kBlockSize, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("read error"));
+}
+
+TEST(LogTest, BadRecordType) {
+  Write("foo");
+  // Type is stored in header[6]
+  IncrementByte(6, 100);
+  FixChecksum(0, 3);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3U, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("unknown record type"));
+}
+
+TEST(LogTest, TruncatedTrailingRecordIsIgnored) {
+  Write("foo");
+  ShrinkSize(4);   // Drop all payload as well as a header byte
+  ASSERT_EQ("EOF", Read());
+  // Truncated last record is ignored, not treated as an error
+  ASSERT_EQ(0U, DroppedBytes());
+  ASSERT_EQ("", ReportMessage());
+}
+
+TEST(LogTest, BadLength) {
+  const int kPayloadSize = kBlockSize - kHeaderSize;
+  Write(BigString("bar", kPayloadSize));
+  Write("foo");
+  // Least significant size byte is stored in header[4].
+  IncrementByte(4, 1);
+  ASSERT_EQ("foo", Read());
+  ASSERT_EQ(kBlockSize, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("bad record length"));
+}
+
+TEST(LogTest, BadLengthAtEndIsIgnored) {
+  Write("foo");
+  ShrinkSize(1);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(0U, DroppedBytes());
+  ASSERT_EQ("", ReportMessage());
+}
+
+TEST(LogTest, ChecksumMismatch) {
+  Write("foo");
+  IncrementByte(0, 10);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(10U, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("checksum mismatch"));
+}
+
+TEST(LogTest, UnexpectedMiddleType) {
+  Write("foo");
+  SetByte(6, kMiddleType);
+  FixChecksum(0, 3);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3U, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("missing start"));
+}
+
+TEST(LogTest, UnexpectedLastType) {
+  Write("foo");
+  SetByte(6, kLastType);
+  FixChecksum(0, 3);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3U, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("missing start"));
+}
+
+TEST(LogTest, UnexpectedFullType) {
+  Write("foo");
+  Write("bar");
+  SetByte(6, kFirstType);
+  FixChecksum(0, 3);
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3U, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("partial record without end"));
+}
+
+TEST(LogTest, UnexpectedFirstType) {
+  Write("foo");
+  Write(BigString("bar", 100000));
+  SetByte(6, kFirstType);
+  FixChecksum(0, 3);
+  ASSERT_EQ(BigString("bar", 100000), Read());
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3U, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("partial record without end"));
+}
+
+TEST(LogTest, MissingLastIsIgnored) {
+  Write(BigString("bar", kBlockSize));
+  // Remove the LAST block, including header.
+  ShrinkSize(14);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ("", ReportMessage());
+  ASSERT_EQ(0U, DroppedBytes());
+}
+
+TEST(LogTest, PartialLastIsIgnored) {
+  Write(BigString("bar", kBlockSize));
+  // Cause a bad record length in the LAST block.
+  ShrinkSize(1);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ("", ReportMessage());
+  ASSERT_EQ(0U, DroppedBytes());
+}
+
+TEST(LogTest, ErrorJoinsRecords) {
+  // Consider two fragmented records:
+  //    first(R1) last(R1) first(R2) last(R2)
+  // where the middle two fragments disappear.  We do not want
+  // first(R1),last(R2) to get joined and returned as a valid record.
+
+  // Write records that span two blocks
+  Write(BigString("foo", kBlockSize));
+  Write(BigString("bar", kBlockSize));
+  Write("correct");
+
+  // Wipe the middle block
+  for (unsigned int offset = kBlockSize; offset < 2*kBlockSize; offset++) {
+    SetByte(offset, 'x');
+  }
+
+  ASSERT_EQ("correct", Read());
+  ASSERT_EQ("EOF", Read());
+  const unsigned int dropped = DroppedBytes();
+  ASSERT_LE(dropped, 2*kBlockSize + 100);
+  ASSERT_GE(dropped, 2*kBlockSize);
+}
+
+TEST(LogTest, ReadStart) {
+  CheckInitialOffsetRecord(0, 0);
+}
+
+TEST(LogTest, ReadSecondOneOff) {
+  CheckInitialOffsetRecord(1, 1);
+}
+
+TEST(LogTest, ReadSecondTenThousand) {
+  CheckInitialOffsetRecord(10000, 1);
+}
+
+TEST(LogTest, ReadSecondStart) {
+  CheckInitialOffsetRecord(10007, 1);
+}
+
+TEST(LogTest, ReadThirdOneOff) {
+  CheckInitialOffsetRecord(10008, 2);
+}
+
+TEST(LogTest, ReadThirdStart) {
+  CheckInitialOffsetRecord(20014, 2);
+}
+
+TEST(LogTest, ReadFourthOneOff) {
+  CheckInitialOffsetRecord(20015, 3);
+}
+
+TEST(LogTest, ReadFourthFirstBlockTrailer) {
+  CheckInitialOffsetRecord(log::kBlockSize - 4, 3);
+}
+
+TEST(LogTest, ReadFourthMiddleBlock) {
+  CheckInitialOffsetRecord(log::kBlockSize + 1, 3);
+}
+
+TEST(LogTest, ReadFourthLastBlock) {
+  CheckInitialOffsetRecord(2 * log::kBlockSize + 1, 3);
+}
+
+TEST(LogTest, ReadFourthStart) {
+  CheckInitialOffsetRecord(
+      2 * (kHeaderSize + 1000) + (2 * log::kBlockSize - 1000) + 3 * kHeaderSize,
+      3);
+}
+
+TEST(LogTest, ReadEnd) {
+  CheckOffsetPastEndReturnsNoRecords(0);
+}
+
+TEST(LogTest, ReadPastEnd) {
+  CheckOffsetPastEndReturnsNoRecords(5);
+}
+
+TEST(LogTest, ClearEofSingleBlock) {
+  Write("foo");
+  Write("bar");
+  ForceEOF(3 + kHeaderSize + 2);
+  ASSERT_EQ("foo", Read());
+  UnmarkEOF();
+  ASSERT_EQ("bar", Read());
+  ASSERT_TRUE(IsEOF());
+  ASSERT_EQ("EOF", Read());
+  Write("xxx");
+  UnmarkEOF();
+  ASSERT_EQ("xxx", Read());
+  ASSERT_TRUE(IsEOF());
+}
+
+TEST(LogTest, ClearEofMultiBlock) {
+  size_t num_full_blocks = 5;
+  size_t n = (kBlockSize - kHeaderSize) * num_full_blocks + 25;
+  Write(BigString("foo", n));
+  Write(BigString("bar", n));
+  ForceEOF(n + num_full_blocks * kHeaderSize + 10);
+  ASSERT_EQ(BigString("foo", n), Read());
+  ASSERT_TRUE(IsEOF());
+  UnmarkEOF();
+  ASSERT_EQ(BigString("bar", n), Read());
+  ASSERT_TRUE(IsEOF());
+  Write(BigString("xxx", n));
+  UnmarkEOF();
+  ASSERT_EQ(BigString("xxx", n), Read());
+  ASSERT_TRUE(IsEOF());
+}
+
+TEST(LogTest, ClearEofError) {
+  // If an error occurs during Read() in UnmarkEOF(), the records contained
+  // in the buffer should be returned on subsequent calls of ReadRecord()
+  // until no more full records are left, whereafter ReadRecord() should return
+  // false to indicate that it cannot read any further.
+
+  Write("foo");
+  Write("bar");
+  UnmarkEOF();
+  ASSERT_EQ("foo", Read());
+  ASSERT_TRUE(IsEOF());
+  Write("xxx");
+  ForceError(0);
+  UnmarkEOF();
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, ClearEofError2) {
+  Write("foo");
+  Write("bar");
+  UnmarkEOF();
+  ASSERT_EQ("foo", Read());
+  Write("xxx");
+  ForceError(3);
+  UnmarkEOF();
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3U, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("read error"));
+}
+
+}  // namespace log
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
--- a/db/log_writer.cc
+++ b/db/log_writer.cc
@@ -0,0 +1,108 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_writer.h"
+
+#include <stdint.h>
+#include "rocksdb/env.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+
+namespace rocksdb {
+namespace log {
+
+Writer::Writer(unique_ptr<WritableFile>&& dest)
+    : dest_(std::move(dest)),
+      block_offset_(0) {
+  for (int i = 0; i <= kMaxRecordType; i++) {
+    char t = static_cast<char>(i);
+    type_crc_[i] = crc32c::Value(&t, 1);
+  }
+}
+
+Writer::~Writer() {
+}
+
+Status Writer::AddRecord(const Slice& slice) {
+  const char* ptr = slice.data();
+  size_t left = slice.size();
+
+  // Fragment the record if necessary and emit it.  Note that if slice
+  // is empty, we still want to iterate once to emit a single
+  // zero-length record
+  Status s;
+  bool begin = true;
+  do {
+    const int leftover = kBlockSize - block_offset_;
+    assert(leftover >= 0);
+    if (leftover < kHeaderSize) {
+      // Switch to a new block
+      if (leftover > 0) {
+        // Fill the trailer (literal below relies on kHeaderSize being 7)
+        assert(kHeaderSize == 7);
+        dest_->Append(Slice("\x00\x00\x00\x00\x00\x00", leftover));
+      }
+      block_offset_ = 0;
+    }
+
+    // Invariant: we never leave < kHeaderSize bytes in a block.
+    assert(kBlockSize - block_offset_ - kHeaderSize >= 0);
+
+    const size_t avail = kBlockSize - block_offset_ - kHeaderSize;
+    const size_t fragment_length = (left < avail) ? left : avail;
+
+    RecordType type;
+    const bool end = (left == fragment_length);
+    if (begin && end) {
+      type = kFullType;
+    } else if (begin) {
+      type = kFirstType;
+    } else if (end) {
+      type = kLastType;
+    } else {
+      type = kMiddleType;
+    }
+
+    s = EmitPhysicalRecord(type, ptr, fragment_length);
+    ptr += fragment_length;
+    left -= fragment_length;
+    begin = false;
+  } while (s.ok() && left > 0);
+  return s;
+}
+
+Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) {
+  assert(n <= 0xffff);  // Must fit in two bytes
+  assert(block_offset_ + kHeaderSize + n <= kBlockSize);
+
+  // Format the header
+  char buf[kHeaderSize];
+  buf[4] = static_cast<char>(n & 0xff);
+  buf[5] = static_cast<char>(n >> 8);
+  buf[6] = static_cast<char>(t);
+
+  // Compute the crc of the record type and the payload.
+  uint32_t crc = crc32c::Extend(type_crc_[t], ptr, n);
+  crc = crc32c::Mask(crc);                 // Adjust for storage
+  EncodeFixed32(buf, crc);
+
+  // Write the header and the payload
+  Status s = dest_->Append(Slice(buf, kHeaderSize));
+  if (s.ok()) {
+    s = dest_->Append(Slice(ptr, n));
+    if (s.ok()) {
+      s = dest_->Flush();
+    }
+  }
+  block_offset_ += kHeaderSize + n;
+  return s;
+}
+
+}  // namespace log
+}  // namespace rocksdb
--- a/db/log_writer.h
+++ b/db/log_writer.h
@@ -0,0 +1,55 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <memory>
+#include <stdint.h>
+#include "db/log_format.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+class WritableFile;
+
+using std::unique_ptr;
+
+namespace log {
+
+class Writer {
+ public:
+  // Create a writer that will append data to "*dest".
+  // "*dest" must be initially empty.
+  // "*dest" must remain live while this Writer is in use.
+  explicit Writer(unique_ptr<WritableFile>&& dest);
+  ~Writer();
+
+  Status AddRecord(const Slice& slice);
+
+  WritableFile* file() { return dest_.get(); }
+  const WritableFile* file() const { return dest_.get(); }
+
+ private:
+  unique_ptr<WritableFile> dest_;
+  int block_offset_;       // Current offset in block
+
+  // crc32c values for all supported record types.  These are
+  // pre-computed to reduce the overhead of computing the crc of the
+  // record type stored in the header.
+  uint32_t type_crc_[kMaxRecordType + 1];
+
+  Status EmitPhysicalRecord(RecordType type, const char* ptr, size_t length);
+
+  // No copying allowed
+  Writer(const Writer&);
+  void operator=(const Writer&);
+};
+
+}  // namespace log
+}  // namespace rocksdb
--- a/db/memtable.cc
+++ b/db/memtable.cc
@@ -0,0 +1,620 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/memtable.h"
+
+#include <memory>
+#include <algorithm>
+#include <limits>
+
+#include "db/dbformat.h"
+#include "db/merge_context.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice_transform.h"
+#include "table/merger.h"
+#include "util/arena.h"
+#include "util/coding.h"
+#include "util/murmurhash.h"
+#include "util/mutexlock.h"
+#include "util/perf_context_imp.h"
+#include "util/statistics.h"
+#include "util/stop_watch.h"
+
+namespace rocksdb {
+
+MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options)
+    : comparator_(cmp),
+      refs_(0),
+      kArenaBlockSize(OptimizeBlockSize(options.arena_block_size)),
+      kWriteBufferSize(options.write_buffer_size),
+      arena_(options.arena_block_size),
+      table_(options.memtable_factory->CreateMemTableRep(
+          comparator_, &arena_, options.prefix_extractor.get(),
+          options.info_log.get())),
+      num_entries_(0),
+      flush_in_progress_(false),
+      flush_completed_(false),
+      file_number_(0),
+      first_seqno_(0),
+      mem_next_logfile_number_(0),
+      locks_(options.inplace_update_support ? options.inplace_update_num_locks
+                                            : 0),
+      prefix_extractor_(options.prefix_extractor.get()),
+      should_flush_(ShouldFlushNow()) {
+  // if should_flush_ == true without an entry inserted, something must have
+  // gone wrong already.
+  assert(!should_flush_);
+  if (prefix_extractor_ && options.memtable_prefix_bloom_bits > 0) {
+    prefix_bloom_.reset(new DynamicBloom(
+        options.memtable_prefix_bloom_bits, options.bloom_locality,
+        options.memtable_prefix_bloom_probes, nullptr,
+        options.memtable_prefix_bloom_huge_page_tlb_size,
+        options.info_log.get()));
+  }
+}
+
+MemTable::~MemTable() {
+  assert(refs_ == 0);
+}
+
+size_t MemTable::ApproximateMemoryUsage() {
+  size_t arena_usage = arena_.ApproximateMemoryUsage();
+  size_t table_usage = table_->ApproximateMemoryUsage();
+  // let MAX_USAGE =  std::numeric_limits<size_t>::max()
+  // then if arena_usage + total_usage >= MAX_USAGE, return MAX_USAGE.
+  // the following variation is to avoid numeric overflow.
+  if (arena_usage >= std::numeric_limits<size_t>::max() - table_usage) {
+    return std::numeric_limits<size_t>::max();
+  }
+  // otherwise, return the actual usage
+  return arena_usage + table_usage;
+}
+
+bool MemTable::ShouldFlushNow() const {
+  // In a lot of times, we cannot allocate arena blocks that exactly matches the
+  // buffer size. Thus we have to decide if we should over-allocate or
+  // under-allocate.
+  // This constant avariable can be interpreted as: if we still have more than
+  // "kAllowOverAllocationRatio * kArenaBlockSize" space left, we'd try to over
+  // allocate one more block.
+  const double kAllowOverAllocationRatio = 0.6;
+
+  // If arena still have room for new block allocation, we can safely say it
+  // shouldn't flush.
+  auto allocated_memory =
+      table_->ApproximateMemoryUsage() + arena_.MemoryAllocatedBytes();
+
+  // if we can still allocate one more block without exceeding the
+  // over-allocation ratio, then we should not flush.
+  if (allocated_memory + kArenaBlockSize <
+      kWriteBufferSize + kArenaBlockSize * kAllowOverAllocationRatio) {
+    return false;
+  }
+
+  // if user keeps adding entries that exceeds kWriteBufferSize, we need to
+  // flush earlier even though we still have much available memory left.
+  if (allocated_memory >
+      kWriteBufferSize + kArenaBlockSize * kAllowOverAllocationRatio) {
+    return true;
+  }
+
+  // In this code path, Arena has already allocated its "last block", which
+  // means the total allocatedmemory size is either:
+  //  (1) "moderately" over allocated the memory (no more than `0.6 * arena
+  // block size`. Or,
+  //  (2) the allocated memory is less than write buffer size, but we'll stop
+  // here since if we allocate a new arena block, we'll over allocate too much
+  // more (half of the arena block size) memory.
+  //
+  // In either case, to avoid over-allocate, the last block will stop allocation
+  // when its usage reaches a certain ratio, which we carefully choose "0.75
+  // full" as the stop condition because it addresses the following issue with
+  // great simplicity: What if the next inserted entry's size is
+  // bigger than AllocatedAndUnused()?
+  //
+  // The answer is: if the entry size is also bigger than 0.25 *
+  // kArenaBlockSize, a dedicated block will be allocated for it; otherwise
+  // arena will anyway skip the AllocatedAndUnused() and allocate a new, empty
+  // and regular block. In either case, we *overly* over-allocated.
+  //
+  // Therefore, setting the last block to be at most "0.75 full" avoids both
+  // cases.
+  //
+  // NOTE: the average percentage of waste space of this approach can be counted
+  // as: "arena block size * 0.25 / write buffer size". User who specify a small
+  // write buffer size and/or big arena block size may suffer.
+  return arena_.AllocatedAndUnused() < kArenaBlockSize / 4;
+}
+
+int MemTable::KeyComparator::operator()(const char* prefix_len_key1,
+                                        const char* prefix_len_key2) const {
+  // Internal keys are encoded as length-prefixed strings.
+  Slice k1 = GetLengthPrefixedSlice(prefix_len_key1);
+  Slice k2 = GetLengthPrefixedSlice(prefix_len_key2);
+  return comparator.Compare(k1, k2);
+}
+
+int MemTable::KeyComparator::operator()(const char* prefix_len_key,
+                                        const Slice& key)
+    const {
+  // Internal keys are encoded as length-prefixed strings.
+  Slice a = GetLengthPrefixedSlice(prefix_len_key);
+  return comparator.Compare(a, key);
+}
+
+Slice MemTableRep::UserKey(const char* key) const {
+  Slice slice = GetLengthPrefixedSlice(key);
+  return Slice(slice.data(), slice.size() - 8);
+}
+
+KeyHandle MemTableRep::Allocate(const size_t len, char** buf) {
+  *buf = arena_->Allocate(len);
+  return static_cast<KeyHandle>(*buf);
+}
+
+// Encode a suitable internal key target for "target" and return it.
+// Uses *scratch as scratch space, and the returned pointer will point
+// into this scratch space.
+const char* EncodeKey(std::string* scratch, const Slice& target) {
+  scratch->clear();
+  PutVarint32(scratch, target.size());
+  scratch->append(target.data(), target.size());
+  return scratch->data();
+}
+
+class MemTableIterator: public Iterator {
+ public:
+  MemTableIterator(const MemTable& mem, const ReadOptions& options,
+                   bool enforce_total_order, Arena* arena)
+      : bloom_(nullptr),
+        prefix_extractor_(mem.prefix_extractor_),
+        valid_(false),
+        arena_mode_(arena != nullptr) {
+    if (prefix_extractor_ != nullptr && !enforce_total_order) {
+      bloom_ = mem.prefix_bloom_.get();
+      iter_ = mem.table_->GetDynamicPrefixIterator(arena);
+    } else {
+      iter_ = mem.table_->GetIterator(arena);
+    }
+  }
+
+  ~MemTableIterator() {
+    if (arena_mode_) {
+      iter_->~Iterator();
+    } else {
+      delete iter_;
+    }
+  }
+
+  virtual bool Valid() const { return valid_; }
+  virtual void Seek(const Slice& k) {
+    if (bloom_ != nullptr &&
+        !bloom_->MayContain(prefix_extractor_->Transform(ExtractUserKey(k)))) {
+      valid_ = false;
+      return;
+    }
+    iter_->Seek(k, nullptr);
+    valid_ = iter_->Valid();
+  }
+  virtual void SeekToFirst() {
+    iter_->SeekToFirst();
+    valid_ = iter_->Valid();
+  }
+  virtual void SeekToLast() {
+    iter_->SeekToLast();
+    valid_ = iter_->Valid();
+  }
+  virtual void Next() {
+    assert(Valid());
+    iter_->Next();
+    valid_ = iter_->Valid();
+  }
+  virtual void Prev() {
+    assert(Valid());
+    iter_->Prev();
+    valid_ = iter_->Valid();
+  }
+  virtual Slice key() const {
+    assert(Valid());
+    return GetLengthPrefixedSlice(iter_->key());
+  }
+  virtual Slice value() const {
+    assert(Valid());
+    Slice key_slice = GetLengthPrefixedSlice(iter_->key());
+    return GetLengthPrefixedSlice(key_slice.data() + key_slice.size());
+  }
+
+  virtual Status status() const { return Status::OK(); }
+
+ private:
+  DynamicBloom* bloom_;
+  const SliceTransform* const prefix_extractor_;
+  MemTableRep::Iterator* iter_;
+  bool valid_;
+  bool arena_mode_;
+
+  // No copying allowed
+  MemTableIterator(const MemTableIterator&);
+  void operator=(const MemTableIterator&);
+};
+
+Iterator* MemTable::NewIterator(const ReadOptions& options,
+                                bool enforce_total_order, Arena* arena) {
+  if (arena == nullptr) {
+    return new MemTableIterator(*this, options, enforce_total_order, nullptr);
+  } else {
+    auto mem = arena->AllocateAligned(sizeof(MemTableIterator));
+    return new (mem)
+        MemTableIterator(*this, options, enforce_total_order, arena);
+  }
+}
+
+port::RWMutex* MemTable::GetLock(const Slice& key) {
+  static murmur_hash hash;
+  return &locks_[hash(key) % locks_.size()];
+}
+
+void MemTable::Add(SequenceNumber s, ValueType type,
+                   const Slice& key, /* user key */
+                   const Slice& value) {
+  // Format of an entry is concatenation of:
+  //  key_size     : varint32 of internal_key.size()
+  //  key bytes    : char[internal_key.size()]
+  //  value_size   : varint32 of value.size()
+  //  value bytes  : char[value.size()]
+  size_t key_size = key.size();
+  size_t val_size = value.size();
+  size_t internal_key_size = key_size + 8;
+  const size_t encoded_len =
+      VarintLength(internal_key_size) + internal_key_size +
+      VarintLength(val_size) + val_size;
+  char* buf = nullptr;
+  KeyHandle handle = table_->Allocate(encoded_len, &buf);
+  assert(buf != nullptr);
+  char* p = EncodeVarint32(buf, internal_key_size);
+  memcpy(p, key.data(), key_size);
+  p += key_size;
+  EncodeFixed64(p, (s << 8) | type);
+  p += 8;
+  p = EncodeVarint32(p, val_size);
+  memcpy(p, value.data(), val_size);
+  assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len);
+  table_->Insert(handle);
+  num_entries_++;
+
+  if (prefix_bloom_) {
+    assert(prefix_extractor_);
+    prefix_bloom_->Add(prefix_extractor_->Transform(key));
+  }
+
+  // The first sequence number inserted into the memtable
+  assert(first_seqno_ == 0 || s > first_seqno_);
+  if (first_seqno_ == 0) {
+    first_seqno_ = s;
+  }
+
+  should_flush_ = ShouldFlushNow();
+}
+
+// Callback from MemTable::Get()
+namespace {
+
+struct Saver {
+  Status* status;
+  const LookupKey* key;
+  bool* found_final_value;  // Is value set correctly? Used by KeyMayExist
+  bool* merge_in_progress;
+  std::string* value;
+  const MergeOperator* merge_operator;
+  // the merge operations encountered;
+  MergeContext* merge_context;
+  MemTable* mem;
+  Logger* logger;
+  Statistics* statistics;
+  bool inplace_update_support;
+};
+}  // namespace
+
+static bool SaveValue(void* arg, const char* entry) {
+  Saver* s = reinterpret_cast<Saver*>(arg);
+  MergeContext* merge_context = s->merge_context;
+  const MergeOperator* merge_operator = s->merge_operator;
+
+  assert(s != nullptr && merge_context != nullptr);
+
+  // entry format is:
+  //    klength  varint32
+  //    userkey  char[klength-8]
+  //    tag      uint64
+  //    vlength  varint32
+  //    value    char[vlength]
+  // Check that it belongs to same user key.  We do not check the
+  // sequence number since the Seek() call above should have skipped
+  // all entries with overly large sequence numbers.
+  uint32_t key_length;
+  const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+  if (s->mem->GetInternalKeyComparator().user_comparator()->Compare(
+          Slice(key_ptr, key_length - 8), s->key->user_key()) == 0) {
+    // Correct user key
+    const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
+    switch (static_cast<ValueType>(tag & 0xff)) {
+      case kTypeValue: {
+        if (s->inplace_update_support) {
+          s->mem->GetLock(s->key->user_key())->ReadLock();
+        }
+        Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
+        *(s->status) = Status::OK();
+        if (*(s->merge_in_progress)) {
+          assert(merge_operator);
+          if (!merge_operator->FullMerge(s->key->user_key(), &v,
+                                         merge_context->GetOperands(), s->value,
+                                         s->logger)) {
+            RecordTick(s->statistics, NUMBER_MERGE_FAILURES);
+            *(s->status) =
+                Status::Corruption("Error: Could not perform merge.");
+          }
+        } else {
+          s->value->assign(v.data(), v.size());
+        }
+        if (s->inplace_update_support) {
+          s->mem->GetLock(s->key->user_key())->Unlock();
+        }
+        *(s->found_final_value) = true;
+        return false;
+      }
+      case kTypeDeletion: {
+        if (*(s->merge_in_progress)) {
+          assert(merge_operator);
+          *(s->status) = Status::OK();
+          if (!merge_operator->FullMerge(s->key->user_key(), nullptr,
+                                         merge_context->GetOperands(), s->value,
+                                         s->logger)) {
+            RecordTick(s->statistics, NUMBER_MERGE_FAILURES);
+            *(s->status) =
+                Status::Corruption("Error: Could not perform merge.");
+          }
+        } else {
+          *(s->status) = Status::NotFound();
+        }
+        *(s->found_final_value) = true;
+        return false;
+      }
+      case kTypeMerge: {
+        std::string merge_result;  // temporary area for merge results later
+        Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
+        *(s->merge_in_progress) = true;
+        merge_context->PushOperand(v);
+        return true;
+      }
+      default:
+        assert(false);
+        return true;
+    }
+  }
+
+  // s->state could be Corrupt, merge or notfound
+  return false;
+}
+
+bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
+                   MergeContext& merge_context, const Options& options) {
+  PERF_TIMER_AUTO(get_from_memtable_time);
+
+  Slice user_key = key.user_key();
+  bool found_final_value = false;
+  bool merge_in_progress = s->IsMergeInProgress();
+
+  if (prefix_bloom_ &&
+      !prefix_bloom_->MayContain(prefix_extractor_->Transform(user_key))) {
+    // iter is null if prefix bloom says the key does not exist
+  } else {
+    Saver saver;
+    saver.status = s;
+    saver.found_final_value = &found_final_value;
+    saver.merge_in_progress = &merge_in_progress;
+    saver.key = &key;
+    saver.value = value;
+    saver.status = s;
+    saver.mem = this;
+    saver.merge_context = &merge_context;
+    saver.merge_operator = options.merge_operator.get();
+    saver.logger = options.info_log.get();
+    saver.inplace_update_support = options.inplace_update_support;
+    saver.statistics = options.statistics.get();
+    table_->Get(key, &saver, SaveValue);
+  }
+
+  // No change to value, since we have not yet found a Put/Delete
+  if (!found_final_value && merge_in_progress) {
+    *s = Status::MergeInProgress("");
+  }
+  PERF_TIMER_STOP(get_from_memtable_time);
+  PERF_COUNTER_ADD(get_from_memtable_count, 1);
+  return found_final_value;
+}
+
+void MemTable::Update(SequenceNumber seq,
+                      const Slice& key,
+                      const Slice& value) {
+  LookupKey lkey(key, seq);
+  Slice mem_key = lkey.memtable_key();
+
+  std::unique_ptr<MemTableRep::Iterator> iter(
+    table_->GetIterator(lkey.user_key()));
+  iter->Seek(lkey.internal_key(), mem_key.data());
+
+  if (iter->Valid()) {
+    // entry format is:
+    //    key_length  varint32
+    //    userkey  char[klength-8]
+    //    tag      uint64
+    //    vlength  varint32
+    //    value    char[vlength]
+    // Check that it belongs to same user key.  We do not check the
+    // sequence number since the Seek() call above should have skipped
+    // all entries with overly large sequence numbers.
+    const char* entry = iter->key();
+    uint32_t key_length = 0;
+    const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+    if (comparator_.comparator.user_comparator()->Compare(
+        Slice(key_ptr, key_length - 8), lkey.user_key()) == 0) {
+      // Correct user key
+      const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
+      switch (static_cast<ValueType>(tag & 0xff)) {
+        case kTypeValue: {
+          Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
+          uint32_t prev_size = prev_value.size();
+          uint32_t new_size = value.size();
+
+          // Update value, if new value size  <= previous value size
+          if (new_size <= prev_size ) {
+            char* p = EncodeVarint32(const_cast<char*>(key_ptr) + key_length,
+                                     new_size);
+            WriteLock wl(GetLock(lkey.user_key()));
+            memcpy(p, value.data(), value.size());
+            assert((unsigned)((p + value.size()) - entry) ==
+                   (unsigned)(VarintLength(key_length) + key_length +
+                              VarintLength(value.size()) + value.size()));
+            return;
+          }
+        }
+        default:
+          // If the latest value is kTypeDeletion, kTypeMerge or kTypeLogData
+          // we don't have enough space for update inplace
+            Add(seq, kTypeValue, key, value);
+            return;
+      }
+    }
+  }
+
+  // key doesn't exist
+  Add(seq, kTypeValue, key, value);
+}
+
+bool MemTable::UpdateCallback(SequenceNumber seq,
+                              const Slice& key,
+                              const Slice& delta,
+                              const Options& options) {
+  LookupKey lkey(key, seq);
+  Slice memkey = lkey.memtable_key();
+
+  std::unique_ptr<MemTableRep::Iterator> iter(
+    table_->GetIterator(lkey.user_key()));
+  iter->Seek(lkey.internal_key(), memkey.data());
+
+  if (iter->Valid()) {
+    // entry format is:
+    //    key_length  varint32
+    //    userkey  char[klength-8]
+    //    tag      uint64
+    //    vlength  varint32
+    //    value    char[vlength]
+    // Check that it belongs to same user key.  We do not check the
+    // sequence number since the Seek() call above should have skipped
+    // all entries with overly large sequence numbers.
+    const char* entry = iter->key();
+    uint32_t key_length = 0;
+    const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+    if (comparator_.comparator.user_comparator()->Compare(
+        Slice(key_ptr, key_length - 8), lkey.user_key()) == 0) {
+      // Correct user key
+      const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
+      switch (static_cast<ValueType>(tag & 0xff)) {
+        case kTypeValue: {
+          Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
+          uint32_t  prev_size = prev_value.size();
+
+          char* prev_buffer = const_cast<char*>(prev_value.data());
+          uint32_t  new_prev_size = prev_size;
+
+          std::string str_value;
+          WriteLock wl(GetLock(lkey.user_key()));
+          auto status = options.inplace_callback(prev_buffer, &new_prev_size,
+                                                    delta, &str_value);
+          if (status == UpdateStatus::UPDATED_INPLACE) {
+            // Value already updated by callback.
+            assert(new_prev_size <= prev_size);
+            if (new_prev_size < prev_size) {
+              // overwrite the new prev_size
+              char* p = EncodeVarint32(const_cast<char*>(key_ptr) + key_length,
+                                       new_prev_size);
+              if (VarintLength(new_prev_size) < VarintLength(prev_size)) {
+                // shift the value buffer as well.
+                memcpy(p, prev_buffer, new_prev_size);
+              }
+            }
+            RecordTick(options.statistics.get(), NUMBER_KEYS_UPDATED);
+            should_flush_ = ShouldFlushNow();
+            return true;
+          } else if (status == UpdateStatus::UPDATED) {
+            Add(seq, kTypeValue, key, Slice(str_value));
+            RecordTick(options.statistics.get(), NUMBER_KEYS_WRITTEN);
+            should_flush_ = ShouldFlushNow();
+            return true;
+          } else if (status == UpdateStatus::UPDATE_FAILED) {
+            // No action required. Return.
+            should_flush_ = ShouldFlushNow();
+            return true;
+          }
+        }
+        default:
+          break;
+      }
+    }
+  }
+  // If the latest value is not kTypeValue
+  // or key doesn't exist
+  return false;
+}
+
+size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) {
+  Slice memkey = key.memtable_key();
+
+  // A total ordered iterator is costly for some memtablerep (prefix aware
+  // reps). By passing in the user key, we allow efficient iterator creation.
+  // The iterator only needs to be ordered within the same user key.
+  std::unique_ptr<MemTableRep::Iterator> iter(
+      table_->GetIterator(key.user_key()));
+  iter->Seek(key.internal_key(), memkey.data());
+
+  size_t num_successive_merges = 0;
+
+  for (; iter->Valid(); iter->Next()) {
+    const char* entry = iter->key();
+    uint32_t key_length = 0;
+    const char* iter_key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+    if (comparator_.comparator.user_comparator()->Compare(
+            Slice(iter_key_ptr, key_length - 8), key.user_key()) != 0) {
+      break;
+    }
+
+    const uint64_t tag = DecodeFixed64(iter_key_ptr + key_length - 8);
+    if (static_cast<ValueType>(tag & 0xff) != kTypeMerge) {
+      break;
+    }
+
+    ++num_successive_merges;
+  }
+
+  return num_successive_merges;
+}
+
+void MemTableRep::Get(const LookupKey& k, void* callback_args,
+                      bool (*callback_func)(void* arg, const char* entry)) {
+  auto iter = GetIterator(k.user_key());
+  for (iter->Seek(k.internal_key(), k.memtable_key().data());
+       iter->Valid() && callback_func(callback_args, iter->key());
+       iter->Next()) {
+  }
+}
+
+}  // namespace rocksdb
--- a/db/memtable.h
+++ b/db/memtable.h
@@ -0,0 +1,222 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <string>
+#include <memory>
+#include <deque>
+#include "db/dbformat.h"
+#include "db/skiplist.h"
+#include "db/version_edit.h"
+#include "rocksdb/db.h"
+#include "rocksdb/memtablerep.h"
+#include "util/arena.h"
+#include "util/dynamic_bloom.h"
+
+namespace rocksdb {
+
+class Arena;
+class Mutex;
+class MemTableIterator;
+class MergeContext;
+
+class MemTable {
+ public:
+  struct KeyComparator : public MemTableRep::KeyComparator {
+    const InternalKeyComparator comparator;
+    explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { }
+    virtual int operator()(const char* prefix_len_key1,
+                           const char* prefix_len_key2) const;
+    virtual int operator()(const char* prefix_len_key,
+                           const Slice& key) const override;
+  };
+
+  // MemTables are reference counted.  The initial reference count
+  // is zero and the caller must call Ref() at least once.
+  explicit MemTable(const InternalKeyComparator& comparator,
+                    const Options& options);
+
+  ~MemTable();
+
+  // Increase reference count.
+  void Ref() { ++refs_; }
+
+  // Drop reference count.
+  // If the refcount goes to zero return this memtable, otherwise return null
+  MemTable* Unref() {
+    --refs_;
+    assert(refs_ >= 0);
+    if (refs_ <= 0) {
+      return this;
+    }
+    return nullptr;
+  }
+
+  // Returns an estimate of the number of bytes of data in use by this
+  // data structure.
+  //
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
+  size_t ApproximateMemoryUsage();
+
+  // This method heuristically determines if the memtable should continue to
+  // host more data.
+  bool ShouldFlush() const { return should_flush_; }
+
+  // Return an iterator that yields the contents of the memtable.
+  //
+  // The caller must ensure that the underlying MemTable remains live
+  // while the returned iterator is live.  The keys returned by this
+  // iterator are internal keys encoded by AppendInternalKey in the
+  // db/dbformat.{h,cc} module.
+  //
+  // By default, it returns an iterator for prefix seek if prefix_extractor
+  // is configured in Options.
+  // arena: If not null, the arena needs to be used to allocate the Iterator.
+  //        Calling ~Iterator of the iterator will destroy all the states but
+  //        those allocated in arena.
+  Iterator* NewIterator(const ReadOptions& options,
+                        bool enforce_total_order = false,
+                        Arena* arena = nullptr);
+
+  // Add an entry into memtable that maps key to value at the
+  // specified sequence number and with the specified type.
+  // Typically value will be empty if type==kTypeDeletion.
+  void Add(SequenceNumber seq, ValueType type,
+           const Slice& key,
+           const Slice& value);
+
+  // If memtable contains a value for key, store it in *value and return true.
+  // If memtable contains a deletion for key, store a NotFound() error
+  // in *status and return true.
+  // If memtable contains Merge operation as the most recent entry for a key,
+  //   and the merge process does not stop (not reaching a value or delete),
+  //   prepend the current merge operand to *operands.
+  //   store MergeInProgress in s, and return false.
+  // Else, return false.
+  bool Get(const LookupKey& key, std::string* value, Status* s,
+           MergeContext& merge_context, const Options& options);
+
+  // Attempts to update the new_value inplace, else does normal Add
+  // Pseudocode
+  //   if key exists in current memtable && prev_value is of type kTypeValue
+  //     if new sizeof(new_value) <= sizeof(prev_value)
+  //       update inplace
+  //     else add(key, new_value)
+  //   else add(key, new_value)
+  void Update(SequenceNumber seq,
+              const Slice& key,
+              const Slice& value);
+
+  // If prev_value for key exits, attempts to update it inplace.
+  // else returns false
+  // Pseudocode
+  //   if key exists in current memtable && prev_value is of type kTypeValue
+  //     new_value = delta(prev_value)
+  //     if sizeof(new_value) <= sizeof(prev_value)
+  //       update inplace
+  //     else add(key, new_value)
+  //   else return false
+  bool UpdateCallback(SequenceNumber seq,
+                      const Slice& key,
+                      const Slice& delta,
+                      const Options& options);
+
+  // Returns the number of successive merge entries starting from the newest
+  // entry for the key up to the last non-merge entry or last entry for the
+  // key in the memtable.
+  size_t CountSuccessiveMergeEntries(const LookupKey& key);
+
+  // Get total number of entries in the mem table.
+  uint64_t GetNumEntries() const { return num_entries_; }
+
+  // Returns the edits area that is needed for flushing the memtable
+  VersionEdit* GetEdits() { return &edit_; }
+
+  // Returns the sequence number of the first element that was inserted
+  // into the memtable
+  SequenceNumber GetFirstSequenceNumber() { return first_seqno_; }
+
+  // Returns the next active logfile number when this memtable is about to
+  // be flushed to storage
+  uint64_t GetNextLogNumber() { return mem_next_logfile_number_; }
+
+  // Sets the next active logfile number when this memtable is about to
+  // be flushed to storage
+  void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; }
+
+  // Notify the underlying storage that no more items will be added
+  void MarkImmutable() { table_->MarkReadOnly(); }
+
+  // return true if the current MemTableRep supports merge operator.
+  bool IsMergeOperatorSupported() const {
+    return table_->IsMergeOperatorSupported();
+  }
+
+  // return true if the current MemTableRep supports snapshots.
+  bool IsSnapshotSupported() const { return table_->IsSnapshotSupported(); }
+
+  // Get the lock associated for the key
+  port::RWMutex* GetLock(const Slice& key);
+
+  const InternalKeyComparator& GetInternalKeyComparator() const {
+    return comparator_.comparator;
+  }
+
+  const Arena& TEST_GetArena() const { return arena_; }
+
+ private:
+  // Dynamically check if we can add more incoming entries.
+  bool ShouldFlushNow() const;
+
+  friend class MemTableIterator;
+  friend class MemTableBackwardIterator;
+  friend class MemTableList;
+
+  KeyComparator comparator_;
+  int refs_;
+  const size_t kArenaBlockSize;
+  const size_t kWriteBufferSize;
+  Arena arena_;
+  unique_ptr<MemTableRep> table_;
+
+  uint64_t num_entries_;
+
+  // These are used to manage memtable flushes to storage
+  bool flush_in_progress_; // started the flush
+  bool flush_completed_;   // finished the flush
+  uint64_t file_number_;    // filled up after flush is complete
+
+  // The updates to be applied to the transaction log when this
+  // memtable is flushed to storage.
+  VersionEdit edit_;
+
+  // The sequence number of the kv that was inserted first
+  SequenceNumber first_seqno_;
+
+  // The log files earlier than this number can be deleted.
+  uint64_t mem_next_logfile_number_;
+
+  // rw locks for inplace updates
+  std::vector<port::RWMutex> locks_;
+
+  // No copying allowed
+  MemTable(const MemTable&);
+  void operator=(const MemTable&);
+
+  const SliceTransform* const prefix_extractor_;
+  std::unique_ptr<DynamicBloom> prefix_bloom_;
+
+  // a flag indicating if a memtable has met the criteria to flush
+  bool should_flush_;
+};
+
+extern const char* EncodeKey(std::string* scratch, const Slice& target);
+
+}  // namespace rocksdb
--- a/db/memtable_list.cc
+++ b/db/memtable_list.cc
@@ -0,0 +1,286 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "db/memtable_list.h"
+
+#include <string>
+#include "rocksdb/db.h"
+#include "db/memtable.h"
+#include "db/version_set.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "table/merger.h"
+#include "util/coding.h"
+#include "util/log_buffer.h"
+
+namespace rocksdb {
+
+class InternalKeyComparator;
+class Mutex;
+class VersionSet;
+
+MemTableListVersion::MemTableListVersion(MemTableListVersion* old) {
+  if (old != nullptr) {
+    memlist_ = old->memlist_;
+    size_ = old->size_;
+    for (auto& m : memlist_) {
+      m->Ref();
+    }
+  }
+}
+
+void MemTableListVersion::Ref() { ++refs_; }
+
+void MemTableListVersion::Unref(autovector<MemTable*>* to_delete) {
+  assert(refs_ >= 1);
+  --refs_;
+  if (refs_ == 0) {
+    // if to_delete is equal to nullptr it means we're confident
+    // that refs_ will not be zero
+    assert(to_delete != nullptr);
+    for (const auto& m : memlist_) {
+      MemTable* x = m->Unref();
+      if (x != nullptr) {
+        to_delete->push_back(x);
+      }
+    }
+    delete this;
+  }
+}
+
+int MemTableListVersion::size() const { return size_; }
+
+// Returns the total number of memtables in the list
+int MemTableList::size() const {
+  assert(num_flush_not_started_ <= current_->size_);
+  return current_->size_;
+}
+
+// Search all the memtables starting from the most recent one.
+// Return the most recent value found, if any.
+// Operands stores the list of merge operations to apply, so far.
+bool MemTableListVersion::Get(const LookupKey& key, std::string* value,
+                              Status* s, MergeContext& merge_context,
+                              const Options& options) {
+  for (auto& memtable : memlist_) {
+    if (memtable->Get(key, value, s, merge_context, options)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+void MemTableListVersion::AddIterators(const ReadOptions& options,
+                                       std::vector<Iterator*>* iterator_list) {
+  for (auto& m : memlist_) {
+    iterator_list->push_back(m->NewIterator(options));
+  }
+}
+
+void MemTableListVersion::AddIterators(
+    const ReadOptions& options, MergeIteratorBuilder* merge_iter_builder) {
+  for (auto& m : memlist_) {
+    merge_iter_builder->AddIterator(
+        m->NewIterator(options, merge_iter_builder->GetArena()));
+  }
+}
+
+uint64_t MemTableListVersion::GetTotalNumEntries() const {
+  uint64_t total_num = 0;
+  for (auto& m : memlist_) {
+    total_num += m->GetNumEntries();
+  }
+  return total_num;
+}
+
+// caller is responsible for referencing m
+void MemTableListVersion::Add(MemTable* m) {
+  assert(refs_ == 1);  // only when refs_ == 1 is MemTableListVersion mutable
+  memlist_.push_front(m);
+  ++size_;
+}
+
+// caller is responsible for unreferencing m
+void MemTableListVersion::Remove(MemTable* m) {
+  assert(refs_ == 1);  // only when refs_ == 1 is MemTableListVersion mutable
+  memlist_.remove(m);
+  --size_;
+}
+
+// Returns true if there is at least one memtable on which flush has
+// not yet started.
+bool MemTableList::IsFlushPending() const {
+  if ((flush_requested_ && num_flush_not_started_ >= 1) ||
+      (num_flush_not_started_ >= min_write_buffer_number_to_merge_)) {
+    assert(imm_flush_needed.NoBarrier_Load() != nullptr);
+    return true;
+  }
+  return false;
+}
+
+// Returns the memtables that need to be flushed.
+void MemTableList::PickMemtablesToFlush(autovector<MemTable*>* ret) {
+  const auto& memlist = current_->memlist_;
+  for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) {
+    MemTable* m = *it;
+    if (!m->flush_in_progress_) {
+      assert(!m->flush_completed_);
+      num_flush_not_started_--;
+      if (num_flush_not_started_ == 0) {
+        imm_flush_needed.Release_Store(nullptr);
+      }
+      m->flush_in_progress_ = true;  // flushing will start very soon
+      ret->push_back(m);
+    }
+  }
+  flush_requested_ = false;  // start-flush request is complete
+}
+
+void MemTableList::RollbackMemtableFlush(const autovector<MemTable*>& mems,
+                                         uint64_t file_number,
+                                         std::set<uint64_t>* pending_outputs) {
+  assert(!mems.empty());
+
+  // If the flush was not successful, then just reset state.
+  // Maybe a suceeding attempt to flush will be successful.
+  for (MemTable* m : mems) {
+    assert(m->flush_in_progress_);
+    assert(m->file_number_ == 0);
+
+    m->flush_in_progress_ = false;
+    m->flush_completed_ = false;
+    m->edit_.Clear();
+    num_flush_not_started_++;
+  }
+  pending_outputs->erase(file_number);
+  imm_flush_needed.Release_Store(reinterpret_cast<void *>(1));
+}
+
+// Record a successful flush in the manifest file
+Status MemTableList::InstallMemtableFlushResults(
+    ColumnFamilyData* cfd, const autovector<MemTable*>& mems, VersionSet* vset,
+    port::Mutex* mu, Logger* info_log, uint64_t file_number,
+    std::set<uint64_t>& pending_outputs, autovector<MemTable*>* to_delete,
+    Directory* db_directory, LogBuffer* log_buffer) {
+  mu->AssertHeld();
+
+  // flush was sucessful
+  for (size_t i = 0; i < mems.size(); ++i) {
+    // All the edits are associated with the first memtable of this batch.
+    assert(i == 0 || mems[i]->GetEdits()->NumEntries() == 0);
+
+    mems[i]->flush_completed_ = true;
+    mems[i]->file_number_ = file_number;
+  }
+
+  // if some other thread is already commiting, then return
+  Status s;
+  if (commit_in_progress_) {
+    return s;
+  }
+
+  // Only a single thread can be executing this piece of code
+  commit_in_progress_ = true;
+
+  // scan all memtables from the earliest, and commit those
+  // (in that order) that have finished flushing. Memetables
+  // are always committed in the order that they were created.
+  while (!current_->memlist_.empty() && s.ok()) {
+    MemTable* m = current_->memlist_.back();  // get the last element
+    if (!m->flush_completed_) {
+      break;
+    }
+
+    LogToBuffer(log_buffer, "[%s] Level-0 commit table #%lu started",
+                cfd->GetName().c_str(), (unsigned long)m->file_number_);
+
+    // this can release and reacquire the mutex.
+    s = vset->LogAndApply(cfd, &m->edit_, mu, db_directory);
+
+    // we will be changing the version in the next code path,
+    // so we better create a new one, since versions are immutable
+    InstallNewVersion();
+
+    // All the later memtables that have the same filenum
+    // are part of the same batch. They can be committed now.
+    uint64_t mem_id = 1;  // how many memtables has been flushed.
+    do {
+      if (s.ok()) { // commit new state
+        LogToBuffer(log_buffer,
+                    "[%s] Level-0 commit table #%lu: memtable #%lu done",
+                    cfd->GetName().c_str(), (unsigned long)m->file_number_,
+                    (unsigned long)mem_id);
+        current_->Remove(m);
+        assert(m->file_number_ > 0);
+
+        // pending_outputs can be cleared only after the newly created file
+        // has been written to a committed version so that other concurrently
+        // executing compaction threads do not mistakenly assume that this
+        // file is not live.
+        pending_outputs.erase(m->file_number_);
+        if (m->Unref() != nullptr) {
+          to_delete->push_back(m);
+        }
+      } else {
+        //commit failed. setup state so that we can flush again.
+        Log(info_log,
+            "Level-0 commit table #%lu: memtable #%lu failed",
+            (unsigned long)m->file_number_,
+            (unsigned long)mem_id);
+        m->flush_completed_ = false;
+        m->flush_in_progress_ = false;
+        m->edit_.Clear();
+        num_flush_not_started_++;
+        pending_outputs.erase(m->file_number_);
+        m->file_number_ = 0;
+        imm_flush_needed.Release_Store((void *)1);
+      }
+      ++mem_id;
+    } while (!current_->memlist_.empty() && (m = current_->memlist_.back()) &&
+             m->file_number_ == file_number);
+  }
+  commit_in_progress_ = false;
+  return s;
+}
+
+// New memtables are inserted at the front of the list.
+void MemTableList::Add(MemTable* m) {
+  assert(current_->size_ >= num_flush_not_started_);
+  InstallNewVersion();
+  // this method is used to move mutable memtable into an immutable list.
+  // since mutable memtable is already refcounted by the DBImpl,
+  // and when moving to the imutable list we don't unref it,
+  // we don't have to ref the memtable here. we just take over the
+  // reference from the DBImpl.
+  current_->Add(m);
+  m->MarkImmutable();
+  num_flush_not_started_++;
+  if (num_flush_not_started_ == 1) {
+    imm_flush_needed.Release_Store((void *)1);
+  }
+}
+
+// Returns an estimate of the number of bytes of data in use.
+size_t MemTableList::ApproximateMemoryUsage() {
+  size_t size = 0;
+  for (auto& memtable : current_->memlist_) {
+    size += memtable->ApproximateMemoryUsage();
+  }
+  return size;
+}
+
+void MemTableList::InstallNewVersion() {
+  if (current_->refs_ == 1) {
+    // we're the only one using the version, just keep using it
+  } else {
+    // somebody else holds the current version, we need to create new one
+    MemTableListVersion* version = current_;
+    current_ = new MemTableListVersion(current_);
+    current_->Ref();
+    version->Unref();
+  }
+}
+
+}  // namespace rocksdb
--- a/db/memtable_list.h
+++ b/db/memtable_list.h
@@ -0,0 +1,156 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+
+#include <string>
+#include <list>
+#include <vector>
+#include <set>
+#include <deque>
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/iterator.h"
+
+#include "db/dbformat.h"
+#include "db/skiplist.h"
+#include "db/memtable.h"
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "util/autovector.h"
+#include "util/log_buffer.h"
+
+namespace rocksdb {
+
+class ColumnFamilyData;
+class InternalKeyComparator;
+class Mutex;
+class MergeIteratorBuilder;
+
+// keeps a list of immutable memtables in a vector. the list is immutable
+// if refcount is bigger than one. It is used as a state for Get() and
+// Iterator code paths
+class MemTableListVersion {
+ public:
+  explicit MemTableListVersion(MemTableListVersion* old = nullptr);
+
+  void Ref();
+  void Unref(autovector<MemTable*>* to_delete = nullptr);
+
+  int size() const;
+
+  // Search all the memtables starting from the most recent one.
+  // Return the most recent value found, if any.
+  bool Get(const LookupKey& key, std::string* value, Status* s,
+           MergeContext& merge_context, const Options& options);
+
+  void AddIterators(const ReadOptions& options,
+                    std::vector<Iterator*>* iterator_list);
+
+  void AddIterators(const ReadOptions& options,
+                    MergeIteratorBuilder* merge_iter_builder);
+
+  uint64_t GetTotalNumEntries() const;
+
+ private:
+  // REQUIRE: m is mutable memtable
+  void Add(MemTable* m);
+  // REQUIRE: m is mutable memtable
+  void Remove(MemTable* m);
+
+  friend class MemTableList;
+  std::list<MemTable*> memlist_;
+  int size_ = 0;
+  int refs_ = 0;
+};
+
+// This class stores references to all the immutable memtables.
+// The memtables are flushed to L0 as soon as possible and in
+// any order. If there are more than one immutable memtable, their
+// flushes can occur concurrently.  However, they are 'committed'
+// to the manifest in FIFO order to maintain correctness and
+// recoverability from a crash.
+class MemTableList {
+ public:
+  // A list of memtables.
+  explicit MemTableList(int min_write_buffer_number_to_merge)
+      : min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge),
+        current_(new MemTableListVersion()),
+        num_flush_not_started_(0),
+        commit_in_progress_(false),
+        flush_requested_(false) {
+    imm_flush_needed.Release_Store(nullptr);
+    current_->Ref();
+  }
+  ~MemTableList() {}
+
+  MemTableListVersion* current() { return current_; }
+
+  // so that background threads can detect non-nullptr pointer to
+  // determine whether there is anything more to start flushing.
+  port::AtomicPointer imm_flush_needed;
+
+  // Returns the total number of memtables in the list
+  int size() const;
+
+  // Returns true if there is at least one memtable on which flush has
+  // not yet started.
+  bool IsFlushPending() const;
+
+  // Returns the earliest memtables that needs to be flushed. The returned
+  // memtables are guaranteed to be in the ascending order of created time.
+  void PickMemtablesToFlush(autovector<MemTable*>* mems);
+
+  // Reset status of the given memtable list back to pending state so that
+  // they can get picked up again on the next round of flush.
+  void RollbackMemtableFlush(const autovector<MemTable*>& mems,
+                             uint64_t file_number,
+                             std::set<uint64_t>* pending_outputs);
+
+  // Commit a successful flush in the manifest file
+  Status InstallMemtableFlushResults(ColumnFamilyData* cfd,
+                                     const autovector<MemTable*>& m,
+                                     VersionSet* vset, port::Mutex* mu,
+                                     Logger* info_log, uint64_t file_number,
+                                     std::set<uint64_t>& pending_outputs,
+                                     autovector<MemTable*>* to_delete,
+                                     Directory* db_directory,
+                                     LogBuffer* log_buffer);
+
+  // New memtables are inserted at the front of the list.
+  // Takes ownership of the referenced held on *m by the caller of Add().
+  void Add(MemTable* m);
+
+  // Returns an estimate of the number of bytes of data in use.
+  size_t ApproximateMemoryUsage();
+
+  // Request a flush of all existing memtables to storage
+  void FlushRequested() { flush_requested_ = true; }
+
+  // Copying allowed
+  // MemTableList(const MemTableList&);
+  // void operator=(const MemTableList&);
+
+ private:
+  // DB mutex held
+  void InstallNewVersion();
+
+  int min_write_buffer_number_to_merge_;
+
+  MemTableListVersion* current_;
+
+  // the number of elements that still need flushing
+  int num_flush_not_started_;
+
+  // committing in progress
+  bool commit_in_progress_;
+
+  // Requested a flush of all memtables to storage
+  bool flush_requested_;
+
+};
+
+}  // namespace rocksdb
--- a/db/merge_context.h
+++ b/db/merge_context.h
@@ -0,0 +1,69 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+#include "db/dbformat.h"
+#include "rocksdb/slice.h"
+#include <string>
+#include <deque>
+
+namespace rocksdb {
+
+const std::deque<std::string> empty_operand_list;
+
+// The merge context for merging a user key.
+// When doing a Get(), DB will create such a class and pass it when
+// issuing Get() operation to memtables and version_set. The operands
+// will be fetched from the context when issuing partial of full merge.
+class MergeContext {
+public:
+  // Clear all the operands
+  void Clear() {
+    if (operand_list) {
+      operand_list->clear();
+    }
+  }
+  // Replace all operands with merge_result, which are expected to be the
+  // merge result of them.
+  void PushPartialMergeResult(std::string& merge_result) {
+    assert (operand_list);
+    operand_list->clear();
+    operand_list->push_front(std::move(merge_result));
+  }
+  // Push a merge operand
+  void PushOperand(const Slice& operand_slice) {
+    Initialize();
+    operand_list->push_front(operand_slice.ToString());
+  }
+  // return total number of operands in the list
+  size_t GetNumOperands() const {
+    if (!operand_list) {
+      return 0;
+    }
+    return operand_list->size();
+  }
+  // Get the operand at the index.
+  Slice GetOperand(int index) const {
+    assert (operand_list);
+    return (*operand_list)[index];
+  }
+  // Return all the operands.
+  const std::deque<std::string>& GetOperands() const {
+    if (!operand_list) {
+      return empty_operand_list;
+    }
+    return *operand_list;
+  }
+private:
+  void Initialize() {
+    if (!operand_list) {
+      operand_list.reset(new std::deque<std::string>());
+    }
+  }
+  std::unique_ptr<std::deque<std::string>> operand_list;
+};
+
+} // namespace rocksdb
+
--- a/db/merge_helper.cc
+++ b/db/merge_helper.cc
@@ -0,0 +1,209 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "merge_helper.h"
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/merge_operator.h"
+#include "util/statistics.h"
+#include <string>
+#include <stdio.h>
+
+namespace rocksdb {
+
+// PRE:  iter points to the first merge type entry
+// POST: iter points to the first entry beyond the merge process (or the end)
+//       keys_, operands_ are updated to reflect the merge result.
+//       keys_ stores the list of keys encountered while merging.
+//       operands_ stores the list of merge operands encountered while merging.
+//       keys_[i] corresponds to operands_[i] for each i.
+void MergeHelper::MergeUntil(Iterator* iter, SequenceNumber stop_before,
+                             bool at_bottom, Statistics* stats, int* steps) {
+  // Get a copy of the internal key, before it's invalidated by iter->Next()
+  // Also maintain the list of merge operands seen.
+  keys_.clear();
+  operands_.clear();
+  keys_.push_front(iter->key().ToString());
+  operands_.push_front(iter->value().ToString());
+
+  success_ = false;   // Will become true if we hit Put/Delete or bottom
+
+  // We need to parse the internal key again as the parsed key is
+  // backed by the internal key!
+  // Assume no internal key corruption as it has been successfully parsed
+  // by the caller.
+  // Invariant: keys_.back() will not change. Hence, orig_ikey is always valid.
+  ParsedInternalKey orig_ikey;
+  ParseInternalKey(keys_.back(), &orig_ikey);
+
+  bool hit_the_next_user_key = false;
+  std::string merge_result;  // Temporary value for merge results
+  if (steps) {
+    ++(*steps);
+  }
+  for (iter->Next(); iter->Valid(); iter->Next()) {
+    ParsedInternalKey ikey;
+    assert(operands_.size() >= 1);        // Should be invariants!
+    assert(keys_.size() == operands_.size());
+
+    if (!ParseInternalKey(iter->key(), &ikey)) {
+      // stop at corrupted key
+      if (assert_valid_internal_key_) {
+        assert(!"corrupted internal key is not expected");
+      }
+      break;
+    }
+
+    if (user_comparator_->Compare(ikey.user_key, orig_ikey.user_key) != 0) {
+      // hit a different user key, stop right here
+      hit_the_next_user_key = true;
+      break;
+    }
+
+    if (stop_before && ikey.sequence <= stop_before) {
+      // hit an entry that's visible by the previous snapshot, can't touch that
+      break;
+    }
+
+    // At this point we are guaranteed that we need to process this key.
+
+    if (kTypeDeletion == ikey.type) {
+      // hit a delete
+      //   => merge nullptr with operands_
+      //   => store result in operands_.back() (and update keys_.back())
+      //   => change the entry type to kTypeValue for keys_.back()
+      // We are done! Return a success if the merge passes.
+      success_ = user_merge_operator_->FullMerge(ikey.user_key, nullptr,
+                                                 operands_, &merge_result,
+                                                 logger_);
+
+      // We store the result in keys_.back() and operands_.back()
+      // if nothing went wrong (i.e.: no operand corruption on disk)
+      if (success_) {
+        std::string& key = keys_.back();  // The original key encountered
+        orig_ikey.type = kTypeValue;
+        UpdateInternalKey(&key[0], key.size(),
+                          orig_ikey.sequence, orig_ikey.type);
+        swap(operands_.back(), merge_result);
+      } else {
+        RecordTick(stats, NUMBER_MERGE_FAILURES);
+      }
+
+      // move iter to the next entry (before doing anything else)
+      iter->Next();
+      if (steps) {
+        ++(*steps);
+      }
+      return;
+    }
+
+    if (kTypeValue == ikey.type) {
+      // hit a put
+      //   => merge the put value with operands_
+      //   => store result in operands_.back() (and update keys_.back())
+      //   => change the entry type to kTypeValue for keys_.back()
+      // We are done! Success!
+      const Slice value = iter->value();
+      success_ = user_merge_operator_->FullMerge(ikey.user_key, &value,
+                                                 operands_, &merge_result,
+                                                 logger_);
+
+      // We store the result in keys_.back() and operands_.back()
+      // if nothing went wrong (i.e.: no operand corruption on disk)
+      if (success_) {
+        std::string& key = keys_.back();  // The original key encountered
+        orig_ikey.type = kTypeValue;
+        UpdateInternalKey(&key[0], key.size(),
+                          orig_ikey.sequence, orig_ikey.type);
+        swap(operands_.back(), merge_result);
+      } else {
+        RecordTick(stats, NUMBER_MERGE_FAILURES);
+      }
+
+      // move iter to the next entry
+      iter->Next();
+      if (steps) {
+        ++(*steps);
+      }
+      return;
+    }
+
+    if (kTypeMerge == ikey.type) {
+      // hit a merge
+      //   => merge the operand into the front of the operands_ list
+      //   => use the user's associative merge function to determine how.
+      //   => then continue because we haven't yet seen a Put/Delete.
+      assert(!operands_.empty()); // Should have at least one element in it
+
+      // keep queuing keys and operands until we either meet a put / delete
+      // request or later did a partial merge.
+      keys_.push_front(iter->key().ToString());
+      operands_.push_front(iter->value().ToString());
+      if (steps) {
+        ++(*steps);
+      }
+    }
+  }
+
+  // We are sure we have seen this key's entire history if we are at the
+  // last level and exhausted all internal keys of this user key.
+  // NOTE: !iter->Valid() does not necessarily mean we hit the
+  // beginning of a user key, as versions of a user key might be
+  // split into multiple files (even files on the same level)
+  // and some files might not be included in the compaction/merge.
+  //
+  // There are also cases where we have seen the root of history of this
+  // key without being sure of it. Then, we simply miss the opportunity
+  // to combine the keys. Since VersionSet::SetupOtherInputs() always makes
+  // sure that all merge-operands on the same level get compacted together,
+  // this will simply lead to these merge operands moving to the next level.
+  //
+  // So, we only perform the following logic (to merge all operands together
+  // without a Put/Delete) if we are certain that we have seen the end of key.
+  bool surely_seen_the_beginning = hit_the_next_user_key && at_bottom;
+  if (surely_seen_the_beginning) {
+    // do a final merge with nullptr as the existing value and say
+    // bye to the merge type (it's now converted to a Put)
+    assert(kTypeMerge == orig_ikey.type);
+    assert(operands_.size() >= 1);
+    assert(operands_.size() == keys_.size());
+    success_ = user_merge_operator_->FullMerge(orig_ikey.user_key, nullptr,
+                                               operands_, &merge_result,
+                                               logger_);
+
+    if (success_) {
+      std::string& key = keys_.back();  // The original key encountered
+      orig_ikey.type = kTypeValue;
+      UpdateInternalKey(&key[0], key.size(),
+                        orig_ikey.sequence, orig_ikey.type);
+
+      // The final value() is always stored in operands_.back()
+      swap(operands_.back(),merge_result);
+    } else {
+      RecordTick(stats, NUMBER_MERGE_FAILURES);
+      // Do nothing if not success_. Leave keys() and operands() as they are.
+    }
+  } else {
+    // We haven't seen the beginning of the key nor a Put/Delete.
+    // Attempt to use the user's associative merge function to
+    // merge the stacked merge operands into a single operand.
+
+    if (operands_.size() >= 2 &&
+        operands_.size() >= min_partial_merge_operands_ &&
+        user_merge_operator_->PartialMergeMulti(
+            orig_ikey.user_key,
+            std::deque<Slice>(operands_.begin(), operands_.end()),
+            &merge_result, logger_)) {
+      // Merging of operands (associative merge) was successful.
+      // Replace operands with the merge result
+      operands_.clear();
+      operands_.push_front(std::move(merge_result));
+      keys_.erase(keys_.begin(), keys_.end() - 1);
+    }
+  }
+}
+
+} // namespace rocksdb
--- a/db/merge_helper.h
+++ b/db/merge_helper.h
@@ -0,0 +1,105 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#ifndef MERGE_HELPER_H
+#define MERGE_HELPER_H
+
+#include "db/dbformat.h"
+#include "rocksdb/slice.h"
+#include <string>
+#include <deque>
+
+namespace rocksdb {
+
+class Comparator;
+class Iterator;
+class Logger;
+class MergeOperator;
+class Statistics;
+
+class MergeHelper {
+ public:
+  MergeHelper(const Comparator* user_comparator,
+              const MergeOperator* user_merge_operator, Logger* logger,
+              unsigned min_partial_merge_operands,
+              bool assert_valid_internal_key)
+      : user_comparator_(user_comparator),
+        user_merge_operator_(user_merge_operator),
+        logger_(logger),
+        min_partial_merge_operands_(min_partial_merge_operands),
+        assert_valid_internal_key_(assert_valid_internal_key),
+        keys_(),
+        operands_(),
+        success_(false) {}
+
+  // Merge entries until we hit
+  //     - a corrupted key
+  //     - a Put/Delete,
+  //     - a different user key,
+  //     - a specific sequence number (snapshot boundary),
+  //  or - the end of iteration
+  // iter: (IN)  points to the first merge type entry
+  //       (OUT) points to the first entry not included in the merge process
+  // stop_before: (IN) a sequence number that merge should not cross.
+  //                   0 means no restriction
+  // at_bottom:   (IN) true if the iterator covers the bottem level, which means
+  //                   we could reach the start of the history of this user key.
+  void MergeUntil(Iterator* iter, SequenceNumber stop_before = 0,
+                  bool at_bottom = false, Statistics* stats = nullptr,
+                  int* steps = nullptr);
+
+  // Query the merge result
+  // These are valid until the next MergeUntil call
+  // If the merging was successful:
+  //   - IsSuccess() will be true
+  //   - key() will have the latest sequence number of the merges.
+  //           The type will be Put or Merge. See IMPORTANT 1 note, below.
+  //   - value() will be the result of merging all the operands together
+  //   - The user should ignore keys() and values().
+  //
+  //   IMPORTANT 1: the key type could change after the MergeUntil call.
+  //        Put/Delete + Merge + ... + Merge => Put
+  //        Merge + ... + Merge => Merge
+  //
+  // If the merge operator is not associative, and if a Put/Delete is not found
+  // then the merging will be unsuccessful. In this case:
+  //   - IsSuccess() will be false
+  //   - keys() contains the list of internal keys seen in order of iteration.
+  //   - values() contains the list of values (merges) seen in the same order.
+  //              values() is parallel to keys() so that the first entry in
+  //              keys() is the key associated with the first entry in values()
+  //              and so on. These lists will be the same length.
+  //              All of these pairs will be merges over the same user key.
+  //              See IMPORTANT 2 note below.
+  //   - The user should ignore key() and value().
+  //
+  //   IMPORTANT 2: The entries were traversed in order from BACK to FRONT.
+  //                So keys().back() was the first key seen by iterator.
+  // TODO: Re-style this comment to be like the first one
+  bool IsSuccess() { return success_; }
+  Slice key() { assert(success_); return Slice(keys_.back()); }
+  Slice value() { assert(success_); return Slice(operands_.back()); }
+  const std::deque<std::string>& keys() { assert(!success_); return keys_; }
+  const std::deque<std::string>& values() {
+    assert(!success_); return operands_;
+  }
+
+ private:
+  const Comparator* user_comparator_;
+  const MergeOperator* user_merge_operator_;
+  Logger* logger_;
+  unsigned min_partial_merge_operands_;
+  bool assert_valid_internal_key_; // enforce no internal key corruption?
+
+  // the scratch area that holds the result of MergeUntil
+  // valid up to the next MergeUntil call
+  std::deque<std::string> keys_;    // Keeps track of the sequence of keys seen
+  std::deque<std::string> operands_;  // Parallel with keys_; stores the values
+  bool success_;
+};
+
+} // namespace rocksdb
+
+#endif
--- a/db/merge_operator.cc
+++ b/db/merge_operator.cc
@@ -0,0 +1,77 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+/**
+ * Back-end implementation details specific to the Merge Operator.
+ */
+
+#include "rocksdb/merge_operator.h"
+
+namespace rocksdb {
+
+// The default implementation of PartialMergeMulti, which invokes
+// PartialMerge multiple times internally and merges two operands at
+// a time.
+bool MergeOperator::PartialMergeMulti(const Slice& key,
+                                      const std::deque<Slice>& operand_list,
+                                      std::string* new_value,
+                                      Logger* logger) const {
+  assert(operand_list.size() >= 2);
+  // Simply loop through the operands
+  std::string temp_value;
+  Slice temp_slice(operand_list[0]);
+
+  for (size_t i = 1; i < operand_list.size(); ++i) {
+    auto& operand = operand_list[i];
+    if (!PartialMerge(key, temp_slice, operand, &temp_value, logger)) {
+      return false;
+    }
+    swap(temp_value, *new_value);
+    temp_slice = Slice(*new_value);
+  }
+
+  // The result will be in *new_value. All merges succeeded.
+  return true;
+}
+
+// Given a "real" merge from the library, call the user's
+// associative merge function one-by-one on each of the operands.
+// NOTE: It is assumed that the client's merge-operator will handle any errors.
+bool AssociativeMergeOperator::FullMerge(
+    const Slice& key,
+    const Slice* existing_value,
+    const std::deque<std::string>& operand_list,
+    std::string* new_value,
+    Logger* logger) const {
+
+  // Simply loop through the operands
+  Slice temp_existing;
+  std::string temp_value;
+  for (const auto& operand : operand_list) {
+    Slice value(operand);
+    if (!Merge(key, existing_value, value, &temp_value, logger)) {
+      return false;
+    }
+    swap(temp_value, *new_value);
+    temp_existing = Slice(*new_value);
+    existing_value = &temp_existing;
+  }
+
+  // The result will be in *new_value. All merges succeeded.
+  return true;
+}
+
+// Call the user defined simple merge on the operands;
+// NOTE: It is assumed that the client's merge-operator will handle any errors.
+bool AssociativeMergeOperator::PartialMerge(
+    const Slice& key,
+    const Slice& left_operand,
+    const Slice& right_operand,
+    std::string* new_value,
+    Logger* logger) const {
+  return Merge(key, &left_operand, right_operand, new_value, logger);
+}
+
+} // namespace rocksdb
--- a/db/merge_test.cc
+++ b/db/merge_test.cc
@@ -0,0 +1,472 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include <assert.h>
+#include <memory>
+#include <iostream>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "db/dbformat.h"
+#include "db/db_impl.h"
+#include "db/write_batch_internal.h"
+#include "utilities/merge_operators.h"
+#include "util/testharness.h"
+#include "utilities/db_ttl.h"
+
+using namespace std;
+using namespace rocksdb;
+
+namespace {
+  int numMergeOperatorCalls;
+  void resetNumMergeOperatorCalls() {
+    numMergeOperatorCalls = 0;
+  }
+
+  int num_partial_merge_calls;
+  void resetNumPartialMergeCalls() {
+    num_partial_merge_calls = 0;
+  }
+}
+
+class CountMergeOperator : public AssociativeMergeOperator {
+ public:
+  CountMergeOperator() {
+    mergeOperator_ = MergeOperators::CreateUInt64AddOperator();
+  }
+
+  virtual bool Merge(const Slice& key,
+                     const Slice* existing_value,
+                     const Slice& value,
+                     std::string* new_value,
+                     Logger* logger) const override {
+    ++numMergeOperatorCalls;
+    if (existing_value == nullptr) {
+      new_value->assign(value.data(), value.size());
+      return true;
+    }
+
+    return mergeOperator_->PartialMerge(
+        key,
+        *existing_value,
+        value,
+        new_value,
+        logger);
+  }
+
+  virtual bool PartialMergeMulti(const Slice& key,
+                                 const std::deque<Slice>& operand_list,
+                                 std::string* new_value, Logger* logger) const {
+    ++num_partial_merge_calls;
+    return mergeOperator_->PartialMergeMulti(key, operand_list, new_value,
+                                             logger);
+  }
+
+  virtual const char* Name() const override {
+    return "UInt64AddOperator";
+  }
+
+ private:
+  std::shared_ptr<MergeOperator> mergeOperator_;
+};
+
+namespace {
+std::shared_ptr<DB> OpenDb(const string& dbname, const bool ttl = false,
+                           const size_t max_successive_merges = 0,
+                           const uint32_t min_partial_merge_operands = 2) {
+  DB* db;
+  Options options;
+  options.create_if_missing = true;
+  options.merge_operator = std::make_shared<CountMergeOperator>();
+  options.max_successive_merges = max_successive_merges;
+  options.min_partial_merge_operands = min_partial_merge_operands;
+  Status s;
+  DestroyDB(dbname, Options());
+  if (ttl) {
+    cout << "Opening database with TTL\n";
+    DBWithTTL* db_with_ttl;
+    s = DBWithTTL::Open(options, dbname, &db_with_ttl);
+    db = db_with_ttl;
+  } else {
+    s = DB::Open(options, dbname, &db);
+  }
+  if (!s.ok()) {
+    cerr << s.ToString() << endl;
+    assert(false);
+  }
+  return std::shared_ptr<DB>(db);
+}
+}  // namespace
+
+// Imagine we are maintaining a set of uint64 counters.
+// Each counter has a distinct name. And we would like
+// to support four high level operations:
+// set, add, get and remove
+// This is a quick implementation without a Merge operation.
+class Counters {
+
+ protected:
+  std::shared_ptr<DB> db_;
+
+  WriteOptions put_option_;
+  ReadOptions get_option_;
+  WriteOptions delete_option_;
+
+  uint64_t default_;
+
+ public:
+  explicit Counters(std::shared_ptr<DB> db, uint64_t defaultCount = 0)
+      : db_(db),
+        put_option_(),
+        get_option_(),
+        delete_option_(),
+        default_(defaultCount) {
+    assert(db_);
+  }
+
+  virtual ~Counters() {}
+
+  // public interface of Counters.
+  // All four functions return false
+  // if the underlying level db operation failed.
+
+  // mapped to a levedb Put
+  bool set(const string& key, uint64_t value) {
+    // just treat the internal rep of int64 as the string
+    Slice slice((char *)&value, sizeof(value));
+    auto s = db_->Put(put_option_, key, slice);
+
+    if (s.ok()) {
+      return true;
+    } else {
+      cerr << s.ToString() << endl;
+      return false;
+    }
+  }
+
+  // mapped to a rocksdb Delete
+  bool remove(const string& key) {
+    auto s = db_->Delete(delete_option_, key);
+
+    if (s.ok()) {
+      return true;
+    } else {
+      cerr << s.ToString() << std::endl;
+      return false;
+    }
+  }
+
+  // mapped to a rocksdb Get
+  bool get(const string& key, uint64_t *value) {
+    string str;
+    auto s = db_->Get(get_option_, key, &str);
+
+    if (s.IsNotFound()) {
+      // return default value if not found;
+      *value = default_;
+      return true;
+    } else if (s.ok()) {
+      // deserialization
+      if (str.size() != sizeof(uint64_t)) {
+        cerr << "value corruption\n";
+        return false;
+      }
+      *value = DecodeFixed64(&str[0]);
+      return true;
+    } else {
+      cerr << s.ToString() << std::endl;
+      return false;
+    }
+  }
+
+  // 'add' is implemented as get -> modify -> set
+  // An alternative is a single merge operation, see MergeBasedCounters
+  virtual bool add(const string& key, uint64_t value) {
+    uint64_t base = default_;
+    return get(key, &base) && set(key, base + value);
+  }
+
+
+  // convenience functions for testing
+  void assert_set(const string& key, uint64_t value) {
+    assert(set(key, value));
+  }
+
+  void assert_remove(const string& key) {
+    assert(remove(key));
+  }
+
+  uint64_t assert_get(const string& key) {
+    uint64_t value = default_;
+    int result = get(key, &value);
+    assert(result);
+    if (result == 0) exit(1); // Disable unused variable warning.
+    return value;
+  }
+
+  void assert_add(const string& key, uint64_t value) {
+    int result = add(key, value);
+    assert(result);
+    if (result == 0) exit(1); // Disable unused variable warning. 
+  }
+};
+
+// Implement 'add' directly with the new Merge operation
+class MergeBasedCounters : public Counters {
+ private:
+  WriteOptions merge_option_; // for merge
+
+ public:
+  explicit MergeBasedCounters(std::shared_ptr<DB> db, uint64_t defaultCount = 0)
+      : Counters(db, defaultCount),
+        merge_option_() {
+  }
+
+  // mapped to a rocksdb Merge operation
+  virtual bool add(const string& key, uint64_t value) override {
+    char encoded[sizeof(uint64_t)];
+    EncodeFixed64(encoded, value);
+    Slice slice(encoded, sizeof(uint64_t));
+    auto s = db_->Merge(merge_option_, key, slice);
+
+    if (s.ok()) {
+      return true;
+    } else {
+      cerr << s.ToString() << endl;
+      return false;
+    }
+  }
+};
+
+namespace {
+void dumpDb(DB* db) {
+  auto it = unique_ptr<Iterator>(db->NewIterator(ReadOptions()));
+  for (it->SeekToFirst(); it->Valid(); it->Next()) {
+    uint64_t value = DecodeFixed64(it->value().data());
+    cout << it->key().ToString() << ": "  << value << endl;
+  }
+  assert(it->status().ok());  // Check for any errors found during the scan
+}
+
+void testCounters(Counters& counters, DB* db, bool test_compaction) {
+
+  FlushOptions o;
+  o.wait = true;
+
+  counters.assert_set("a", 1);
+
+  if (test_compaction) db->Flush(o);
+
+  assert(counters.assert_get("a") == 1);
+
+  counters.assert_remove("b");
+
+  // defaut value is 0 if non-existent
+  assert(counters.assert_get("b") == 0);
+
+  counters.assert_add("a", 2);
+
+  if (test_compaction) db->Flush(o);
+
+  // 1+2 = 3
+  assert(counters.assert_get("a")== 3);
+
+  dumpDb(db);
+
+  std::cout << "1\n";
+
+  // 1+...+49 = ?
+  uint64_t sum = 0;
+  for (int i = 1; i < 50; i++) {
+    counters.assert_add("b", i);
+    sum += i;
+  }
+  assert(counters.assert_get("b") == sum);
+
+  std::cout << "2\n";
+  dumpDb(db);
+
+  std::cout << "3\n";
+
+  if (test_compaction) {
+    db->Flush(o);
+
+    cout << "Compaction started ...\n";
+    db->CompactRange(nullptr, nullptr);
+    cout << "Compaction ended\n";
+
+    dumpDb(db);
+
+    assert(counters.assert_get("a")== 3);
+    assert(counters.assert_get("b") == sum);
+  }
+}
+
+void testSuccessiveMerge(
+    Counters& counters, int max_num_merges, int num_merges) {
+
+  counters.assert_remove("z");
+  uint64_t sum = 0;
+
+  for (int i = 1; i <= num_merges; ++i) {
+    resetNumMergeOperatorCalls();
+    counters.assert_add("z", i);
+    sum += i;
+
+    if (i % (max_num_merges + 1) == 0) {
+      assert(numMergeOperatorCalls == max_num_merges + 1);
+    } else {
+      assert(numMergeOperatorCalls == 0);
+    }
+
+    resetNumMergeOperatorCalls();
+    assert(counters.assert_get("z") == sum);
+    assert(numMergeOperatorCalls == i % (max_num_merges + 1));
+  }
+}
+
+void testPartialMerge(Counters* counters, DB* db, int max_merge, int min_merge,
+                      int count) {
+  FlushOptions o;
+  o.wait = true;
+
+  // Test case 1: partial merge should be called when the number of merge
+  //              operands exceeds the threshold.
+  uint64_t tmp_sum = 0;
+  resetNumPartialMergeCalls();
+  for (int i = 1; i <= count; i++) {
+    counters->assert_add("b", i);
+    tmp_sum += i;
+  }
+  db->Flush(o);
+  db->CompactRange(nullptr, nullptr);
+  ASSERT_EQ(tmp_sum, counters->assert_get("b"));
+  if (count > max_merge) {
+    // in this case, FullMerge should be called instead.
+    ASSERT_EQ(num_partial_merge_calls, 0);
+  } else {
+    // if count >= min_merge, then partial merge should be called once.
+    ASSERT_EQ((count >= min_merge), (num_partial_merge_calls == 1));
+  }
+
+  // Test case 2: partial merge should not be called when a put is found.
+  resetNumPartialMergeCalls();
+  tmp_sum = 0;
+  db->Put(rocksdb::WriteOptions(), "c", "10");
+  for (int i = 1; i <= count; i++) {
+    counters->assert_add("c", i);
+    tmp_sum += i;
+  }
+  db->Flush(o);
+  db->CompactRange(nullptr, nullptr);
+  ASSERT_EQ(tmp_sum, counters->assert_get("c"));
+  ASSERT_EQ(num_partial_merge_calls, 0);
+}
+
+void testSingleBatchSuccessiveMerge(
+    DB* db,
+    int max_num_merges,
+    int num_merges) {
+  assert(num_merges > max_num_merges);
+
+  Slice key("BatchSuccessiveMerge");
+  uint64_t merge_value = 1;
+  Slice merge_value_slice((char *)&merge_value, sizeof(merge_value));
+
+  // Create the batch
+  WriteBatch batch;
+  for (int i = 0; i < num_merges; ++i) {
+    batch.Merge(key, merge_value_slice);
+  }
+
+  // Apply to memtable and count the number of merges
+  resetNumMergeOperatorCalls();
+  {
+    Status s = db->Write(WriteOptions(), &batch);
+    assert(s.ok());
+  }
+  assert(numMergeOperatorCalls ==
+      num_merges - (num_merges % (max_num_merges + 1)));
+
+  // Get the value
+  resetNumMergeOperatorCalls();
+  string get_value_str;
+  {
+    Status s = db->Get(ReadOptions(), key, &get_value_str);
+    assert(s.ok());
+  }
+  assert(get_value_str.size() == sizeof(uint64_t));
+  uint64_t get_value = DecodeFixed64(&get_value_str[0]);
+  ASSERT_EQ(get_value, num_merges * merge_value);
+  ASSERT_EQ(numMergeOperatorCalls, (num_merges % (max_num_merges + 1)));
+}
+
+void runTest(int argc, const string& dbname, const bool use_ttl = false) {
+  auto db = OpenDb(dbname, use_ttl);
+
+  {
+    cout << "Test read-modify-write counters... \n";
+    Counters counters(db, 0);
+    testCounters(counters, db.get(), true);
+  }
+
+  bool compact = false;
+  if (argc > 1) {
+    compact = true;
+    cout << "Turn on Compaction\n";
+  }
+
+  {
+    cout << "Test merge-based counters... \n";
+    MergeBasedCounters counters(db, 0);
+    testCounters(counters, db.get(), compact);
+  }
+
+  DestroyDB(dbname, Options());
+  db.reset();
+
+  {
+    cout << "Test merge in memtable... \n";
+    size_t max_merge = 5;
+    auto db = OpenDb(dbname, use_ttl, max_merge);
+    MergeBasedCounters counters(db, 0);
+    testCounters(counters, db.get(), compact);
+    testSuccessiveMerge(counters, max_merge, max_merge * 2);
+    testSingleBatchSuccessiveMerge(db.get(), 5, 7);
+    DestroyDB(dbname, Options());
+  }
+
+  {
+    cout << "Test Partial-Merge\n";
+    size_t max_merge = 100;
+    for (uint32_t min_merge = 5; min_merge < 25; min_merge += 5) {
+      for (uint32_t count = min_merge - 1; count <= min_merge + 1; count++) {
+        auto db = OpenDb(dbname, use_ttl, max_merge, min_merge);
+        MergeBasedCounters counters(db, 0);
+        testPartialMerge(&counters, db.get(), max_merge, min_merge, count);
+        DestroyDB(dbname, Options());
+      }
+      {
+        auto db = OpenDb(dbname, use_ttl, max_merge, min_merge);
+        MergeBasedCounters counters(db, 0);
+        testPartialMerge(&counters, db.get(), max_merge, min_merge,
+                         min_merge * 10);
+        DestroyDB(dbname, Options());
+      }
+    }
+  }
+}
+}  // namespace
+
+int main(int argc, char *argv[]) {
+  //TODO: Make this test like a general rocksdb unit-test
+  runTest(argc, test::TmpDir() + "/merge_testdb");
+  runTest(argc, test::TmpDir() + "/merge_testdbttl", true); // Run test on TTL database
+  printf("Passed all tests!\n");
+  return 0;
+}
--- a/db/perf_context_test.cc
+++ b/db/perf_context_test.cc
@@ -0,0 +1,358 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include <algorithm>
+#include <iostream>
+#include <vector>
+#include "/usr/include/valgrind/callgrind.h"
+
+#include "rocksdb/db.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/memtablerep.h"
+#include "util/histogram.h"
+#include "util/stop_watch.h"
+#include "util/testharness.h"
+
+
+bool FLAGS_random_key = false;
+bool FLAGS_use_set_based_memetable = false;
+int FLAGS_total_keys = 100;
+int FLAGS_write_buffer_size = 1000000000;
+int FLAGS_max_write_buffer_number = 8;
+int FLAGS_min_write_buffer_number_to_merge = 7;
+
+// Path to the database on file system
+const std::string kDbName = rocksdb::test::TmpDir() + "/perf_context_test";
+
+namespace rocksdb {
+
+std::shared_ptr<DB> OpenDb() {
+    DB* db;
+    Options options;
+    options.create_if_missing = true;
+    options.write_buffer_size = FLAGS_write_buffer_size;
+    options.max_write_buffer_number = FLAGS_max_write_buffer_number;
+    options.min_write_buffer_number_to_merge =
+      FLAGS_min_write_buffer_number_to_merge;
+
+    if (FLAGS_use_set_based_memetable) {
+      auto prefix_extractor = rocksdb::NewFixedPrefixTransform(0);
+      options.memtable_factory.reset(
+          NewHashSkipListRepFactory(prefix_extractor));
+    }
+
+    Status s = DB::Open(options, kDbName,  &db);
+    ASSERT_OK(s);
+    return std::shared_ptr<DB>(db);
+}
+
+class PerfContextTest { };
+
+TEST(PerfContextTest, SeekIntoDeletion) {
+  DestroyDB(kDbName, Options());
+  auto db = OpenDb();
+  WriteOptions write_options;
+  ReadOptions read_options;
+
+  for (int i = 0; i < FLAGS_total_keys; ++i) {
+    std::string key = "k" + std::to_string(i);
+    std::string value = "v" + std::to_string(i);
+
+    db->Put(write_options, key, value);
+  }
+
+  for (int i = 0; i < FLAGS_total_keys -1 ; ++i) {
+    std::string key = "k" + std::to_string(i);
+    db->Delete(write_options, key);
+  }
+
+  HistogramImpl hist_get;
+  HistogramImpl hist_get_time;
+  for (int i = 0; i < FLAGS_total_keys - 1; ++i) {
+    std::string key = "k" + std::to_string(i);
+    std::string value;
+
+    perf_context.Reset();
+    StopWatchNano timer(Env::Default(), true);
+    auto status = db->Get(read_options, key, &value);
+    auto elapsed_nanos = timer.ElapsedNanos();
+    ASSERT_TRUE(status.IsNotFound());
+    hist_get.Add(perf_context.user_key_comparison_count);
+    hist_get_time.Add(elapsed_nanos);
+  }
+
+  std::cout << "Get uesr key comparison: \n" << hist_get.ToString()
+            << "Get time: \n" << hist_get_time.ToString();
+
+  HistogramImpl hist_seek_to_first;
+  std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+
+  perf_context.Reset();
+  StopWatchNano timer(Env::Default(), true);
+  iter->SeekToFirst();
+  hist_seek_to_first.Add(perf_context.user_key_comparison_count);
+  auto elapsed_nanos = timer.ElapsedNanos();
+
+  std::cout << "SeekToFirst uesr key comparison: \n" << hist_seek_to_first.ToString()
+            << "ikey skipped: " << perf_context.internal_key_skipped_count << "\n"
+            << "idelete skipped: " << perf_context.internal_delete_skipped_count << "\n"
+            << "elapsed: " << elapsed_nanos << "\n";
+
+  HistogramImpl hist_seek;
+  for (int i = 0; i < FLAGS_total_keys; ++i) {
+    std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+    std::string key = "k" + std::to_string(i);
+
+    perf_context.Reset();
+    StopWatchNano timer(Env::Default(), true);
+    iter->Seek(key);
+    auto elapsed_nanos = timer.ElapsedNanos();
+    hist_seek.Add(perf_context.user_key_comparison_count);
+    std::cout << "seek cmp: " << perf_context.user_key_comparison_count
+              << " ikey skipped " << perf_context.internal_key_skipped_count
+              << " idelete skipped " << perf_context.internal_delete_skipped_count
+              << " elapsed: " << elapsed_nanos << "ns\n";
+
+    perf_context.Reset();
+    ASSERT_TRUE(iter->Valid());
+    StopWatchNano timer2(Env::Default(), true);
+    iter->Next();
+    auto elapsed_nanos2 = timer2.ElapsedNanos();
+    std::cout << "next cmp: " << perf_context.user_key_comparison_count
+              << "elapsed: " << elapsed_nanos2 << "ns\n";
+  }
+
+  std::cout << "Seek uesr key comparison: \n" << hist_seek.ToString();
+}
+
+TEST(PerfContextTest, StopWatchNanoOverhead) {
+  // profile the timer cost by itself!
+  const int kTotalIterations = 1000000;
+  std::vector<uint64_t> timings(kTotalIterations);
+
+  StopWatchNano timer(Env::Default(), true);
+  for (auto& timing : timings) {
+    timing = timer.ElapsedNanos(true /* reset */);
+  }
+
+  HistogramImpl histogram;
+  for (const auto timing : timings) {
+    histogram.Add(timing);
+  }
+
+  std::cout << histogram.ToString();
+}
+
+TEST(PerfContextTest, StopWatchOverhead) {
+  // profile the timer cost by itself!
+  const int kTotalIterations = 1000000;
+  std::vector<uint64_t> timings(kTotalIterations);
+
+  StopWatch timer(Env::Default());
+  for (auto& timing : timings) {
+    timing = timer.ElapsedMicros();
+  }
+
+  HistogramImpl histogram;
+  uint64_t prev_timing = 0;
+  for (const auto timing : timings) {
+    histogram.Add(timing - prev_timing);
+    prev_timing = timing;
+  }
+
+  std::cout << histogram.ToString();
+}
+
+void ProfileKeyComparison() {
+  DestroyDB(kDbName, Options());    // Start this test with a fresh DB
+
+  auto db = OpenDb();
+
+  WriteOptions write_options;
+  ReadOptions read_options;
+
+  HistogramImpl hist_put;
+  HistogramImpl hist_get;
+  HistogramImpl hist_get_snapshot;
+  HistogramImpl hist_get_memtable;
+  HistogramImpl hist_get_post_process;
+  HistogramImpl hist_num_memtable_checked;
+  HistogramImpl hist_write_pre_post;
+  HistogramImpl hist_write_wal_time;
+  HistogramImpl hist_write_memtable_time;
+
+  std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n";
+
+  std::vector<int> keys;
+  for (int i = 0; i < FLAGS_total_keys; ++i) {
+    keys.push_back(i);
+  }
+
+  if (FLAGS_random_key) {
+    std::random_shuffle(keys.begin(), keys.end());
+  }
+
+  for (const int i : keys) {
+    std::string key = "k" + std::to_string(i);
+    std::string value = "v" + std::to_string(i);
+
+    perf_context.Reset();
+    db->Put(write_options, key, value);
+    hist_write_pre_post.Add(perf_context.write_pre_and_post_process_time);
+    hist_write_wal_time.Add(perf_context.write_wal_time);
+    hist_write_memtable_time.Add(perf_context.write_memtable_time);
+    hist_put.Add(perf_context.user_key_comparison_count);
+
+    perf_context.Reset();
+    db->Get(read_options, key, &value);
+    hist_get_snapshot.Add(perf_context.get_snapshot_time);
+    hist_get_memtable.Add(perf_context.get_from_memtable_time);
+    hist_num_memtable_checked.Add(perf_context.get_from_memtable_count);
+    hist_get_post_process.Add(perf_context.get_post_process_time);
+    hist_get.Add(perf_context.user_key_comparison_count);
+  }
+
+  std::cout << "Put uesr key comparison: \n" << hist_put.ToString()
+            << "Get uesr key comparison: \n" << hist_get.ToString();
+  std::cout << "Put(): Pre and Post Process Time: \n"
+            << hist_write_pre_post.ToString()
+            << " Writing WAL time: \n"
+            << hist_write_wal_time.ToString() << "\n"
+            << " Writing Mem Table time: \n"
+            << hist_write_memtable_time.ToString() << "\n";
+
+  std::cout << "Get(): Time to get snapshot: \n"
+            << hist_get_snapshot.ToString()
+            << " Time to get value from memtables: \n"
+            << hist_get_memtable.ToString() << "\n"
+            << " Number of memtables checked: \n"
+            << hist_num_memtable_checked.ToString() << "\n"
+            << " Time to post process: \n"
+            << hist_get_post_process.ToString() << "\n";
+}
+
+TEST(PerfContextTest, KeyComparisonCount) {
+  SetPerfLevel(kEnableCount);
+  ProfileKeyComparison();
+
+  SetPerfLevel(kDisable);
+  ProfileKeyComparison();
+
+  SetPerfLevel(kEnableTime);
+  ProfileKeyComparison();
+}
+
+// make perf_context_test
+// export ROCKSDB_TESTS=PerfContextTest.SeekKeyComparison
+// For one memtable:
+// ./perf_context_test --write_buffer_size=500000 --total_keys=10000
+// For two memtables:
+// ./perf_context_test --write_buffer_size=250000 --total_keys=10000
+// Specify --random_key=1 to shuffle the key before insertion
+// Results show that, for sequential insertion, worst-case Seek Key comparison
+// is close to the total number of keys (linear), when there is only one
+// memtable. When there are two memtables, even the avg Seek Key comparison
+// starts to become linear to the input size.
+
+TEST(PerfContextTest, SeekKeyComparison) {
+  DestroyDB(kDbName, Options());
+  auto db = OpenDb();
+  WriteOptions write_options;
+  ReadOptions read_options;
+
+  std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n";
+
+  std::vector<int> keys;
+  for (int i = 0; i < FLAGS_total_keys; ++i) {
+    keys.push_back(i);
+  }
+
+  if (FLAGS_random_key) {
+    std::random_shuffle(keys.begin(), keys.end());
+  }
+
+  HistogramImpl hist_put_time;
+  HistogramImpl hist_wal_time;
+  HistogramImpl hist_time_diff;
+
+  SetPerfLevel(kEnableTime);
+  StopWatchNano timer(Env::Default());
+  for (const int i : keys) {
+    std::string key = "k" + std::to_string(i);
+    std::string value = "v" + std::to_string(i);
+
+    perf_context.Reset();
+    timer.Start();
+    db->Put(write_options, key, value);
+    auto put_time = timer.ElapsedNanos();
+    hist_put_time.Add(put_time);
+    hist_wal_time.Add(perf_context.write_wal_time);
+    hist_time_diff.Add(put_time - perf_context.write_wal_time);
+  }
+
+  std::cout << "Put time:\n" << hist_put_time.ToString()
+            << "WAL time:\n" << hist_wal_time.ToString()
+            << "time diff:\n" << hist_time_diff.ToString();
+
+  HistogramImpl hist_seek;
+  HistogramImpl hist_next;
+
+  for (int i = 0; i < FLAGS_total_keys; ++i) {
+    std::string key = "k" + std::to_string(i);
+    std::string value = "v" + std::to_string(i);
+
+    std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+    perf_context.Reset();
+    iter->Seek(key);
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->value().ToString(), value);
+    hist_seek.Add(perf_context.user_key_comparison_count);
+  }
+
+  std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+  for (iter->SeekToFirst(); iter->Valid();) {
+    perf_context.Reset();
+    iter->Next();
+    hist_next.Add(perf_context.user_key_comparison_count);
+  }
+
+  std::cout << "Seek:\n" << hist_seek.ToString()
+            << "Next:\n" << hist_next.ToString();
+}
+
+}
+
+int main(int argc, char** argv) {
+
+  for (int i = 1; i < argc; i++) {
+    int n;
+    char junk;
+
+    if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) {
+      FLAGS_write_buffer_size = n;
+    }
+
+    if (sscanf(argv[i], "--total_keys=%d%c", &n, &junk) == 1) {
+      FLAGS_total_keys = n;
+    }
+
+    if (sscanf(argv[i], "--random_key=%d%c", &n, &junk) == 1 &&
+        (n == 0 || n == 1)) {
+      FLAGS_random_key = n;
+    }
+
+    if (sscanf(argv[i], "--use_set_based_memetable=%d%c", &n, &junk) == 1 &&
+        (n == 0 || n == 1)) {
+      FLAGS_use_set_based_memetable = n;
+    }
+
+  }
+
+  std::cout << kDbName << "\n";
+
+  rocksdb::test::RunAllTests();
+  return 0;
+}
--- a/db/plain_table_db_test.cc
+++ b/db/plain_table_db_test.cc
@@ -0,0 +1,853 @@
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include <algorithm>
+#include <set>
+
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "table/meta_blocks.h"
+#include "table/plain_table_factory.h"
+#include "table/plain_table_reader.h"
+#include "util/hash.h"
+#include "util/logging.h"
+#include "util/mutexlock.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+#include "utilities/merge_operators.h"
+
+using std::unique_ptr;
+
+namespace rocksdb {
+
+class PlainTableDBTest {
+ protected:
+ private:
+  std::string dbname_;
+  Env* env_;
+  DB* db_;
+
+  Options last_options_;
+
+ public:
+  PlainTableDBTest() : env_(Env::Default()) {
+    dbname_ = test::TmpDir() + "/plain_table_db_test";
+    ASSERT_OK(DestroyDB(dbname_, Options()));
+    db_ = nullptr;
+    Reopen();
+  }
+
+  ~PlainTableDBTest() {
+    delete db_;
+    ASSERT_OK(DestroyDB(dbname_, Options()));
+  }
+
+  // Return the current option configuration.
+  Options CurrentOptions() {
+    Options options;
+    options.table_factory.reset(NewPlainTableFactory(16, 2, 0.8, 3));
+    options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+    options.allow_mmap_reads = true;
+    return options;
+  }
+
+  DBImpl* dbfull() {
+    return reinterpret_cast<DBImpl*>(db_);
+  }
+
+  void Reopen(Options* options = nullptr) {
+    ASSERT_OK(TryReopen(options));
+  }
+
+  void Close() {
+    delete db_;
+    db_ = nullptr;
+  }
+
+  void DestroyAndReopen(Options* options = nullptr) {
+    //Destroy using last options
+    Destroy(&last_options_);
+    ASSERT_OK(TryReopen(options));
+  }
+
+  void Destroy(Options* options) {
+    delete db_;
+    db_ = nullptr;
+    ASSERT_OK(DestroyDB(dbname_, *options));
+  }
+
+  Status PureReopen(Options* options, DB** db) {
+    return DB::Open(*options, dbname_, db);
+  }
+
+  Status TryReopen(Options* options = nullptr) {
+    delete db_;
+    db_ = nullptr;
+    Options opts;
+    if (options != nullptr) {
+      opts = *options;
+    } else {
+      opts = CurrentOptions();
+      opts.create_if_missing = true;
+    }
+    last_options_ = opts;
+
+    return DB::Open(opts, dbname_, &db_);
+  }
+
+  Status Put(const Slice& k, const Slice& v) {
+    return db_->Put(WriteOptions(), k, v);
+  }
+
+  Status Delete(const std::string& k) {
+    return db_->Delete(WriteOptions(), k);
+  }
+
+  std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
+    ReadOptions options;
+    options.snapshot = snapshot;
+    std::string result;
+    Status s = db_->Get(options, k, &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+
+
+  int NumTableFilesAtLevel(int level) {
+    std::string property;
+    ASSERT_TRUE(
+        db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level),
+                         &property));
+    return atoi(property.c_str());
+  }
+
+  // Return spread of files per level
+  std::string FilesPerLevel() {
+    std::string result;
+    int last_non_zero_offset = 0;
+    for (int level = 0; level < db_->NumberLevels(); level++) {
+      int f = NumTableFilesAtLevel(level);
+      char buf[100];
+      snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+      result += buf;
+      if (f > 0) {
+        last_non_zero_offset = result.size();
+      }
+    }
+    result.resize(last_non_zero_offset);
+    return result;
+  }
+
+  std::string IterStatus(Iterator* iter) {
+    std::string result;
+    if (iter->Valid()) {
+      result = iter->key().ToString() + "->" + iter->value().ToString();
+    } else {
+      result = "(invalid)";
+    }
+    return result;
+  }
+};
+
+TEST(PlainTableDBTest, Empty) {
+  ASSERT_TRUE(dbfull() != nullptr);
+  ASSERT_EQ("NOT_FOUND", Get("0000000000000foo"));
+}
+
+class TestPlainTableReader : public PlainTableReader {
+ public:
+  TestPlainTableReader(const EnvOptions& storage_options,
+                       const InternalKeyComparator& icomparator,
+                       uint64_t file_size, int bloom_bits_per_key,
+                       double hash_table_ratio, size_t index_sparseness,
+                       const TableProperties* table_properties,
+                       unique_ptr<RandomAccessFile>&& file,
+                       const Options& options, bool* expect_bloom_not_match)
+      : PlainTableReader(options, std::move(file), storage_options, icomparator,
+                         file_size, bloom_bits_per_key, hash_table_ratio,
+                         index_sparseness, table_properties, 2 * 1024 * 1024),
+        expect_bloom_not_match_(expect_bloom_not_match) {
+    Status s = PopulateIndex(const_cast<TableProperties*>(table_properties));
+    ASSERT_TRUE(s.ok());
+  }
+
+  virtual ~TestPlainTableReader() {}
+
+ private:
+  virtual bool MatchBloom(uint32_t hash) const override {
+    bool ret = PlainTableReader::MatchBloom(hash);
+    ASSERT_TRUE(!*expect_bloom_not_match_ || !ret);
+    return ret;
+  }
+  bool* expect_bloom_not_match_;
+};
+
+extern const uint64_t kPlainTableMagicNumber;
+class TestPlainTableFactory : public PlainTableFactory {
+ public:
+  explicit TestPlainTableFactory(bool* expect_bloom_not_match,
+                                 uint32_t user_key_len, int bloom_bits_per_key,
+                                 double hash_table_ratio,
+                                 size_t index_sparseness,
+                                 size_t huge_page_tlb_size)
+      : PlainTableFactory(user_key_len, user_key_len, hash_table_ratio,
+                          index_sparseness, huge_page_tlb_size),
+        bloom_bits_per_key_(bloom_bits_per_key),
+        hash_table_ratio_(hash_table_ratio),
+        index_sparseness_(index_sparseness),
+        expect_bloom_not_match_(expect_bloom_not_match) {}
+
+  Status NewTableReader(const Options& options, const EnvOptions& soptions,
+                        const InternalKeyComparator& internal_comparator,
+                        unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
+                        unique_ptr<TableReader>* table) const override {
+    TableProperties* props = nullptr;
+    auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber,
+                                 options.env, options.info_log.get(), &props);
+    ASSERT_TRUE(s.ok());
+
+    std::unique_ptr<PlainTableReader> new_reader(new TestPlainTableReader(
+        soptions, internal_comparator, file_size, bloom_bits_per_key_,
+        hash_table_ratio_, index_sparseness_, props, std::move(file), options,
+        expect_bloom_not_match_));
+
+    *table = std::move(new_reader);
+    return s;
+  }
+
+ private:
+  int bloom_bits_per_key_;
+  double hash_table_ratio_;
+  size_t index_sparseness_;
+  bool* expect_bloom_not_match_;
+};
+
+TEST(PlainTableDBTest, Flush) {
+  for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+       huge_page_tlb_size += 2 * 1024 * 1024) {
+    for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
+      for (int total_order = 0; total_order <= 1; total_order++) {
+        Options options = CurrentOptions();
+        options.create_if_missing = true;
+        // Set only one bucket to force bucket conflict.
+        // Test index interval for the same prefix to be 1, 2 and 4
+        if (total_order) {
+          options.table_factory.reset(NewTotalOrderPlainTableFactory(
+              16, bloom_bits, 2, huge_page_tlb_size));
+        } else {
+          options.table_factory.reset(NewPlainTableFactory(
+              16, bloom_bits, 0.75, 16, huge_page_tlb_size));
+        }
+        DestroyAndReopen(&options);
+
+        ASSERT_OK(Put("1000000000000foo", "v1"));
+        ASSERT_OK(Put("0000000000000bar", "v2"));
+        ASSERT_OK(Put("1000000000000foo", "v3"));
+        dbfull()->TEST_FlushMemTable();
+
+        TablePropertiesCollection ptc;
+        reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc);
+        ASSERT_EQ(1U, ptc.size());
+        auto row = ptc.begin();
+        auto tp = row->second;
+        ASSERT_EQ(total_order ? "4" : "12", (tp->user_collected_properties).at(
+                                                "plain_table_hash_table_size"));
+        ASSERT_EQ(total_order ? "9" : "0", (tp->user_collected_properties).at(
+                                               "plain_table_sub_index_size"));
+
+        ASSERT_EQ("v3", Get("1000000000000foo"));
+        ASSERT_EQ("v2", Get("0000000000000bar"));
+      }
+    }
+  }
+}
+
+TEST(PlainTableDBTest, Flush2) {
+  for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+       huge_page_tlb_size += 2 * 1024 * 1024) {
+    for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
+      for (int total_order = 0; total_order <= 1; total_order++) {
+        bool expect_bloom_not_match = false;
+        Options options = CurrentOptions();
+        options.create_if_missing = true;
+        // Set only one bucket to force bucket conflict.
+        // Test index interval for the same prefix to be 1, 2 and 4
+        if (total_order) {
+          options.prefix_extractor = nullptr;
+          options.table_factory.reset(
+              new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits,
+                                        0, 2, huge_page_tlb_size));
+        } else {
+          options.table_factory.reset(
+              new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits,
+                                        0.75, 16, huge_page_tlb_size));
+        }
+        DestroyAndReopen(&options);
+        ASSERT_OK(Put("0000000000000bar", "b"));
+        ASSERT_OK(Put("1000000000000foo", "v1"));
+        dbfull()->TEST_FlushMemTable();
+
+        ASSERT_OK(Put("1000000000000foo", "v2"));
+        dbfull()->TEST_FlushMemTable();
+        ASSERT_EQ("v2", Get("1000000000000foo"));
+
+        ASSERT_OK(Put("0000000000000eee", "v3"));
+        dbfull()->TEST_FlushMemTable();
+        ASSERT_EQ("v3", Get("0000000000000eee"));
+
+        ASSERT_OK(Delete("0000000000000bar"));
+        dbfull()->TEST_FlushMemTable();
+        ASSERT_EQ("NOT_FOUND", Get("0000000000000bar"));
+
+        ASSERT_OK(Put("0000000000000eee", "v5"));
+        ASSERT_OK(Put("9000000000000eee", "v5"));
+        dbfull()->TEST_FlushMemTable();
+        ASSERT_EQ("v5", Get("0000000000000eee"));
+
+        // Test Bloom Filter
+        if (bloom_bits > 0) {
+          // Neither key nor value should exist.
+          expect_bloom_not_match = true;
+          ASSERT_EQ("NOT_FOUND", Get("5_not00000000bar"));
+
+          // Key doesn't exist any more but prefix exists.
+          if (total_order) {
+            ASSERT_EQ("NOT_FOUND", Get("1000000000000not"));
+            ASSERT_EQ("NOT_FOUND", Get("0000000000000not"));
+          }
+          expect_bloom_not_match = false;
+        }
+      }
+    }
+  }
+}
+
+TEST(PlainTableDBTest, Iterator) {
+  for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+       huge_page_tlb_size += 2 * 1024 * 1024) {
+    for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
+      for (int total_order = 0; total_order <= 1; total_order++) {
+        bool expect_bloom_not_match = false;
+        Options options = CurrentOptions();
+        options.create_if_missing = true;
+        // Set only one bucket to force bucket conflict.
+        // Test index interval for the same prefix to be 1, 2 and 4
+        if (total_order) {
+          options.prefix_extractor = nullptr;
+          options.table_factory.reset(
+              new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits,
+                                        0, 2, huge_page_tlb_size));
+        } else {
+          options.table_factory.reset(
+              new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits,
+                                        0.75, 16, huge_page_tlb_size));
+        }
+        DestroyAndReopen(&options);
+
+        ASSERT_OK(Put("1000000000foo002", "v_2"));
+        ASSERT_OK(Put("0000000000000bar", "random"));
+        ASSERT_OK(Put("1000000000foo001", "v1"));
+        ASSERT_OK(Put("3000000000000bar", "bar_v"));
+        ASSERT_OK(Put("1000000000foo003", "v__3"));
+        ASSERT_OK(Put("1000000000foo004", "v__4"));
+        ASSERT_OK(Put("1000000000foo005", "v__5"));
+        ASSERT_OK(Put("1000000000foo007", "v__7"));
+        ASSERT_OK(Put("1000000000foo008", "v__8"));
+        dbfull()->TEST_FlushMemTable();
+        ASSERT_EQ("v1", Get("1000000000foo001"));
+        ASSERT_EQ("v__3", Get("1000000000foo003"));
+        Iterator* iter = dbfull()->NewIterator(ReadOptions());
+        iter->Seek("1000000000foo000");
+        ASSERT_TRUE(iter->Valid());
+        ASSERT_EQ("1000000000foo001", iter->key().ToString());
+        ASSERT_EQ("v1", iter->value().ToString());
+
+        iter->Next();
+        ASSERT_TRUE(iter->Valid());
+        ASSERT_EQ("1000000000foo002", iter->key().ToString());
+        ASSERT_EQ("v_2", iter->value().ToString());
+
+        iter->Next();
+        ASSERT_TRUE(iter->Valid());
+        ASSERT_EQ("1000000000foo003", iter->key().ToString());
+        ASSERT_EQ("v__3", iter->value().ToString());
+
+        iter->Next();
+        ASSERT_TRUE(iter->Valid());
+        ASSERT_EQ("1000000000foo004", iter->key().ToString());
+        ASSERT_EQ("v__4", iter->value().ToString());
+
+        iter->Seek("3000000000000bar");
+        ASSERT_TRUE(iter->Valid());
+        ASSERT_EQ("3000000000000bar", iter->key().ToString());
+        ASSERT_EQ("bar_v", iter->value().ToString());
+
+        iter->Seek("1000000000foo000");
+        ASSERT_TRUE(iter->Valid());
+        ASSERT_EQ("1000000000foo001", iter->key().ToString());
+        ASSERT_EQ("v1", iter->value().ToString());
+
+        iter->Seek("1000000000foo005");
+        ASSERT_TRUE(iter->Valid());
+        ASSERT_EQ("1000000000foo005", iter->key().ToString());
+        ASSERT_EQ("v__5", iter->value().ToString());
+
+        iter->Seek("1000000000foo006");
+        ASSERT_TRUE(iter->Valid());
+        ASSERT_EQ("1000000000foo007", iter->key().ToString());
+        ASSERT_EQ("v__7", iter->value().ToString());
+
+        iter->Seek("1000000000foo008");
+        ASSERT_TRUE(iter->Valid());
+        ASSERT_EQ("1000000000foo008", iter->key().ToString());
+        ASSERT_EQ("v__8", iter->value().ToString());
+
+        if (total_order == 0) {
+          iter->Seek("1000000000foo009");
+          ASSERT_TRUE(iter->Valid());
+          ASSERT_EQ("3000000000000bar", iter->key().ToString());
+        }
+
+        // Test Bloom Filter
+        if (bloom_bits > 0) {
+          if (!total_order) {
+            // Neither key nor value should exist.
+            expect_bloom_not_match = true;
+            iter->Seek("2not000000000bar");
+            ASSERT_TRUE(!iter->Valid());
+            ASSERT_EQ("NOT_FOUND", Get("2not000000000bar"));
+            expect_bloom_not_match = false;
+          } else {
+            expect_bloom_not_match = true;
+            ASSERT_EQ("NOT_FOUND", Get("2not000000000bar"));
+            expect_bloom_not_match = false;
+          }
+        }
+
+        delete iter;
+      }
+    }
+  }
+}
+
+namespace {
+std::string MakeLongKey(size_t length, char c) {
+  return std::string(length, c);
+}
+}  // namespace
+
+TEST(PlainTableDBTest, IteratorLargeKeys) {
+  Options options = CurrentOptions();
+  options.table_factory.reset(NewTotalOrderPlainTableFactory(0, 0, 16));
+  options.create_if_missing = true;
+  options.prefix_extractor.reset();
+  DestroyAndReopen(&options);
+
+  std::string key_list[] = {
+      MakeLongKey(30, '0'),
+      MakeLongKey(16, '1'),
+      MakeLongKey(32, '2'),
+      MakeLongKey(60, '3'),
+      MakeLongKey(90, '4'),
+      MakeLongKey(50, '5'),
+      MakeLongKey(26, '6')
+  };
+
+  for (size_t i = 0; i < 7; i++) {
+    ASSERT_OK(Put(key_list[i], std::to_string(i)));
+  }
+
+  dbfull()->TEST_FlushMemTable();
+
+  Iterator* iter = dbfull()->NewIterator(ReadOptions());
+  iter->Seek(key_list[0]);
+
+  for (size_t i = 0; i < 7; i++) {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(key_list[i], iter->key().ToString());
+    ASSERT_EQ(std::to_string(i), iter->value().ToString());
+    iter->Next();
+  }
+
+  ASSERT_TRUE(!iter->Valid());
+
+  delete iter;
+}
+
+// A test comparator which compare two strings in this way:
+// (1) first compare prefix of 8 bytes in alphabet order,
+// (2) if two strings share the same prefix, sort the other part of the string
+//     in the reverse alphabet order.
+class SimpleSuffixReverseComparator : public Comparator {
+ public:
+  SimpleSuffixReverseComparator() {}
+
+  virtual const char* Name() const { return "SimpleSuffixReverseComparator"; }
+
+  virtual int Compare(const Slice& a, const Slice& b) const {
+    Slice prefix_a = Slice(a.data(), 8);
+    Slice prefix_b = Slice(b.data(), 8);
+    int prefix_comp = prefix_a.compare(prefix_b);
+    if (prefix_comp != 0) {
+      return prefix_comp;
+    } else {
+      Slice suffix_a = Slice(a.data() + 8, a.size() - 8);
+      Slice suffix_b = Slice(b.data() + 8, b.size() - 8);
+      return -(suffix_a.compare(suffix_b));
+    }
+  }
+  virtual void FindShortestSeparator(std::string* start,
+                                     const Slice& limit) const {}
+
+  virtual void FindShortSuccessor(std::string* key) const {}
+};
+
+TEST(PlainTableDBTest, IteratorReverseSuffixComparator) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  // Set only one bucket to force bucket conflict.
+  // Test index interval for the same prefix to be 1, 2 and 4
+  SimpleSuffixReverseComparator comp;
+  options.comparator = &comp;
+  DestroyAndReopen(&options);
+
+  ASSERT_OK(Put("1000000000foo002", "v_2"));
+  ASSERT_OK(Put("0000000000000bar", "random"));
+  ASSERT_OK(Put("1000000000foo001", "v1"));
+  ASSERT_OK(Put("3000000000000bar", "bar_v"));
+  ASSERT_OK(Put("1000000000foo003", "v__3"));
+  ASSERT_OK(Put("1000000000foo004", "v__4"));
+  ASSERT_OK(Put("1000000000foo005", "v__5"));
+  ASSERT_OK(Put("1000000000foo007", "v__7"));
+  ASSERT_OK(Put("1000000000foo008", "v__8"));
+  dbfull()->TEST_FlushMemTable();
+  ASSERT_EQ("v1", Get("1000000000foo001"));
+  ASSERT_EQ("v__3", Get("1000000000foo003"));
+  Iterator* iter = dbfull()->NewIterator(ReadOptions());
+  iter->Seek("1000000000foo009");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("1000000000foo008", iter->key().ToString());
+  ASSERT_EQ("v__8", iter->value().ToString());
+
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("1000000000foo007", iter->key().ToString());
+  ASSERT_EQ("v__7", iter->value().ToString());
+
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("1000000000foo005", iter->key().ToString());
+  ASSERT_EQ("v__5", iter->value().ToString());
+
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("1000000000foo004", iter->key().ToString());
+  ASSERT_EQ("v__4", iter->value().ToString());
+
+  iter->Seek("3000000000000bar");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("3000000000000bar", iter->key().ToString());
+  ASSERT_EQ("bar_v", iter->value().ToString());
+
+  iter->Seek("1000000000foo005");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("1000000000foo005", iter->key().ToString());
+  ASSERT_EQ("v__5", iter->value().ToString());
+
+  iter->Seek("1000000000foo006");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("1000000000foo005", iter->key().ToString());
+  ASSERT_EQ("v__5", iter->value().ToString());
+
+  iter->Seek("1000000000foo008");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("1000000000foo008", iter->key().ToString());
+  ASSERT_EQ("v__8", iter->value().ToString());
+
+  iter->Seek("1000000000foo000");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("3000000000000bar", iter->key().ToString());
+
+  delete iter;
+}
+
+TEST(PlainTableDBTest, HashBucketConflict) {
+  for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+       huge_page_tlb_size += 2 * 1024 * 1024) {
+    for (unsigned char i = 1; i <= 3; i++) {
+      Options options = CurrentOptions();
+      options.create_if_missing = true;
+      // Set only one bucket to force bucket conflict.
+      // Test index interval for the same prefix to be 1, 2 and 4
+      options.table_factory.reset(
+          NewTotalOrderPlainTableFactory(16, 0, 2 ^ i, huge_page_tlb_size));
+      DestroyAndReopen(&options);
+      ASSERT_OK(Put("5000000000000fo0", "v1"));
+      ASSERT_OK(Put("5000000000000fo1", "v2"));
+      ASSERT_OK(Put("5000000000000fo2", "v"));
+      ASSERT_OK(Put("2000000000000fo0", "v3"));
+      ASSERT_OK(Put("2000000000000fo1", "v4"));
+      ASSERT_OK(Put("2000000000000fo2", "v"));
+      ASSERT_OK(Put("2000000000000fo3", "v"));
+
+      dbfull()->TEST_FlushMemTable();
+
+      ASSERT_EQ("v1", Get("5000000000000fo0"));
+      ASSERT_EQ("v2", Get("5000000000000fo1"));
+      ASSERT_EQ("v3", Get("2000000000000fo0"));
+      ASSERT_EQ("v4", Get("2000000000000fo1"));
+
+      ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
+      ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
+      ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
+      ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));
+
+      ReadOptions ro;
+      Iterator* iter = dbfull()->NewIterator(ro);
+
+      iter->Seek("5000000000000fo0");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+
+      iter->Seek("5000000000000fo1");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+
+      iter->Seek("2000000000000fo0");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo0", iter->key().ToString());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+
+      iter->Seek("2000000000000fo1");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+
+      iter->Seek("2000000000000bar");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo0", iter->key().ToString());
+
+      iter->Seek("5000000000000bar");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+
+      iter->Seek("2000000000000fo8");
+      ASSERT_TRUE(!iter->Valid() ||
+                  options.comparator->Compare(iter->key(), "20000001") > 0);
+
+      iter->Seek("5000000000000fo8");
+      ASSERT_TRUE(!iter->Valid());
+
+      iter->Seek("1000000000000fo2");
+      ASSERT_TRUE(!iter->Valid());
+
+      iter->Seek("3000000000000fo2");
+      ASSERT_TRUE(!iter->Valid());
+
+      iter->Seek("8000000000000fo2");
+      ASSERT_TRUE(!iter->Valid());
+
+      delete iter;
+    }
+  }
+}
+
+TEST(PlainTableDBTest, HashBucketConflictReverseSuffixComparator) {
+  for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+       huge_page_tlb_size += 2 * 1024 * 1024) {
+    for (unsigned char i = 1; i <= 3; i++) {
+      Options options = CurrentOptions();
+      options.create_if_missing = true;
+      SimpleSuffixReverseComparator comp;
+      options.comparator = &comp;
+      // Set only one bucket to force bucket conflict.
+      // Test index interval for the same prefix to be 1, 2 and 4
+      options.table_factory.reset(
+          NewTotalOrderPlainTableFactory(16, 0, 2 ^ i, huge_page_tlb_size));
+      DestroyAndReopen(&options);
+      ASSERT_OK(Put("5000000000000fo0", "v1"));
+      ASSERT_OK(Put("5000000000000fo1", "v2"));
+      ASSERT_OK(Put("5000000000000fo2", "v"));
+      ASSERT_OK(Put("2000000000000fo0", "v3"));
+      ASSERT_OK(Put("2000000000000fo1", "v4"));
+      ASSERT_OK(Put("2000000000000fo2", "v"));
+      ASSERT_OK(Put("2000000000000fo3", "v"));
+
+      dbfull()->TEST_FlushMemTable();
+
+      ASSERT_EQ("v1", Get("5000000000000fo0"));
+      ASSERT_EQ("v2", Get("5000000000000fo1"));
+      ASSERT_EQ("v3", Get("2000000000000fo0"));
+      ASSERT_EQ("v4", Get("2000000000000fo1"));
+
+      ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
+      ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
+      ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
+      ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));
+
+      ReadOptions ro;
+      Iterator* iter = dbfull()->NewIterator(ro);
+
+      iter->Seek("5000000000000fo1");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+
+      iter->Seek("5000000000000fo1");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+
+      iter->Seek("2000000000000fo1");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo0", iter->key().ToString());
+
+      iter->Seek("2000000000000fo1");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+
+      iter->Seek("2000000000000var");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo3", iter->key().ToString());
+
+      iter->Seek("5000000000000var");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo2", iter->key().ToString());
+
+      std::string seek_key = "2000000000000bar";
+      iter->Seek(seek_key);
+      ASSERT_TRUE(!iter->Valid() ||
+                  options.prefix_extractor->Transform(iter->key()) !=
+                      options.prefix_extractor->Transform(seek_key));
+
+      iter->Seek("1000000000000fo2");
+      ASSERT_TRUE(!iter->Valid());
+
+      iter->Seek("3000000000000fo2");
+      ASSERT_TRUE(!iter->Valid());
+
+      iter->Seek("8000000000000fo2");
+      ASSERT_TRUE(!iter->Valid());
+
+      delete iter;
+    }
+  }
+}
+
+TEST(PlainTableDBTest, NonExistingKeyToNonEmptyBucket) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  // Set only one bucket to force bucket conflict.
+  // Test index interval for the same prefix to be 1, 2 and 4
+  options.table_factory.reset(NewTotalOrderPlainTableFactory(16, 0, 5));
+  DestroyAndReopen(&options);
+  ASSERT_OK(Put("5000000000000fo0", "v1"));
+  ASSERT_OK(Put("5000000000000fo1", "v2"));
+  ASSERT_OK(Put("5000000000000fo2", "v3"));
+
+  dbfull()->TEST_FlushMemTable();
+
+  ASSERT_EQ("v1", Get("5000000000000fo0"));
+  ASSERT_EQ("v2", Get("5000000000000fo1"));
+  ASSERT_EQ("v3", Get("5000000000000fo2"));
+
+  ASSERT_EQ("NOT_FOUND", Get("8000000000000bar"));
+  ASSERT_EQ("NOT_FOUND", Get("1000000000000bar"));
+
+  Iterator* iter = dbfull()->NewIterator(ReadOptions());
+
+  iter->Seek("5000000000000bar");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+
+  iter->Seek("5000000000000fo8");
+  ASSERT_TRUE(!iter->Valid());
+
+  iter->Seek("1000000000000fo2");
+  ASSERT_TRUE(!iter->Valid());
+
+  iter->Seek("8000000000000fo2");
+  ASSERT_TRUE(!iter->Valid());
+
+  delete iter;
+}
+
+static std::string Key(int i) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "key_______%06d", i);
+  return std::string(buf);
+}
+
+static std::string RandomString(Random* rnd, int len) {
+  std::string r;
+  test::RandomString(rnd, len, &r);
+  return r;
+}
+
+TEST(PlainTableDBTest, CompactionTrigger) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100 << 10; //100KB
+  options.num_levels = 3;
+  options.max_mem_compaction_level = 0;
+  options.level0_file_num_compaction_trigger = 3;
+  Reopen(&options);
+
+  Random rnd(301);
+
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+      num++) {
+    std::vector<std::string> values;
+    // Write 120KB (12 values, each 10K)
+    for (int i = 0; i < 12; i++) {
+      values.push_back(RandomString(&rnd, 10000));
+      ASSERT_OK(Put(Key(i), values[i]));
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    ASSERT_EQ(NumTableFilesAtLevel(0), num + 1);
+  }
+
+  //generate one more file in level-0, and should trigger level-0 compaction
+  std::vector<std::string> values;
+  for (int i = 0; i < 12; i++) {
+    values.push_back(RandomString(&rnd, 10000));
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  dbfull()->TEST_WaitForCompact();
+
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
--- a/db/prefix_test.cc
+++ b/db/prefix_test.cc
@@ -0,0 +1,499 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#else
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include <gflags/gflags.h>
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/memtablerep.h"
+#include "util/histogram.h"
+#include "util/stop_watch.h"
+#include "util/testharness.h"
+
+using GFLAGS::ParseCommandLineFlags;
+
+DEFINE_bool(trigger_deadlock, false,
+            "issue delete in range scan to trigger PrefixHashMap deadlock");
+DEFINE_uint64(bucket_count, 100000, "number of buckets");
+DEFINE_uint64(num_locks, 10001, "number of locks");
+DEFINE_bool(random_prefix, false, "randomize prefix");
+DEFINE_uint64(total_prefixes, 100000, "total number of prefixes");
+DEFINE_uint64(items_per_prefix, 1, "total number of values per prefix");
+DEFINE_int64(write_buffer_size, 33554432, "");
+DEFINE_int64(max_write_buffer_number, 2, "");
+DEFINE_int64(min_write_buffer_number_to_merge, 1, "");
+DEFINE_int32(skiplist_height, 4, "");
+DEFINE_int32(memtable_prefix_bloom_bits, 10000000, "");
+DEFINE_int32(memtable_prefix_bloom_probes, 10, "");
+DEFINE_int32(memtable_prefix_bloom_huge_page_tlb_size, 2 * 1024 * 1024, "");
+DEFINE_int32(value_size, 40, "");
+
+// Path to the database on file system
+const std::string kDbName = rocksdb::test::TmpDir() + "/prefix_test";
+
+namespace rocksdb {
+
+struct TestKey {
+  uint64_t prefix;
+  uint64_t sorted;
+
+  TestKey(uint64_t prefix, uint64_t sorted) : prefix(prefix), sorted(sorted) {}
+};
+
+// return a slice backed by test_key
+inline Slice TestKeyToSlice(const TestKey& test_key) {
+  return Slice((const char*)&test_key, sizeof(test_key));
+}
+
+inline const TestKey* SliceToTestKey(const Slice& slice) {
+  return (const TestKey*)slice.data();
+}
+
+class TestKeyComparator : public Comparator {
+ public:
+
+  // Compare needs to be aware of the possibility of a and/or b is
+  // prefix only
+  virtual int Compare(const Slice& a, const Slice& b) const {
+    const TestKey* key_a = SliceToTestKey(a);
+    const TestKey* key_b = SliceToTestKey(b);
+    if (key_a->prefix != key_b->prefix) {
+      if (key_a->prefix < key_b->prefix) return -1;
+      if (key_a->prefix > key_b->prefix) return 1;
+    } else {
+      ASSERT_TRUE(key_a->prefix == key_b->prefix);
+      // note, both a and b could be prefix only
+      if (a.size() != b.size()) {
+        // one of them is prefix
+        ASSERT_TRUE(
+          (a.size() == sizeof(uint64_t) && b.size() == sizeof(TestKey)) ||
+          (b.size() == sizeof(uint64_t) && a.size() == sizeof(TestKey)));
+        if (a.size() < b.size()) return -1;
+        if (a.size() > b.size()) return 1;
+      } else {
+        // both a and b are prefix
+        if (a.size() == sizeof(uint64_t)) {
+          return 0;
+        }
+
+        // both a and b are whole key
+        ASSERT_TRUE(a.size() == sizeof(TestKey) && b.size() == sizeof(TestKey));
+        if (key_a->sorted < key_b->sorted) return -1;
+        if (key_a->sorted > key_b->sorted) return 1;
+        if (key_a->sorted == key_b->sorted) return 0;
+      }
+    }
+    return 0;
+  }
+
+  virtual const char* Name() const override {
+    return "TestKeyComparator";
+  }
+
+  virtual void FindShortestSeparator(
+      std::string* start,
+      const Slice& limit) const {
+  }
+
+  virtual void FindShortSuccessor(std::string* key) const {}
+
+};
+
+namespace {
+void PutKey(DB* db, WriteOptions write_options, uint64_t prefix,
+            uint64_t suffix, const Slice& value) {
+  TestKey test_key(prefix, suffix);
+  Slice key = TestKeyToSlice(test_key);
+  ASSERT_OK(db->Put(write_options, key, value));
+}
+
+void SeekIterator(Iterator* iter, uint64_t prefix, uint64_t suffix) {
+  TestKey test_key(prefix, suffix);
+  Slice key = TestKeyToSlice(test_key);
+  iter->Seek(key);
+}
+
+const std::string kNotFoundResult = "NOT_FOUND";
+
+std::string Get(DB* db, const ReadOptions& read_options, uint64_t prefix,
+                uint64_t suffix) {
+  TestKey test_key(prefix, suffix);
+  Slice key = TestKeyToSlice(test_key);
+
+  std::string result;
+  Status s = db->Get(read_options, key, &result);
+  if (s.IsNotFound()) {
+    result = kNotFoundResult;
+  } else if (!s.ok()) {
+    result = s.ToString();
+  }
+  return result;
+}
+}  // namespace
+
+class PrefixTest {
+ public:
+  std::shared_ptr<DB> OpenDb() {
+    DB* db;
+
+    options.create_if_missing = true;
+    options.write_buffer_size = FLAGS_write_buffer_size;
+    options.max_write_buffer_number = FLAGS_max_write_buffer_number;
+    options.min_write_buffer_number_to_merge =
+      FLAGS_min_write_buffer_number_to_merge;
+
+    options.memtable_prefix_bloom_bits = FLAGS_memtable_prefix_bloom_bits;
+    options.memtable_prefix_bloom_probes = FLAGS_memtable_prefix_bloom_probes;
+    options.memtable_prefix_bloom_huge_page_tlb_size =
+        FLAGS_memtable_prefix_bloom_huge_page_tlb_size;
+
+    Status s = DB::Open(options, kDbName,  &db);
+    ASSERT_OK(s);
+    return std::shared_ptr<DB>(db);
+  }
+
+  void FirstOption() {
+    option_config_ = kBegin;
+  }
+
+  bool NextOptions(int bucket_count) {
+    // skip some options
+    option_config_++;
+    if (option_config_ < kEnd) {
+      options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+      switch(option_config_) {
+        case kHashSkipList:
+          options.memtable_factory.reset(
+              NewHashSkipListRepFactory(bucket_count, FLAGS_skiplist_height));
+          return true;
+        case kHashLinkList:
+          options.memtable_factory.reset(
+              NewHashLinkListRepFactory(bucket_count));
+          return true;
+        case kHashLinkListHugePageTlb:
+          options.memtable_factory.reset(
+              NewHashLinkListRepFactory(bucket_count, 2 * 1024 * 1024));
+          return true;
+        default:
+          return false;
+      }
+    }
+    return false;
+  }
+
+  PrefixTest() : option_config_(kBegin) {
+    options.comparator = new TestKeyComparator();
+  }
+  ~PrefixTest() {
+    delete options.comparator;
+  }
+ protected:
+  enum OptionConfig {
+    kBegin,
+    kHashSkipList,
+    kHashLinkList,
+    kHashLinkListHugePageTlb,
+    kEnd
+  };
+  int option_config_;
+  Options options;
+};
+
+TEST(PrefixTest, TestResult) {
+  for (int num_buckets = 1; num_buckets <= 2; num_buckets++) {
+    FirstOption();
+    while (NextOptions(num_buckets)) {
+      std::cout << "*** Mem table: " << options.memtable_factory->Name()
+                << " number of buckets: " << num_buckets
+                << std::endl;
+      DestroyDB(kDbName, Options());
+      auto db = OpenDb();
+      WriteOptions write_options;
+      ReadOptions read_options;
+
+      // 1. Insert one row.
+      Slice v16("v16");
+      PutKey(db.get(), write_options, 1, 6, v16);
+      std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+      SeekIterator(iter.get(), 1, 6);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v16 == iter->value());
+      SeekIterator(iter.get(), 1, 5);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v16 == iter->value());
+      SeekIterator(iter.get(), 1, 5);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v16 == iter->value());
+      iter->Next();
+      ASSERT_TRUE(!iter->Valid());
+
+      SeekIterator(iter.get(), 2, 0);
+      ASSERT_TRUE(!iter->Valid());
+
+      ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6));
+      ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 1, 5));
+      ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 1, 7));
+      ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 0, 6));
+      ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 2, 6));
+
+      // 2. Insert an entry for the same prefix as the last entry in the bucket.
+      Slice v17("v17");
+      PutKey(db.get(), write_options, 1, 7, v17);
+      iter.reset(db->NewIterator(read_options));
+      SeekIterator(iter.get(), 1, 7);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v17 == iter->value());
+
+      SeekIterator(iter.get(), 1, 6);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v16 == iter->value());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v17 == iter->value());
+      iter->Next();
+      ASSERT_TRUE(!iter->Valid());
+
+      SeekIterator(iter.get(), 2, 0);
+      ASSERT_TRUE(!iter->Valid());
+
+      // 3. Insert an entry for the same prefix as the head of the bucket.
+      Slice v15("v15");
+      PutKey(db.get(), write_options, 1, 5, v15);
+      iter.reset(db->NewIterator(read_options));
+
+      SeekIterator(iter.get(), 1, 7);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v17 == iter->value());
+
+      SeekIterator(iter.get(), 1, 5);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v15 == iter->value());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v16 == iter->value());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v17 == iter->value());
+
+      SeekIterator(iter.get(), 1, 5);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v15 == iter->value());
+
+      ASSERT_EQ(v15.ToString(), Get(db.get(), read_options, 1, 5));
+      ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6));
+      ASSERT_EQ(v17.ToString(), Get(db.get(), read_options, 1, 7));
+
+      // 4. Insert an entry with a larger prefix
+      Slice v22("v22");
+      PutKey(db.get(), write_options, 2, 2, v22);
+      iter.reset(db->NewIterator(read_options));
+
+      SeekIterator(iter.get(), 2, 2);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v22 == iter->value());
+      SeekIterator(iter.get(), 2, 0);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v22 == iter->value());
+
+      SeekIterator(iter.get(), 1, 5);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v15 == iter->value());
+
+      SeekIterator(iter.get(), 1, 7);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v17 == iter->value());
+
+      // 5. Insert an entry with a smaller prefix
+      Slice v02("v02");
+      PutKey(db.get(), write_options, 0, 2, v02);
+      iter.reset(db->NewIterator(read_options));
+
+      SeekIterator(iter.get(), 0, 2);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v02 == iter->value());
+      SeekIterator(iter.get(), 0, 0);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v02 == iter->value());
+
+      SeekIterator(iter.get(), 2, 0);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v22 == iter->value());
+
+      SeekIterator(iter.get(), 1, 5);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v15 == iter->value());
+
+      SeekIterator(iter.get(), 1, 7);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v17 == iter->value());
+
+      // 6. Insert to the beginning and the end of the first prefix
+      Slice v13("v13");
+      Slice v18("v18");
+      PutKey(db.get(), write_options, 1, 3, v13);
+      PutKey(db.get(), write_options, 1, 8, v18);
+      iter.reset(db->NewIterator(read_options));
+      SeekIterator(iter.get(), 1, 7);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v17 == iter->value());
+
+      SeekIterator(iter.get(), 1, 3);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v13 == iter->value());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v15 == iter->value());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v16 == iter->value());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v17 == iter->value());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v18 == iter->value());
+
+      SeekIterator(iter.get(), 0, 0);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v02 == iter->value());
+
+      SeekIterator(iter.get(), 2, 0);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v22 == iter->value());
+
+      ASSERT_EQ(v22.ToString(), Get(db.get(), read_options, 2, 2));
+      ASSERT_EQ(v02.ToString(), Get(db.get(), read_options, 0, 2));
+      ASSERT_EQ(v13.ToString(), Get(db.get(), read_options, 1, 3));
+      ASSERT_EQ(v15.ToString(), Get(db.get(), read_options, 1, 5));
+      ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6));
+      ASSERT_EQ(v17.ToString(), Get(db.get(), read_options, 1, 7));
+      ASSERT_EQ(v18.ToString(), Get(db.get(), read_options, 1, 8));
+    }
+  }
+}
+
+TEST(PrefixTest, DynamicPrefixIterator) {
+  while (NextOptions(FLAGS_bucket_count)) {
+    std::cout << "*** Mem table: " << options.memtable_factory->Name()
+        << std::endl;
+    DestroyDB(kDbName, Options());
+    auto db = OpenDb();
+    WriteOptions write_options;
+    ReadOptions read_options;
+
+    std::vector<uint64_t> prefixes;
+    for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) {
+      prefixes.push_back(i);
+    }
+
+    if (FLAGS_random_prefix) {
+      std::random_shuffle(prefixes.begin(), prefixes.end());
+    }
+
+    HistogramImpl hist_put_time;
+    HistogramImpl hist_put_comparison;
+
+    // insert x random prefix, each with y continuous element.
+    for (auto prefix : prefixes) {
+       for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) {
+        TestKey test_key(prefix, sorted);
+
+        Slice key = TestKeyToSlice(test_key);
+        std::string value(FLAGS_value_size, 0);
+
+        perf_context.Reset();
+        StopWatchNano timer(Env::Default(), true);
+        ASSERT_OK(db->Put(write_options, key, value));
+        hist_put_time.Add(timer.ElapsedNanos());
+        hist_put_comparison.Add(perf_context.user_key_comparison_count);
+      }
+    }
+
+    std::cout << "Put key comparison: \n" << hist_put_comparison.ToString()
+              << "Put time: \n" << hist_put_time.ToString();
+
+    // test seek existing keys
+    HistogramImpl hist_seek_time;
+    HistogramImpl hist_seek_comparison;
+
+    std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+
+    for (auto prefix : prefixes) {
+      TestKey test_key(prefix, FLAGS_items_per_prefix / 2);
+      Slice key = TestKeyToSlice(test_key);
+      std::string value = "v" + std::to_string(0);
+
+      perf_context.Reset();
+      StopWatchNano timer(Env::Default(), true);
+      auto key_prefix = options.prefix_extractor->Transform(key);
+      uint64_t total_keys = 0;
+      for (iter->Seek(key);
+           iter->Valid() && iter->key().starts_with(key_prefix);
+           iter->Next()) {
+        if (FLAGS_trigger_deadlock) {
+          std::cout << "Behold the deadlock!\n";
+          db->Delete(write_options, iter->key());
+        }
+        total_keys++;
+      }
+      hist_seek_time.Add(timer.ElapsedNanos());
+      hist_seek_comparison.Add(perf_context.user_key_comparison_count);
+      ASSERT_EQ(total_keys, FLAGS_items_per_prefix - FLAGS_items_per_prefix/2);
+    }
+
+    std::cout << "Seek key comparison: \n"
+              << hist_seek_comparison.ToString()
+              << "Seek time: \n"
+              << hist_seek_time.ToString();
+
+    // test non-existing keys
+    HistogramImpl hist_no_seek_time;
+    HistogramImpl hist_no_seek_comparison;
+
+    for (auto prefix = FLAGS_total_prefixes;
+         prefix < FLAGS_total_prefixes + 10000;
+         prefix++) {
+      TestKey test_key(prefix, 0);
+      Slice key = TestKeyToSlice(test_key);
+
+      perf_context.Reset();
+      StopWatchNano timer(Env::Default(), true);
+      iter->Seek(key);
+      hist_no_seek_time.Add(timer.ElapsedNanos());
+      hist_no_seek_comparison.Add(perf_context.user_key_comparison_count);
+      ASSERT_TRUE(!iter->Valid());
+    }
+
+    std::cout << "non-existing Seek key comparison: \n"
+              << hist_no_seek_comparison.ToString()
+              << "non-existing Seek time: \n"
+              << hist_no_seek_time.ToString();
+  }
+}
+
+}
+
+int main(int argc, char** argv) {
+  ParseCommandLineFlags(&argc, &argv, true);
+  std::cout << kDbName << "\n";
+
+  rocksdb::test::RunAllTests();
+  return 0;
+}
+
+#endif  // GFLAGS
--- a/db/repair.cc
+++ b/db/repair.cc
@@ -0,0 +1,403 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// We recover the contents of the descriptor from the other files we find.
+// (1) Any log files are first converted to tables
+// (2) We scan every table to compute
+//     (a) smallest/largest for the table
+//     (b) largest sequence number in the table
+// (3) We generate descriptor contents:
+//      - log number is set to zero
+//      - next-file-number is set to 1 + largest file number we found
+//      - last-sequence-number is set to largest sequence# found across
+//        all tables (see 2c)
+//      - compaction pointers are cleared
+//      - every table file is added at level 0
+//
+// Possible optimization 1:
+//   (a) Compute total size and use to pick appropriate max-level M
+//   (b) Sort tables by largest sequence# in the table
+//   (c) For each table: if it overlaps earlier table, place in level-0,
+//       else place in level-M.
+// Possible optimization 2:
+//   Store per-table metadata (smallest, largest, largest-seq#, ...)
+//   in the table's meta section to speed up ScanTable.
+
+#ifndef ROCKSDB_LITE
+
+#include "db/builder.h"
+#include "db/db_impl.h"
+#include "db/dbformat.h"
+#include "db/filename.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/table_cache.h"
+#include "db/version_edit.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+
+namespace rocksdb {
+
+namespace {
+
+class Repairer {
+ public:
+  Repairer(const std::string& dbname, const Options& options)
+      : dbname_(dbname),
+        env_(options.env),
+        icmp_(options.comparator),
+        ipolicy_(options.filter_policy),
+        options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)),
+        raw_table_cache_(
+            // TableCache can be small since we expect each table to be opened
+            // once.
+            NewLRUCache(10, options_.table_cache_numshardbits,
+                        options_.table_cache_remove_scan_count_limit)),
+        next_file_number_(1) {
+    table_cache_ = new TableCache(dbname_, &options_, storage_options_,
+                                  raw_table_cache_.get());
+    edit_ = new VersionEdit();
+  }
+
+  ~Repairer() {
+    delete table_cache_;
+    raw_table_cache_.reset();
+    delete edit_;
+  }
+
+  Status Run() {
+    Status status = FindFiles();
+    if (status.ok()) {
+      ConvertLogFilesToTables();
+      ExtractMetaData();
+      status = WriteDescriptor();
+    }
+    if (status.ok()) {
+      unsigned long long bytes = 0;
+      for (size_t i = 0; i < tables_.size(); i++) {
+        bytes += tables_[i].meta.file_size;
+      }
+      Log(options_.info_log,
+          "**** Repaired rocksdb %s; "
+          "recovered %d files; %llu bytes. "
+          "Some data may have been lost. "
+          "****",
+          dbname_.c_str(),
+          static_cast<int>(tables_.size()),
+          bytes);
+    }
+    return status;
+  }
+
+ private:
+  struct TableInfo {
+    FileMetaData meta;
+    SequenceNumber min_sequence;
+    SequenceNumber max_sequence;
+  };
+
+  std::string const dbname_;
+  Env* const env_;
+  InternalKeyComparator const icmp_;
+  InternalFilterPolicy const ipolicy_;
+  Options const options_;
+  std::shared_ptr<Cache> raw_table_cache_;
+  TableCache* table_cache_;
+  VersionEdit* edit_;
+
+  std::vector<std::string> manifests_;
+  std::vector<uint64_t> table_numbers_;
+  std::vector<uint64_t> logs_;
+  std::vector<TableInfo> tables_;
+  uint64_t next_file_number_;
+  const EnvOptions storage_options_;
+
+  Status FindFiles() {
+    std::vector<std::string> filenames;
+    Status status = env_->GetChildren(dbname_, &filenames);
+    if (!status.ok()) {
+      return status;
+    }
+    if (filenames.empty()) {
+      return Status::Corruption(dbname_, "repair found no files");
+    }
+
+    uint64_t number;
+    FileType type;
+    for (size_t i = 0; i < filenames.size(); i++) {
+      if (ParseFileName(filenames[i], &number, &type)) {
+        if (type == kDescriptorFile) {
+          manifests_.push_back(filenames[i]);
+        } else {
+          if (number + 1 > next_file_number_) {
+            next_file_number_ = number + 1;
+          }
+          if (type == kLogFile) {
+            logs_.push_back(number);
+          } else if (type == kTableFile) {
+            table_numbers_.push_back(number);
+          } else {
+            // Ignore other files
+          }
+        }
+      }
+    }
+    return status;
+  }
+
+  void ConvertLogFilesToTables() {
+    for (size_t i = 0; i < logs_.size(); i++) {
+      std::string logname = LogFileName(dbname_, logs_[i]);
+      Status status = ConvertLogToTable(logs_[i]);
+      if (!status.ok()) {
+        Log(options_.info_log, "Log #%llu: ignoring conversion error: %s",
+            (unsigned long long) logs_[i],
+            status.ToString().c_str());
+      }
+      ArchiveFile(logname);
+    }
+  }
+
+  Status ConvertLogToTable(uint64_t log) {
+    struct LogReporter : public log::Reader::Reporter {
+      Env* env;
+      std::shared_ptr<Logger> info_log;
+      uint64_t lognum;
+      virtual void Corruption(size_t bytes, const Status& s) {
+        // We print error messages for corruption, but continue repairing.
+        Log(info_log, "Log #%llu: dropping %d bytes; %s",
+            (unsigned long long) lognum,
+            static_cast<int>(bytes),
+            s.ToString().c_str());
+      }
+    };
+
+    // Open the log file
+    std::string logname = LogFileName(dbname_, log);
+    unique_ptr<SequentialFile> lfile;
+    Status status = env_->NewSequentialFile(logname, &lfile, storage_options_);
+    if (!status.ok()) {
+      return status;
+    }
+
+    // Create the log reader.
+    LogReporter reporter;
+    reporter.env = env_;
+    reporter.info_log = options_.info_log;
+    reporter.lognum = log;
+    // We intentially make log::Reader do checksumming so that
+    // corruptions cause entire commits to be skipped instead of
+    // propagating bad information (like overly large sequence
+    // numbers).
+    log::Reader reader(std::move(lfile), &reporter, false/*do not checksum*/,
+                       0/*initial_offset*/);
+
+    // Read all the records and add to a memtable
+    std::string scratch;
+    Slice record;
+    WriteBatch batch;
+    MemTable* mem = new MemTable(icmp_, options_);
+    auto cf_mems_default = new ColumnFamilyMemTablesDefault(mem, &options_);
+    mem->Ref();
+    int counter = 0;
+    while (reader.ReadRecord(&record, &scratch)) {
+      if (record.size() < 12) {
+        reporter.Corruption(
+            record.size(), Status::Corruption("log record too small"));
+        continue;
+      }
+      WriteBatchInternal::SetContents(&batch, record);
+      status = WriteBatchInternal::InsertInto(&batch, cf_mems_default);
+      if (status.ok()) {
+        counter += WriteBatchInternal::Count(&batch);
+      } else {
+        Log(options_.info_log, "Log #%llu: ignoring %s",
+            (unsigned long long) log,
+            status.ToString().c_str());
+        status = Status::OK();  // Keep going with rest of file
+      }
+    }
+
+    // Do not record a version edit for this conversion to a Table
+    // since ExtractMetaData() will also generate edits.
+    FileMetaData meta;
+    meta.number = next_file_number_++;
+    ReadOptions ro;
+    Iterator* iter = mem->NewIterator(ro, true /* enforce_total_order */);
+    status = BuildTable(dbname_, env_, options_, storage_options_, table_cache_,
+                        iter, &meta, icmp_, 0, 0, kNoCompression);
+    delete iter;
+    delete mem->Unref();
+    delete cf_mems_default;
+    mem = nullptr;
+    if (status.ok()) {
+      if (meta.file_size > 0) {
+        table_numbers_.push_back(meta.number);
+      }
+    }
+    Log(options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s",
+        (unsigned long long) log,
+        counter,
+        (unsigned long long) meta.number,
+        status.ToString().c_str());
+    return status;
+  }
+
+  void ExtractMetaData() {
+    for (size_t i = 0; i < table_numbers_.size(); i++) {
+      TableInfo t;
+      t.meta.number = table_numbers_[i];
+      Status status = ScanTable(&t);
+      if (!status.ok()) {
+        std::string fname = TableFileName(dbname_, table_numbers_[i]);
+        Log(options_.info_log, "Table #%llu: ignoring %s",
+            (unsigned long long) table_numbers_[i],
+            status.ToString().c_str());
+        ArchiveFile(fname);
+      } else {
+        tables_.push_back(t);
+      }
+    }
+  }
+
+  Status ScanTable(TableInfo* t) {
+    std::string fname = TableFileName(dbname_, t->meta.number);
+    int counter = 0;
+    Status status = env_->GetFileSize(fname, &t->meta.file_size);
+    if (status.ok()) {
+      FileMetaData dummy_meta(t->meta.number, t->meta.file_size);
+      Iterator* iter = table_cache_->NewIterator(
+          ReadOptions(), storage_options_, icmp_, dummy_meta);
+      bool empty = true;
+      ParsedInternalKey parsed;
+      t->min_sequence = 0;
+      t->max_sequence = 0;
+      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+        Slice key = iter->key();
+        if (!ParseInternalKey(key, &parsed)) {
+          Log(options_.info_log, "Table #%llu: unparsable key %s",
+              (unsigned long long) t->meta.number,
+              EscapeString(key).c_str());
+          continue;
+        }
+
+        counter++;
+        if (empty) {
+          empty = false;
+          t->meta.smallest.DecodeFrom(key);
+        }
+        t->meta.largest.DecodeFrom(key);
+        if (parsed.sequence < t->min_sequence) {
+          t->min_sequence = parsed.sequence;
+        }
+        if (parsed.sequence > t->max_sequence) {
+          t->max_sequence = parsed.sequence;
+        }
+      }
+      if (!iter->status().ok()) {
+        status = iter->status();
+      }
+      delete iter;
+    }
+    Log(options_.info_log, "Table #%llu: %d entries %s",
+        (unsigned long long) t->meta.number,
+        counter,
+        status.ToString().c_str());
+    return status;
+  }
+
+  Status WriteDescriptor() {
+    std::string tmp = TempFileName(dbname_, 1);
+    unique_ptr<WritableFile> file;
+    Status status = env_->NewWritableFile(
+        tmp, &file, env_->OptimizeForManifestWrite(storage_options_));
+    if (!status.ok()) {
+      return status;
+    }
+
+    SequenceNumber max_sequence = 0;
+    for (size_t i = 0; i < tables_.size(); i++) {
+      if (max_sequence < tables_[i].max_sequence) {
+        max_sequence = tables_[i].max_sequence;
+      }
+    }
+
+    edit_->SetComparatorName(icmp_.user_comparator()->Name());
+    edit_->SetLogNumber(0);
+    edit_->SetNextFile(next_file_number_);
+    edit_->SetLastSequence(max_sequence);
+
+    for (size_t i = 0; i < tables_.size(); i++) {
+      // TODO(opt): separate out into multiple levels
+      const TableInfo& t = tables_[i];
+      edit_->AddFile(0, t.meta.number, t.meta.file_size,
+                    t.meta.smallest, t.meta.largest,
+                    t.min_sequence, t.max_sequence);
+    }
+
+    //fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str());
+    {
+      log::Writer log(std::move(file));
+      std::string record;
+      edit_->EncodeTo(&record);
+      status = log.AddRecord(record);
+    }
+
+    if (!status.ok()) {
+      env_->DeleteFile(tmp);
+    } else {
+      // Discard older manifests
+      for (size_t i = 0; i < manifests_.size(); i++) {
+        ArchiveFile(dbname_ + "/" + manifests_[i]);
+      }
+
+      // Install new manifest
+      status = env_->RenameFile(tmp, DescriptorFileName(dbname_, 1));
+      if (status.ok()) {
+        status = SetCurrentFile(env_, dbname_, 1, nullptr);
+      } else {
+        env_->DeleteFile(tmp);
+      }
+    }
+    return status;
+  }
+
+  void ArchiveFile(const std::string& fname) {
+    // Move into another directory.  E.g., for
+    //    dir/foo
+    // rename to
+    //    dir/lost/foo
+    const char* slash = strrchr(fname.c_str(), '/');
+    std::string new_dir;
+    if (slash != nullptr) {
+      new_dir.assign(fname.data(), slash - fname.data());
+    }
+    new_dir.append("/lost");
+    env_->CreateDir(new_dir);  // Ignore error
+    std::string new_file = new_dir;
+    new_file.append("/");
+    new_file.append((slash == nullptr) ? fname.c_str() : slash + 1);
+    Status s = env_->RenameFile(fname, new_file);
+    Log(options_.info_log, "Archiving %s: %s\n",
+        fname.c_str(), s.ToString().c_str());
+  }
+};
+}  // namespace
+
+Status RepairDB(const std::string& dbname, const Options& options) {
+  Repairer repairer(dbname, options);
+  return repairer.Run();
+}
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
--- a/db/simple_table_db_test.cc
+++ b/db/simple_table_db_test.cc
@@ -0,0 +1,800 @@
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include <algorithm>
+#include <set>
+
+#include "rocksdb/db.h"
+#include "rocksdb/filter_policy.h"
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/env.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "table/table_builder.h"
+#include "util/hash.h"
+#include "util/logging.h"
+#include "util/mutexlock.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+#include "utilities/merge_operators.h"
+
+using std::unique_ptr;
+
+// IS THIS FILE STILL NEEDED?
+namespace rocksdb {
+
+// SimpleTable is a simple table format for UNIT TEST ONLY. It is not built
+// as production quality.
+// SimpleTable requires the input key size to be fixed 16 bytes, value cannot
+// be longer than 150000 bytes and stored data on disk in this format:
+// +--------------------------------------------+  <= key1 offset
+// | key1            | value_size (4 bytes) |   |
+// +----------------------------------------+   |
+// | value1                                     |
+// |                                            |
+// +----------------------------------------+---+  <= key2 offset
+// | key2            | value_size (4 bytes) |   |
+// +----------------------------------------+   |
+// | value2                                     |
+// |                                            |
+// |        ......                              |
+// +-----------------+--------------------------+   <= index_block_offset
+// | key1            | key1 offset (8 bytes)    |
+// +-----------------+--------------------------+
+// | key2            | key2 offset (8 bytes)    |
+// +-----------------+--------------------------+
+// | key3            | key3 offset (8 bytes)    |
+// +-----------------+--------------------------+
+// |        ......                              |
+// +-----------------+------------+-------------+
+// | index_block_offset (8 bytes) |
+// +------------------------------+
+
+// SimpleTable is a simple table format for UNIT TEST ONLY. It is not built
+// as production quality.
+class SimpleTableReader: public TableReader {
+public:
+  // Attempt to open the table that is stored in bytes [0..file_size)
+  // of "file", and read the metadata entries necessary to allow
+  // retrieving data from the table.
+  //
+  // If successful, returns ok and sets "*table" to the newly opened
+  // table.  The client should delete "*table" when no longer needed.
+  // If there was an error while initializing the table, sets "*table"
+  // to nullptr and returns a non-ok status.  Does not take ownership of
+  // "*source", but the client must ensure that "source" remains live
+  // for the duration of the returned table's lifetime.
+  //
+  // *file must remain live while this Table is in use.
+  static Status Open(const Options& options, const EnvOptions& soptions,
+                     unique_ptr<RandomAccessFile> && file, uint64_t file_size,
+                     unique_ptr<TableReader>* table_reader);
+
+  Iterator* NewIterator(const ReadOptions&, Arena* arena) override;
+
+  Status Get(const ReadOptions&, const Slice& key, void* arg,
+             bool (*handle_result)(void* arg, const ParsedInternalKey& k,
+                                   const Slice& v, bool),
+             void (*mark_key_may_exist)(void*) = nullptr) override;
+
+  uint64_t ApproximateOffsetOf(const Slice& key) override;
+
+  void SetupForCompaction() override;
+
+  std::shared_ptr<const TableProperties> GetTableProperties() const override;
+
+  ~SimpleTableReader();
+
+private:
+  struct Rep;
+  Rep* rep_;
+
+  explicit SimpleTableReader(Rep* rep) {
+    rep_ = rep;
+  }
+  friend class TableCache;
+  friend class SimpleTableIterator;
+
+  Status GetOffset(const Slice& target, uint64_t* offset);
+
+  // No copying allowed
+  explicit SimpleTableReader(const TableReader&) = delete;
+  void operator=(const TableReader&) = delete;
+};
+
+// Iterator to iterate SimpleTable
+class SimpleTableIterator: public Iterator {
+public:
+  explicit SimpleTableIterator(SimpleTableReader* table);
+  ~SimpleTableIterator();
+
+  bool Valid() const;
+
+  void SeekToFirst();
+
+  void SeekToLast();
+
+  void Seek(const Slice& target);
+
+  void Next();
+
+  void Prev();
+
+  Slice key() const;
+
+  Slice value() const;
+
+  Status status() const;
+
+private:
+  SimpleTableReader* table_;
+  uint64_t offset_;
+  uint64_t next_offset_;
+  Slice key_;
+  Slice value_;
+  char tmp_str_[4];
+  char* key_str_;
+  char* value_str_;
+  int value_str_len_;
+  Status status_;
+  // No copying allowed
+  SimpleTableIterator(const SimpleTableIterator&) = delete;
+  void operator=(const Iterator&) = delete;
+};
+
+struct SimpleTableReader::Rep {
+  ~Rep() {
+  }
+  Rep(const EnvOptions& storage_options, uint64_t index_start_offset,
+      int num_entries) :
+      soptions(storage_options), index_start_offset(index_start_offset),
+      num_entries(num_entries) {
+  }
+
+  Options options;
+  const EnvOptions& soptions;
+  Status status;
+  unique_ptr<RandomAccessFile> file;
+  uint64_t index_start_offset;
+  int num_entries;
+  std::shared_ptr<TableProperties> table_properties;
+
+  const static int user_key_size = 16;
+  const static int offset_length = 8;
+  const static int key_footer_len = 8;
+
+  static int GetInternalKeyLength() {
+    return user_key_size + key_footer_len;
+  }
+};
+
+SimpleTableReader::~SimpleTableReader() {
+  delete rep_;
+}
+
+Status SimpleTableReader::Open(const Options& options,
+                               const EnvOptions& soptions,
+                               unique_ptr<RandomAccessFile> && file,
+                               uint64_t size,
+                               unique_ptr<TableReader>* table_reader) {
+  char footer_space[Rep::offset_length];
+  Slice footer_input;
+  Status s = file->Read(size - Rep::offset_length, Rep::offset_length,
+                        &footer_input, footer_space);
+  if (s.ok()) {
+    uint64_t index_start_offset = DecodeFixed64(footer_space);
+
+    int num_entries = (size - Rep::offset_length - index_start_offset)
+        / (Rep::GetInternalKeyLength() + Rep::offset_length);
+    SimpleTableReader::Rep* rep = new SimpleTableReader::Rep(soptions,
+                                                             index_start_offset,
+                                                             num_entries);
+
+    rep->file = std::move(file);
+    rep->options = options;
+    table_reader->reset(new SimpleTableReader(rep));
+  }
+  return s;
+}
+
+void SimpleTableReader::SetupForCompaction() {
+}
+
+std::shared_ptr<const TableProperties> SimpleTableReader::GetTableProperties()
+    const {
+  return rep_->table_properties;
+}
+
+Iterator* SimpleTableReader::NewIterator(const ReadOptions& options,
+                                         Arena* arena) {
+  if (arena == nullptr) {
+    return new SimpleTableIterator(this);
+  } else {
+    auto mem = arena->AllocateAligned(sizeof(SimpleTableIterator));
+    return new (mem) SimpleTableIterator(this);
+  }
+}
+
+Status SimpleTableReader::GetOffset(const Slice& target, uint64_t* offset) {
+  uint32_t left = 0;
+  uint32_t right = rep_->num_entries - 1;
+  char key_chars[Rep::GetInternalKeyLength()];
+  Slice tmp_slice;
+
+  uint32_t target_offset = 0;
+  while (left <= right) {
+    uint32_t mid = (left + right + 1) / 2;
+
+    uint64_t offset_to_read = rep_->index_start_offset
+        + (Rep::GetInternalKeyLength() + Rep::offset_length) * mid;
+    Status s = rep_->file->Read(offset_to_read, Rep::GetInternalKeyLength(),
+                                &tmp_slice, key_chars);
+    if (!s.ok()) {
+      return s;
+    }
+
+    InternalKeyComparator ikc(rep_->options.comparator);
+    int compare_result = ikc.Compare(tmp_slice, target);
+
+    if (compare_result < 0) {
+      if (left == right) {
+        target_offset = right + 1;
+        break;
+      }
+      left = mid;
+    } else {
+      if (left == right) {
+        target_offset = left;
+        break;
+      }
+      right = mid - 1;
+    }
+  }
+
+  if (target_offset >= (uint32_t) rep_->num_entries) {
+    *offset = rep_->index_start_offset;
+    return Status::OK();
+  }
+
+  char value_offset_chars[Rep::offset_length];
+
+  int64_t offset_for_value_offset = rep_->index_start_offset
+      + (Rep::GetInternalKeyLength() + Rep::offset_length) * target_offset
+      + Rep::GetInternalKeyLength();
+  Status s = rep_->file->Read(offset_for_value_offset, Rep::offset_length,
+                              &tmp_slice, value_offset_chars);
+  if (s.ok()) {
+    *offset = DecodeFixed64(value_offset_chars);
+  }
+  return s;
+}
+
+Status SimpleTableReader::Get(const ReadOptions& options, const Slice& k,
+                              void* arg,
+                              bool (*saver)(void*, const ParsedInternalKey&,
+                                            const Slice&, bool),
+                              void (*mark_key_may_exist)(void*)) {
+  Status s;
+  SimpleTableIterator* iter = new SimpleTableIterator(this);
+  for (iter->Seek(k); iter->Valid(); iter->Next()) {
+    ParsedInternalKey parsed_key;
+    if (!ParseInternalKey(iter->key(), &parsed_key)) {
+      return Status::Corruption(Slice());
+    }
+
+    if (!(*saver)(arg, parsed_key, iter->value(), true)) {
+      break;
+    }
+  }
+  s = iter->status();
+  delete iter;
+  return s;
+}
+
+uint64_t SimpleTableReader::ApproximateOffsetOf(const Slice& key) {
+  return 0;
+}
+
+SimpleTableIterator::SimpleTableIterator(SimpleTableReader* table) :
+    table_(table) {
+  key_str_ = new char[SimpleTableReader::Rep::GetInternalKeyLength()];
+  value_str_len_ = -1;
+  SeekToFirst();
+}
+
+SimpleTableIterator::~SimpleTableIterator() {
+ delete[] key_str_;
+ if (value_str_len_ >= 0) {
+   delete[] value_str_;
+ }
+}
+
+bool SimpleTableIterator::Valid() const {
+  return offset_ < table_->rep_->index_start_offset;
+}
+
+void SimpleTableIterator::SeekToFirst() {
+  next_offset_ = 0;
+  Next();
+}
+
+void SimpleTableIterator::SeekToLast() {
+  assert(false);
+}
+
+void SimpleTableIterator::Seek(const Slice& target) {
+  Status s = table_->GetOffset(target, &next_offset_);
+  if (!s.ok()) {
+    status_ = s;
+  }
+  Next();
+}
+
+void SimpleTableIterator::Next() {
+  offset_ = next_offset_;
+  if (offset_ >= table_->rep_->index_start_offset) {
+    return;
+  }
+  Slice result;
+  int internal_key_size = SimpleTableReader::Rep::GetInternalKeyLength();
+
+  Status s = table_->rep_->file->Read(next_offset_, internal_key_size, &result,
+                                      key_str_);
+  next_offset_ += internal_key_size;
+  key_ = result;
+
+  Slice value_size_slice;
+  s = table_->rep_->file->Read(next_offset_, 4, &value_size_slice, tmp_str_);
+  next_offset_ += 4;
+  uint32_t value_size = DecodeFixed32(tmp_str_);
+
+  Slice value_slice;
+  if ((int) value_size > value_str_len_) {
+    if (value_str_len_ >= 0) {
+      delete[] value_str_;
+    }
+    value_str_ = new char[value_size];
+    value_str_len_ = value_size;
+  }
+  s = table_->rep_->file->Read(next_offset_, value_size, &value_slice,
+                               value_str_);
+  next_offset_ += value_size;
+  value_ = value_slice;
+}
+
+void SimpleTableIterator::Prev() {
+  assert(false);
+}
+
+Slice SimpleTableIterator::key() const {
+  Log(table_->rep_->options.info_log, "key!!!!");
+  return key_;
+}
+
+Slice SimpleTableIterator::value() const {
+  return value_;
+}
+
+Status SimpleTableIterator::status() const {
+  return status_;
+}
+
+class SimpleTableBuilder: public TableBuilder {
+public:
+  // Create a builder that will store the contents of the table it is
+  // building in *file.  Does not close the file.  It is up to the
+  // caller to close the file after calling Finish(). The output file
+  // will be part of level specified by 'level'.  A value of -1 means
+  // that the caller does not know which level the output file will reside.
+  SimpleTableBuilder(const Options& options, WritableFile* file,
+                     CompressionType compression_type);
+
+  // REQUIRES: Either Finish() or Abandon() has been called.
+  ~SimpleTableBuilder();
+
+  // Add key,value to the table being constructed.
+  // REQUIRES: key is after any previously added key according to comparator.
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Add(const Slice& key, const Slice& value) override;
+
+  // Return non-ok iff some error has been detected.
+  Status status() const override;
+
+  // Finish building the table.  Stops using the file passed to the
+  // constructor after this function returns.
+  // REQUIRES: Finish(), Abandon() have not been called
+  Status Finish() override;
+
+  // Indicate that the contents of this builder should be abandoned.  Stops
+  // using the file passed to the constructor after this function returns.
+  // If the caller is not going to call Finish(), it must call Abandon()
+  // before destroying this builder.
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Abandon() override;
+
+  // Number of calls to Add() so far.
+  uint64_t NumEntries() const override;
+
+  // Size of the file generated so far.  If invoked after a successful
+  // Finish() call, returns the size of the final generated file.
+  uint64_t FileSize() const override;
+
+private:
+  struct Rep;
+  Rep* rep_;
+
+  // No copying allowed
+  SimpleTableBuilder(const SimpleTableBuilder&) = delete;
+  void operator=(const SimpleTableBuilder&) = delete;
+};
+
+struct SimpleTableBuilder::Rep {
+  Options options;
+  WritableFile* file;
+  uint64_t offset = 0;
+  Status status;
+
+  uint64_t num_entries = 0;
+
+  bool closed = false;  // Either Finish() or Abandon() has been called.
+
+  const static int user_key_size = 16;
+  const static int offset_length = 8;
+  const static int key_footer_len = 8;
+
+  static int GetInternalKeyLength() {
+    return user_key_size + key_footer_len;
+  }
+
+  std::string index;
+
+  Rep(const Options& opt, WritableFile* f) :
+      options(opt), file(f) {
+  }
+  ~Rep() {
+  }
+};
+
+SimpleTableBuilder::SimpleTableBuilder(const Options& options,
+                                       WritableFile* file,
+                                       CompressionType compression_type) :
+    rep_(new SimpleTableBuilder::Rep(options, file)) {
+}
+
+SimpleTableBuilder::~SimpleTableBuilder() {
+  delete (rep_);
+}
+
+void SimpleTableBuilder::Add(const Slice& key, const Slice& value) {
+  assert((int ) key.size() == Rep::GetInternalKeyLength());
+
+  // Update index
+  rep_->index.append(key.data(), key.size());
+  PutFixed64(&(rep_->index), rep_->offset);
+
+  // Write key-value pair
+  rep_->file->Append(key);
+  rep_->offset += Rep::GetInternalKeyLength();
+
+  std::string size;
+  int value_size = value.size();
+  PutFixed32(&size, value_size);
+  Slice sizeSlice(size);
+  rep_->file->Append(sizeSlice);
+  rep_->file->Append(value);
+  rep_->offset += value_size + 4;
+
+  rep_->num_entries++;
+}
+
+Status SimpleTableBuilder::status() const {
+  return Status::OK();
+}
+
+Status SimpleTableBuilder::Finish() {
+  Rep* r = rep_;
+  assert(!r->closed);
+  r->closed = true;
+
+  uint64_t index_offset = rep_->offset;
+  Slice index_slice(rep_->index);
+  rep_->file->Append(index_slice);
+  rep_->offset += index_slice.size();
+
+  std::string index_offset_str;
+  PutFixed64(&index_offset_str, index_offset);
+  Slice foot_slice(index_offset_str);
+  rep_->file->Append(foot_slice);
+  rep_->offset += foot_slice.size();
+
+  return Status::OK();
+}
+
+void SimpleTableBuilder::Abandon() {
+  rep_->closed = true;
+}
+
+uint64_t SimpleTableBuilder::NumEntries() const {
+  return rep_->num_entries;
+}
+
+uint64_t SimpleTableBuilder::FileSize() const {
+  return rep_->offset;
+}
+
+class SimpleTableFactory: public TableFactory {
+public:
+  ~SimpleTableFactory() {
+  }
+  SimpleTableFactory() {
+  }
+  const char* Name() const override {
+    return "SimpleTable";
+  }
+  Status NewTableReader(const Options& options, const EnvOptions& soptions,
+                        const InternalKeyComparator& internal_key,
+                        unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
+                        unique_ptr<TableReader>* table_reader) const;
+
+  TableBuilder* NewTableBuilder(const Options& options,
+                                const InternalKeyComparator& internal_key,
+                                WritableFile* file,
+                                CompressionType compression_type) const;
+};
+
+Status SimpleTableFactory::NewTableReader(
+    const Options& options, const EnvOptions& soptions,
+    const InternalKeyComparator& internal_key,
+    unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
+    unique_ptr<TableReader>* table_reader) const {
+
+  return SimpleTableReader::Open(options, soptions, std::move(file), file_size,
+                                 table_reader);
+}
+
+TableBuilder* SimpleTableFactory::NewTableBuilder(
+    const Options& options, const InternalKeyComparator& internal_key,
+    WritableFile* file, CompressionType compression_type) const {
+  return new SimpleTableBuilder(options, file, compression_type);
+}
+
+class SimpleTableDBTest {
+protected:
+public:
+  std::string dbname_;
+  Env* env_;
+  DB* db_;
+
+  Options last_options_;
+
+  SimpleTableDBTest() :
+      env_(Env::Default()) {
+    dbname_ = test::TmpDir() + "/simple_table_db_test";
+    ASSERT_OK(DestroyDB(dbname_, Options()));
+    db_ = nullptr;
+    Reopen();
+  }
+
+  ~SimpleTableDBTest() {
+    delete db_;
+    ASSERT_OK(DestroyDB(dbname_, Options()));
+  }
+
+  // Return the current option configuration.
+  Options CurrentOptions() {
+    Options options;
+    options.table_factory.reset(new SimpleTableFactory());
+    return options;
+  }
+
+  DBImpl* dbfull() {
+    return reinterpret_cast<DBImpl*>(db_);
+  }
+
+  void Reopen(Options* options = nullptr) {
+    ASSERT_OK(TryReopen(options));
+  }
+
+  void Close() {
+    delete db_;
+    db_ = nullptr;
+  }
+
+  void DestroyAndReopen(Options* options = nullptr) {
+    //Destroy using last options
+    Destroy(&last_options_);
+    ASSERT_OK(TryReopen(options));
+  }
+
+  void Destroy(Options* options) {
+    delete db_;
+    db_ = nullptr;
+    ASSERT_OK(DestroyDB(dbname_, *options));
+  }
+
+  Status PureReopen(Options* options, DB** db) {
+    return DB::Open(*options, dbname_, db);
+  }
+
+  Status TryReopen(Options* options = nullptr) {
+    delete db_;
+    db_ = nullptr;
+    Options opts;
+    if (options != nullptr) {
+      opts = *options;
+    } else {
+      opts = CurrentOptions();
+      opts.create_if_missing = true;
+    }
+    last_options_ = opts;
+
+    return DB::Open(opts, dbname_, &db_);
+  }
+
+  Status Put(const Slice& k, const Slice& v) {
+    return db_->Put(WriteOptions(), k, v);
+  }
+
+  Status Delete(const std::string& k) {
+    return db_->Delete(WriteOptions(), k);
+  }
+
+  std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
+    ReadOptions options;
+    options.snapshot = snapshot;
+    std::string result;
+    Status s = db_->Get(options, k, &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+
+
+  int NumTableFilesAtLevel(int level) {
+    std::string property;
+    ASSERT_TRUE(
+        db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level),
+                         &property));
+    return atoi(property.c_str());
+  }
+
+  // Return spread of files per level
+  std::string FilesPerLevel() {
+    std::string result;
+    int last_non_zero_offset = 0;
+    for (int level = 0; level < db_->NumberLevels(); level++) {
+      int f = NumTableFilesAtLevel(level);
+      char buf[100];
+      snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+      result += buf;
+      if (f > 0) {
+        last_non_zero_offset = result.size();
+      }
+    }
+    result.resize(last_non_zero_offset);
+    return result;
+  }
+
+  std::string IterStatus(Iterator* iter) {
+    std::string result;
+    if (iter->Valid()) {
+      result = iter->key().ToString() + "->" + iter->value().ToString();
+    } else {
+      result = "(invalid)";
+    }
+    return result;
+  }
+};
+
+TEST(SimpleTableDBTest, Empty) {
+  ASSERT_TRUE(db_ != nullptr);
+  ASSERT_EQ("NOT_FOUND", Get("0000000000000foo"));
+}
+
+TEST(SimpleTableDBTest, ReadWrite) {
+  ASSERT_OK(Put("0000000000000foo", "v1"));
+  ASSERT_EQ("v1", Get("0000000000000foo"));
+  ASSERT_OK(Put("0000000000000bar", "v2"));
+  ASSERT_OK(Put("0000000000000foo", "v3"));
+  ASSERT_EQ("v3", Get("0000000000000foo"));
+  ASSERT_EQ("v2", Get("0000000000000bar"));
+}
+
+TEST(SimpleTableDBTest, Flush) {
+  ASSERT_OK(Put("0000000000000foo", "v1"));
+  ASSERT_OK(Put("0000000000000bar", "v2"));
+  ASSERT_OK(Put("0000000000000foo", "v3"));
+  dbfull()->TEST_FlushMemTable();
+  ASSERT_EQ("v3", Get("0000000000000foo"));
+  ASSERT_EQ("v2", Get("0000000000000bar"));
+}
+
+TEST(SimpleTableDBTest, Flush2) {
+  ASSERT_OK(Put("0000000000000bar", "b"));
+  ASSERT_OK(Put("0000000000000foo", "v1"));
+  dbfull()->TEST_FlushMemTable();
+
+  ASSERT_OK(Put("0000000000000foo", "v2"));
+  dbfull()->TEST_FlushMemTable();
+  ASSERT_EQ("v2", Get("0000000000000foo"));
+
+  ASSERT_OK(Put("0000000000000eee", "v3"));
+  dbfull()->TEST_FlushMemTable();
+  ASSERT_EQ("v3", Get("0000000000000eee"));
+
+  ASSERT_OK(Delete("0000000000000bar"));
+  dbfull()->TEST_FlushMemTable();
+  ASSERT_EQ("NOT_FOUND", Get("0000000000000bar"));
+
+  ASSERT_OK(Put("0000000000000eee", "v5"));
+  dbfull()->TEST_FlushMemTable();
+  ASSERT_EQ("v5", Get("0000000000000eee"));
+}
+
+static std::string Key(int i) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "key_______%06d", i);
+  return std::string(buf);
+}
+
+static std::string RandomString(Random* rnd, int len) {
+  std::string r;
+  test::RandomString(rnd, len, &r);
+  return r;
+}
+
+TEST(SimpleTableDBTest, CompactionTrigger) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100 << 10; //100KB
+  options.num_levels = 3;
+  options.max_mem_compaction_level = 0;
+  options.level0_file_num_compaction_trigger = 3;
+  Reopen(&options);
+
+  Random rnd(301);
+
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+      num++) {
+    std::vector<std::string> values;
+    // Write 120KB (12 values, each 10K)
+    for (int i = 0; i < 12; i++) {
+      values.push_back(RandomString(&rnd, 10000));
+      ASSERT_OK(Put(Key(i), values[i]));
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    ASSERT_EQ(NumTableFilesAtLevel(0), num + 1);
+  }
+
+  //generate one more file in level-0, and should trigger level-0 compaction
+  std::vector<std::string> values;
+  for (int i = 0; i < 12; i++) {
+    values.push_back(RandomString(&rnd, 10000));
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  dbfull()->TEST_WaitForCompact();
+
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
--- a/db/skiplist.h
+++ b/db/skiplist.h
@@ -0,0 +1,429 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Thread safety
+// -------------
+//
+// Writes require external synchronization, most likely a mutex.
+// Reads require a guarantee that the SkipList will not be destroyed
+// while the read is in progress.  Apart from that, reads progress
+// without any internal locking or synchronization.
+//
+// Invariants:
+//
+// (1) Allocated nodes are never deleted until the SkipList is
+// destroyed.  This is trivially guaranteed by the code since we
+// never delete any skip list nodes.
+//
+// (2) The contents of a Node except for the next/prev pointers are
+// immutable after the Node has been linked into the SkipList.
+// Only Insert() modifies the list, and it is careful to initialize
+// a node and use release-stores to publish the nodes in one or
+// more lists.
+//
+// ... prev vs. next pointer ordering ...
+//
+
+#pragma once
+#include <assert.h>
+#include <stdlib.h>
+#include "util/arena.h"
+#include "port/port.h"
+#include "util/arena.h"
+#include "util/random.h"
+
+namespace rocksdb {
+
+template<typename Key, class Comparator>
+class SkipList {
+ private:
+  struct Node;
+
+ public:
+  // Create a new SkipList object that will use "cmp" for comparing keys,
+  // and will allocate memory using "*arena".  Objects allocated in the arena
+  // must remain allocated for the lifetime of the skiplist object.
+  explicit SkipList(Comparator cmp, Arena* arena,
+                    int32_t max_height = 12, int32_t branching_factor = 4);
+
+  // Insert key into the list.
+  // REQUIRES: nothing that compares equal to key is currently in the list.
+  void Insert(const Key& key);
+
+  // Returns true iff an entry that compares equal to key is in the list.
+  bool Contains(const Key& key) const;
+
+  // Iteration over the contents of a skip list
+  class Iterator {
+   public:
+    // Initialize an iterator over the specified list.
+    // The returned iterator is not valid.
+    explicit Iterator(const SkipList* list);
+
+    // Change the underlying skiplist used for this iterator
+    // This enables us not changing the iterator without deallocating
+    // an old one and then allocating a new one
+    void SetList(const SkipList* list);
+
+    // Returns true iff the iterator is positioned at a valid node.
+    bool Valid() const;
+
+    // Returns the key at the current position.
+    // REQUIRES: Valid()
+    const Key& key() const;
+
+    // Advances to the next position.
+    // REQUIRES: Valid()
+    void Next();
+
+    // Advances to the previous position.
+    // REQUIRES: Valid()
+    void Prev();
+
+    // Advance to the first entry with a key >= target
+    void Seek(const Key& target);
+
+    // Position at the first entry in list.
+    // Final state of iterator is Valid() iff list is not empty.
+    void SeekToFirst();
+
+    // Position at the last entry in list.
+    // Final state of iterator is Valid() iff list is not empty.
+    void SeekToLast();
+
+   private:
+    const SkipList* list_;
+    Node* node_;
+    // Intentionally copyable
+  };
+
+ private:
+  const int32_t kMaxHeight_;
+  const int32_t kBranching_;
+
+  // Immutable after construction
+  Comparator const compare_;
+  Arena* const arena_;    // Arena used for allocations of nodes
+
+  Node* const head_;
+
+  // Modified only by Insert().  Read racily by readers, but stale
+  // values are ok.
+  port::AtomicPointer max_height_;   // Height of the entire list
+
+  // Used for optimizing sequential insert patterns
+  Node** prev_;
+  int32_t prev_height_;
+
+  inline int GetMaxHeight() const {
+    return static_cast<int>(
+        reinterpret_cast<intptr_t>(max_height_.NoBarrier_Load()));
+  }
+
+  // Read/written only by Insert().
+  Random rnd_;
+
+  Node* NewNode(const Key& key, int height);
+  int RandomHeight();
+  bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }
+
+  // Return true if key is greater than the data stored in "n"
+  bool KeyIsAfterNode(const Key& key, Node* n) const;
+
+  // Return the earliest node that comes at or after key.
+  // Return nullptr if there is no such node.
+  //
+  // If prev is non-nullptr, fills prev[level] with pointer to previous
+  // node at "level" for every level in [0..max_height_-1].
+  Node* FindGreaterOrEqual(const Key& key, Node** prev) const;
+
+  // Return the latest node with a key < key.
+  // Return head_ if there is no such node.
+  Node* FindLessThan(const Key& key) const;
+
+  // Return the last node in the list.
+  // Return head_ if list is empty.
+  Node* FindLast() const;
+
+  // No copying allowed
+  SkipList(const SkipList&);
+  void operator=(const SkipList&);
+};
+
+// Implementation details follow
+template<typename Key, class Comparator>
+struct SkipList<Key, Comparator>::Node {
+  explicit Node(const Key& k) : key(k) { }
+
+  Key const key;
+
+  // Accessors/mutators for links.  Wrapped in methods so we can
+  // add the appropriate barriers as necessary.
+  Node* Next(int n) {
+    assert(n >= 0);
+    // Use an 'acquire load' so that we observe a fully initialized
+    // version of the returned Node.
+    return reinterpret_cast<Node*>(next_[n].Acquire_Load());
+  }
+  void SetNext(int n, Node* x) {
+    assert(n >= 0);
+    // Use a 'release store' so that anybody who reads through this
+    // pointer observes a fully initialized version of the inserted node.
+    next_[n].Release_Store(x);
+  }
+
+  // No-barrier variants that can be safely used in a few locations.
+  Node* NoBarrier_Next(int n) {
+    assert(n >= 0);
+    return reinterpret_cast<Node*>(next_[n].NoBarrier_Load());
+  }
+  void NoBarrier_SetNext(int n, Node* x) {
+    assert(n >= 0);
+    next_[n].NoBarrier_Store(x);
+  }
+
+ private:
+  // Array of length equal to the node height.  next_[0] is lowest level link.
+  port::AtomicPointer next_[1];
+};
+
+template<typename Key, class Comparator>
+typename SkipList<Key, Comparator>::Node*
+SkipList<Key, Comparator>::NewNode(const Key& key, int height) {
+  char* mem = arena_->AllocateAligned(
+      sizeof(Node) + sizeof(port::AtomicPointer) * (height - 1));
+  return new (mem) Node(key);
+}
+
+template<typename Key, class Comparator>
+inline SkipList<Key, Comparator>::Iterator::Iterator(const SkipList* list) {
+  SetList(list);
+}
+
+template<typename Key, class Comparator>
+inline void SkipList<Key, Comparator>::Iterator::SetList(const SkipList* list) {
+  list_ = list;
+  node_ = nullptr;
+}
+
+template<typename Key, class Comparator>
+inline bool SkipList<Key, Comparator>::Iterator::Valid() const {
+  return node_ != nullptr;
+}
+
+template<typename Key, class Comparator>
+inline const Key& SkipList<Key, Comparator>::Iterator::key() const {
+  assert(Valid());
+  return node_->key;
+}
+
+template<typename Key, class Comparator>
+inline void SkipList<Key, Comparator>::Iterator::Next() {
+  assert(Valid());
+  node_ = node_->Next(0);
+}
+
+template<typename Key, class Comparator>
+inline void SkipList<Key, Comparator>::Iterator::Prev() {
+  // Instead of using explicit "prev" links, we just search for the
+  // last node that falls before key.
+  assert(Valid());
+  node_ = list_->FindLessThan(node_->key);
+  if (node_ == list_->head_) {
+    node_ = nullptr;
+  }
+}
+
+template<typename Key, class Comparator>
+inline void SkipList<Key, Comparator>::Iterator::Seek(const Key& target) {
+  node_ = list_->FindGreaterOrEqual(target, nullptr);
+}
+
+template<typename Key, class Comparator>
+inline void SkipList<Key, Comparator>::Iterator::SeekToFirst() {
+  node_ = list_->head_->Next(0);
+}
+
+template<typename Key, class Comparator>
+inline void SkipList<Key, Comparator>::Iterator::SeekToLast() {
+  node_ = list_->FindLast();
+  if (node_ == list_->head_) {
+    node_ = nullptr;
+  }
+}
+
+template<typename Key, class Comparator>
+int SkipList<Key, Comparator>::RandomHeight() {
+  // Increase height with probability 1 in kBranching
+  int height = 1;
+  while (height < kMaxHeight_ && ((rnd_.Next() % kBranching_) == 0)) {
+    height++;
+  }
+  assert(height > 0);
+  assert(height <= kMaxHeight_);
+  return height;
+}
+
+template<typename Key, class Comparator>
+bool SkipList<Key, Comparator>::KeyIsAfterNode(const Key& key, Node* n) const {
+  // nullptr n is considered infinite
+  return (n != nullptr) && (compare_(n->key, key) < 0);
+}
+
+template<typename Key, class Comparator>
+typename SkipList<Key, Comparator>::Node* SkipList<Key, Comparator>::
+  FindGreaterOrEqual(const Key& key, Node** prev) const {
+  // Use prev as an optimization hint and fallback to slow path
+  if (prev && !KeyIsAfterNode(key, prev[0]->Next(0))) {
+    Node* x = prev[0];
+    Node* next = x->Next(0);
+    if ((x == head_) || KeyIsAfterNode(key, x)) {
+      // Adjust all relevant insertion points to the previous entry
+      for (int i = 1; i < prev_height_; i++) {
+        prev[i] = x;
+      }
+      return next;
+    }
+  }
+  // Normal lookup
+  Node* x = head_;
+  int level = GetMaxHeight() - 1;
+  while (true) {
+    Node* next = x->Next(level);
+    // Make sure the lists are sorted.
+    // If x points to head_ or next points nullptr, it is trivially satisfied.
+    assert((x == head_) || (next == nullptr) || KeyIsAfterNode(next->key, x));
+    if (KeyIsAfterNode(key, next)) {
+      // Keep searching in this list
+      x = next;
+    } else {
+      if (prev != nullptr) prev[level] = x;
+      if (level == 0) {
+        return next;
+      } else {
+        // Switch to next list
+        level--;
+      }
+    }
+  }
+}
+
+template<typename Key, class Comparator>
+typename SkipList<Key, Comparator>::Node*
+SkipList<Key, Comparator>::FindLessThan(const Key& key) const {
+  Node* x = head_;
+  int level = GetMaxHeight() - 1;
+  while (true) {
+    assert(x == head_ || compare_(x->key, key) < 0);
+    Node* next = x->Next(level);
+    if (next == nullptr || compare_(next->key, key) >= 0) {
+      if (level == 0) {
+        return x;
+      } else {
+        // Switch to next list
+        level--;
+      }
+    } else {
+      x = next;
+    }
+  }
+}
+
+template<typename Key, class Comparator>
+typename SkipList<Key, Comparator>::Node* SkipList<Key, Comparator>::FindLast()
+    const {
+  Node* x = head_;
+  int level = GetMaxHeight() - 1;
+  while (true) {
+    Node* next = x->Next(level);
+    if (next == nullptr) {
+      if (level == 0) {
+        return x;
+      } else {
+        // Switch to next list
+        level--;
+      }
+    } else {
+      x = next;
+    }
+  }
+}
+
+template<typename Key, class Comparator>
+SkipList<Key, Comparator>::SkipList(const Comparator cmp, Arena* arena,
+                                   int32_t max_height,
+                                   int32_t branching_factor)
+    : kMaxHeight_(max_height),
+      kBranching_(branching_factor),
+      compare_(cmp),
+      arena_(arena),
+      head_(NewNode(0 /* any key will do */, max_height)),
+      max_height_(reinterpret_cast<void*>(1)),
+      prev_height_(1),
+      rnd_(0xdeadbeef) {
+  assert(kMaxHeight_ > 0);
+  assert(kBranching_ > 0);
+  // Allocate the prev_ Node* array, directly from the passed-in arena.
+  // prev_ does not need to be freed, as its life cycle is tied up with
+  // the arena as a whole.
+  prev_ = (Node**) arena_->AllocateAligned(sizeof(Node*) * kMaxHeight_);
+  for (int i = 0; i < kMaxHeight_; i++) {
+    head_->SetNext(i, nullptr);
+    prev_[i] = head_;
+  }
+}
+
+template<typename Key, class Comparator>
+void SkipList<Key, Comparator>::Insert(const Key& key) {
+  // TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual()
+  // here since Insert() is externally synchronized.
+  Node* x = FindGreaterOrEqual(key, prev_);
+
+  // Our data structure does not allow duplicate insertion
+  assert(x == nullptr || !Equal(key, x->key));
+
+  int height = RandomHeight();
+  if (height > GetMaxHeight()) {
+    for (int i = GetMaxHeight(); i < height; i++) {
+      prev_[i] = head_;
+    }
+    //fprintf(stderr, "Change height from %d to %d\n", max_height_, height);
+
+    // It is ok to mutate max_height_ without any synchronization
+    // with concurrent readers.  A concurrent reader that observes
+    // the new value of max_height_ will see either the old value of
+    // new level pointers from head_ (nullptr), or a new value set in
+    // the loop below.  In the former case the reader will
+    // immediately drop to the next level since nullptr sorts after all
+    // keys.  In the latter case the reader will use the new node.
+    max_height_.NoBarrier_Store(reinterpret_cast<void*>(height));
+  }
+
+  x = NewNode(key, height);
+  for (int i = 0; i < height; i++) {
+    // NoBarrier_SetNext() suffices since we will add a barrier when
+    // we publish a pointer to "x" in prev[i].
+    x->NoBarrier_SetNext(i, prev_[i]->NoBarrier_Next(i));
+    prev_[i]->SetNext(i, x);
+  }
+  prev_[0] = x;
+  prev_height_ = height;
+}
+
+template<typename Key, class Comparator>
+bool SkipList<Key, Comparator>::Contains(const Key& key) const {
+  Node* x = FindGreaterOrEqual(key, nullptr);
+  if (x != nullptr && Equal(key, x->key)) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+}  // namespace rocksdb
--- a/db/skiplist_test.cc
+++ b/db/skiplist_test.cc
@@ -0,0 +1,383 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/skiplist.h"
+#include <set>
+#include "rocksdb/env.h"
+#include "util/arena.h"
+#include "util/hash.h"
+#include "util/random.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+typedef uint64_t Key;
+
+struct TestComparator {
+  int operator()(const Key& a, const Key& b) const {
+    if (a < b) {
+      return -1;
+    } else if (a > b) {
+      return +1;
+    } else {
+      return 0;
+    }
+  }
+};
+
+class SkipTest { };
+
+TEST(SkipTest, Empty) {
+  Arena arena;
+  TestComparator cmp;
+  SkipList<Key, TestComparator> list(cmp, &arena);
+  ASSERT_TRUE(!list.Contains(10));
+
+  SkipList<Key, TestComparator>::Iterator iter(&list);
+  ASSERT_TRUE(!iter.Valid());
+  iter.SeekToFirst();
+  ASSERT_TRUE(!iter.Valid());
+  iter.Seek(100);
+  ASSERT_TRUE(!iter.Valid());
+  iter.SeekToLast();
+  ASSERT_TRUE(!iter.Valid());
+}
+
+TEST(SkipTest, InsertAndLookup) {
+  const int N = 2000;
+  const int R = 5000;
+  Random rnd(1000);
+  std::set<Key> keys;
+  Arena arena;
+  TestComparator cmp;
+  SkipList<Key, TestComparator> list(cmp, &arena);
+  for (int i = 0; i < N; i++) {
+    Key key = rnd.Next() % R;
+    if (keys.insert(key).second) {
+      list.Insert(key);
+    }
+  }
+
+  for (int i = 0; i < R; i++) {
+    if (list.Contains(i)) {
+      ASSERT_EQ(keys.count(i), 1U);
+    } else {
+      ASSERT_EQ(keys.count(i), 0U);
+    }
+  }
+
+  // Simple iterator tests
+  {
+    SkipList<Key, TestComparator>::Iterator iter(&list);
+    ASSERT_TRUE(!iter.Valid());
+
+    iter.Seek(0);
+    ASSERT_TRUE(iter.Valid());
+    ASSERT_EQ(*(keys.begin()), iter.key());
+
+    iter.SeekToFirst();
+    ASSERT_TRUE(iter.Valid());
+    ASSERT_EQ(*(keys.begin()), iter.key());
+
+    iter.SeekToLast();
+    ASSERT_TRUE(iter.Valid());
+    ASSERT_EQ(*(keys.rbegin()), iter.key());
+  }
+
+  // Forward iteration test
+  for (int i = 0; i < R; i++) {
+    SkipList<Key, TestComparator>::Iterator iter(&list);
+    iter.Seek(i);
+
+    // Compare against model iterator
+    std::set<Key>::iterator model_iter = keys.lower_bound(i);
+    for (int j = 0; j < 3; j++) {
+      if (model_iter == keys.end()) {
+        ASSERT_TRUE(!iter.Valid());
+        break;
+      } else {
+        ASSERT_TRUE(iter.Valid());
+        ASSERT_EQ(*model_iter, iter.key());
+        ++model_iter;
+        iter.Next();
+      }
+    }
+  }
+
+  // Backward iteration test
+  {
+    SkipList<Key, TestComparator>::Iterator iter(&list);
+    iter.SeekToLast();
+
+    // Compare against model iterator
+    for (std::set<Key>::reverse_iterator model_iter = keys.rbegin();
+         model_iter != keys.rend();
+         ++model_iter) {
+      ASSERT_TRUE(iter.Valid());
+      ASSERT_EQ(*model_iter, iter.key());
+      iter.Prev();
+    }
+    ASSERT_TRUE(!iter.Valid());
+  }
+}
+
+// We want to make sure that with a single writer and multiple
+// concurrent readers (with no synchronization other than when a
+// reader's iterator is created), the reader always observes all the
+// data that was present in the skip list when the iterator was
+// constructor.  Because insertions are happening concurrently, we may
+// also observe new values that were inserted since the iterator was
+// constructed, but we should never miss any values that were present
+// at iterator construction time.
+//
+// We generate multi-part keys:
+//     <key,gen,hash>
+// where:
+//     key is in range [0..K-1]
+//     gen is a generation number for key
+//     hash is hash(key,gen)
+//
+// The insertion code picks a random key, sets gen to be 1 + the last
+// generation number inserted for that key, and sets hash to Hash(key,gen).
+//
+// At the beginning of a read, we snapshot the last inserted
+// generation number for each key.  We then iterate, including random
+// calls to Next() and Seek().  For every key we encounter, we
+// check that it is either expected given the initial snapshot or has
+// been concurrently added since the iterator started.
+class ConcurrentTest {
+ private:
+  static const uint32_t K = 4;
+
+  static uint64_t key(Key key) { return (key >> 40); }
+  static uint64_t gen(Key key) { return (key >> 8) & 0xffffffffu; }
+  static uint64_t hash(Key key) { return key & 0xff; }
+
+  static uint64_t HashNumbers(uint64_t k, uint64_t g) {
+    uint64_t data[2] = { k, g };
+    return Hash(reinterpret_cast<char*>(data), sizeof(data), 0);
+  }
+
+  static Key MakeKey(uint64_t k, uint64_t g) {
+    assert(sizeof(Key) == sizeof(uint64_t));
+    assert(k <= K);  // We sometimes pass K to seek to the end of the skiplist
+    assert(g <= 0xffffffffu);
+    return ((k << 40) | (g << 8) | (HashNumbers(k, g) & 0xff));
+  }
+
+  static bool IsValidKey(Key k) {
+    return hash(k) == (HashNumbers(key(k), gen(k)) & 0xff);
+  }
+
+  static Key RandomTarget(Random* rnd) {
+    switch (rnd->Next() % 10) {
+      case 0:
+        // Seek to beginning
+        return MakeKey(0, 0);
+      case 1:
+        // Seek to end
+        return MakeKey(K, 0);
+      default:
+        // Seek to middle
+        return MakeKey(rnd->Next() % K, 0);
+    }
+  }
+
+  // Per-key generation
+  struct State {
+    port::AtomicPointer generation[K];
+    void Set(int k, intptr_t v) {
+      generation[k].Release_Store(reinterpret_cast<void*>(v));
+    }
+    intptr_t Get(int k) {
+      return reinterpret_cast<intptr_t>(generation[k].Acquire_Load());
+    }
+
+    State() {
+      for (unsigned int k = 0; k < K; k++) {
+        Set(k, 0);
+      }
+    }
+  };
+
+  // Current state of the test
+  State current_;
+
+  Arena arena_;
+
+  // SkipList is not protected by mu_.  We just use a single writer
+  // thread to modify it.
+  SkipList<Key, TestComparator> list_;
+
+ public:
+  ConcurrentTest() : list_(TestComparator(), &arena_) {}
+
+  // REQUIRES: External synchronization
+  void WriteStep(Random* rnd) {
+    const uint32_t k = rnd->Next() % K;
+    const intptr_t g = current_.Get(k) + 1;
+    const Key key = MakeKey(k, g);
+    list_.Insert(key);
+    current_.Set(k, g);
+  }
+
+  void ReadStep(Random* rnd) {
+    // Remember the initial committed state of the skiplist.
+    State initial_state;
+    for (unsigned int k = 0; k < K; k++) {
+      initial_state.Set(k, current_.Get(k));
+    }
+
+    Key pos = RandomTarget(rnd);
+    SkipList<Key, TestComparator>::Iterator iter(&list_);
+    iter.Seek(pos);
+    while (true) {
+      Key current;
+      if (!iter.Valid()) {
+        current = MakeKey(K, 0);
+      } else {
+        current = iter.key();
+        ASSERT_TRUE(IsValidKey(current)) << current;
+      }
+      ASSERT_LE(pos, current) << "should not go backwards";
+
+      // Verify that everything in [pos,current) was not present in
+      // initial_state.
+      while (pos < current) {
+        ASSERT_LT(key(pos), K) << pos;
+
+        // Note that generation 0 is never inserted, so it is ok if
+        // <*,0,*> is missing.
+        ASSERT_TRUE((gen(pos) == 0U) ||
+                    (gen(pos) > (uint64_t)initial_state.Get(key(pos)))
+                    ) << "key: " << key(pos)
+                      << "; gen: " << gen(pos)
+                      << "; initgen: "
+                      << initial_state.Get(key(pos));
+
+        // Advance to next key in the valid key space
+        if (key(pos) < key(current)) {
+          pos = MakeKey(key(pos) + 1, 0);
+        } else {
+          pos = MakeKey(key(pos), gen(pos) + 1);
+        }
+      }
+
+      if (!iter.Valid()) {
+        break;
+      }
+
+      if (rnd->Next() % 2) {
+        iter.Next();
+        pos = MakeKey(key(pos), gen(pos) + 1);
+      } else {
+        Key new_target = RandomTarget(rnd);
+        if (new_target > pos) {
+          pos = new_target;
+          iter.Seek(new_target);
+        }
+      }
+    }
+  }
+};
+const uint32_t ConcurrentTest::K;
+
+// Simple test that does single-threaded testing of the ConcurrentTest
+// scaffolding.
+TEST(SkipTest, ConcurrentWithoutThreads) {
+  ConcurrentTest test;
+  Random rnd(test::RandomSeed());
+  for (int i = 0; i < 10000; i++) {
+    test.ReadStep(&rnd);
+    test.WriteStep(&rnd);
+  }
+}
+
+class TestState {
+ public:
+  ConcurrentTest t_;
+  int seed_;
+  port::AtomicPointer quit_flag_;
+
+  enum ReaderState {
+    STARTING,
+    RUNNING,
+    DONE
+  };
+
+  explicit TestState(int s)
+      : seed_(s),
+        quit_flag_(nullptr),
+        state_(STARTING),
+        state_cv_(&mu_) {}
+
+  void Wait(ReaderState s) {
+    mu_.Lock();
+    while (state_ != s) {
+      state_cv_.Wait();
+    }
+    mu_.Unlock();
+  }
+
+  void Change(ReaderState s) {
+    mu_.Lock();
+    state_ = s;
+    state_cv_.Signal();
+    mu_.Unlock();
+  }
+
+ private:
+  port::Mutex mu_;
+  ReaderState state_;
+  port::CondVar state_cv_;
+};
+
+static void ConcurrentReader(void* arg) {
+  TestState* state = reinterpret_cast<TestState*>(arg);
+  Random rnd(state->seed_);
+  int64_t reads = 0;
+  state->Change(TestState::RUNNING);
+  while (!state->quit_flag_.Acquire_Load()) {
+    state->t_.ReadStep(&rnd);
+    ++reads;
+  }
+  state->Change(TestState::DONE);
+}
+
+static void RunConcurrent(int run) {
+  const int seed = test::RandomSeed() + (run * 100);
+  Random rnd(seed);
+  const int N = 1000;
+  const int kSize = 1000;
+  for (int i = 0; i < N; i++) {
+    if ((i % 100) == 0) {
+      fprintf(stderr, "Run %d of %d\n", i, N);
+    }
+    TestState state(seed + 1);
+    Env::Default()->Schedule(ConcurrentReader, &state);
+    state.Wait(TestState::RUNNING);
+    for (int i = 0; i < kSize; i++) {
+      state.t_.WriteStep(&rnd);
+    }
+    state.quit_flag_.Release_Store(&state);  // Any non-nullptr arg will do
+    state.Wait(TestState::DONE);
+  }
+}
+
+TEST(SkipTest, Concurrent1) { RunConcurrent(1); }
+TEST(SkipTest, Concurrent2) { RunConcurrent(2); }
+TEST(SkipTest, Concurrent3) { RunConcurrent(3); }
+TEST(SkipTest, Concurrent4) { RunConcurrent(4); }
+TEST(SkipTest, Concurrent5) { RunConcurrent(5); }
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
--- a/db/snapshot.h
+++ b/db/snapshot.h
@@ -0,0 +1,86 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "rocksdb/db.h"
+
+namespace rocksdb {
+
+class SnapshotList;
+
+// Snapshots are kept in a doubly-linked list in the DB.
+// Each SnapshotImpl corresponds to a particular sequence number.
+class SnapshotImpl : public Snapshot {
+ public:
+  SequenceNumber number_;  // const after creation
+
+ private:
+  friend class SnapshotList;
+
+  // SnapshotImpl is kept in a doubly-linked circular list
+  SnapshotImpl* prev_;
+  SnapshotImpl* next_;
+
+  SnapshotList* list_;                 // just for sanity checks
+};
+
+class SnapshotList {
+ public:
+  SnapshotList() {
+    list_.prev_ = &list_;
+    list_.next_ = &list_;
+    list_.number_ = 0xFFFFFFFFL;      // placeholder marker, for debugging
+  }
+
+  bool empty() const { return list_.next_ == &list_; }
+  SnapshotImpl* oldest() const { assert(!empty()); return list_.next_; }
+  SnapshotImpl* newest() const { assert(!empty()); return list_.prev_; }
+
+  const SnapshotImpl* New(SequenceNumber seq) {
+    SnapshotImpl* s = new SnapshotImpl;
+    s->number_ = seq;
+    s->list_ = this;
+    s->next_ = &list_;
+    s->prev_ = list_.prev_;
+    s->prev_->next_ = s;
+    s->next_->prev_ = s;
+    return s;
+  }
+
+  void Delete(const SnapshotImpl* s) {
+    assert(s->list_ == this);
+    s->prev_->next_ = s->next_;
+    s->next_->prev_ = s->prev_;
+    delete s;
+  }
+
+  // retrieve all snapshot numbers. They are sorted in ascending order.
+  void getAll(std::vector<SequenceNumber>& ret) {
+    if (empty()) return;
+    SnapshotImpl* s = &list_;
+    while (s->next_ != &list_) {
+      ret.push_back(s->next_->number_);
+      s = s ->next_;
+    }
+  }
+
+  // get the sequence number of the most recent snapshot
+  const SequenceNumber GetNewest() {
+    if (empty()) {
+      return 0;
+    }
+    return newest()->number_;
+  }
+
+ private:
+  // Dummy head of doubly-linked list of snapshots
+  SnapshotImpl list_;
+};
+
+}  // namespace rocksdb
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@@ -0,0 +1,198 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/table_cache.h"
+
+#include "db/filename.h"
+#include "db/version_edit.h"
+
+#include "rocksdb/statistics.h"
+#include "table/iterator_wrapper.h"
+#include "table/table_reader.h"
+#include "util/coding.h"
+#include "util/stop_watch.h"
+
+namespace rocksdb {
+
+static void DeleteEntry(const Slice& key, void* value) {
+  TableReader* table_reader = reinterpret_cast<TableReader*>(value);
+  delete table_reader;
+}
+
+static void UnrefEntry(void* arg1, void* arg2) {
+  Cache* cache = reinterpret_cast<Cache*>(arg1);
+  Cache::Handle* h = reinterpret_cast<Cache::Handle*>(arg2);
+  cache->Release(h);
+}
+
+static Slice GetSliceForFileNumber(uint64_t* file_number) {
+  return Slice(reinterpret_cast<const char*>(file_number),
+               sizeof(*file_number));
+}
+
+TableCache::TableCache(const std::string& dbname, const Options* options,
+                       const EnvOptions& storage_options, Cache* const cache)
+    : env_(options->env),
+      dbname_(dbname),
+      options_(options),
+      storage_options_(storage_options),
+      cache_(cache) {}
+
+TableCache::~TableCache() {
+}
+
+TableReader* TableCache::GetTableReaderFromHandle(Cache::Handle* handle) {
+  return reinterpret_cast<TableReader*>(cache_->Value(handle));
+}
+
+void TableCache::ReleaseHandle(Cache::Handle* handle) {
+  cache_->Release(handle);
+}
+
+Status TableCache::FindTable(const EnvOptions& toptions,
+                             const InternalKeyComparator& internal_comparator,
+                             uint64_t file_number, uint64_t file_size,
+                             Cache::Handle** handle, bool* table_io,
+                             const bool no_io) {
+  Status s;
+  Slice key = GetSliceForFileNumber(&file_number);
+  *handle = cache_->Lookup(key);
+  if (*handle == nullptr) {
+    if (no_io) { // Dont do IO and return a not-found status
+      return Status::Incomplete("Table not found in table_cache, no_io is set");
+    }
+    if (table_io != nullptr) {
+      *table_io = true;    // we had to do IO from storage
+    }
+    std::string fname = TableFileName(dbname_, file_number);
+    unique_ptr<RandomAccessFile> file;
+    unique_ptr<TableReader> table_reader;
+    s = env_->NewRandomAccessFile(fname, &file, toptions);
+    RecordTick(options_->statistics.get(), NO_FILE_OPENS);
+    if (s.ok()) {
+      if (options_->advise_random_on_open) {
+        file->Hint(RandomAccessFile::RANDOM);
+      }
+      StopWatch sw(env_, options_->statistics.get(), TABLE_OPEN_IO_MICROS);
+      s = options_->table_factory->NewTableReader(
+          *options_, toptions, internal_comparator, std::move(file), file_size,
+          &table_reader);
+    }
+
+    if (!s.ok()) {
+      assert(table_reader == nullptr);
+      RecordTick(options_->statistics.get(), NO_FILE_ERRORS);
+      // We do not cache error results so that if the error is transient,
+      // or somebody repairs the file, we recover automatically.
+    } else {
+      assert(file.get() == nullptr);
+      *handle = cache_->Insert(key, table_reader.release(), 1, &DeleteEntry);
+    }
+  }
+  return s;
+}
+
+Iterator* TableCache::NewIterator(const ReadOptions& options,
+                                  const EnvOptions& toptions,
+                                  const InternalKeyComparator& icomparator,
+                                  const FileMetaData& file_meta,
+                                  TableReader** table_reader_ptr,
+                                  bool for_compaction, Arena* arena) {
+  if (table_reader_ptr != nullptr) {
+    *table_reader_ptr = nullptr;
+  }
+  TableReader* table_reader = file_meta.table_reader;
+  Cache::Handle* handle = nullptr;
+  Status s;
+  if (table_reader == nullptr) {
+    s = FindTable(toptions, icomparator, file_meta.number, file_meta.file_size,
+                  &handle, nullptr, options.read_tier == kBlockCacheTier);
+    if (!s.ok()) {
+      return NewErrorIterator(s, arena);
+    }
+    table_reader = GetTableReaderFromHandle(handle);
+  }
+
+  Iterator* result = table_reader->NewIterator(options, arena);
+  if (handle != nullptr) {
+    result->RegisterCleanup(&UnrefEntry, cache_, handle);
+  }
+  if (table_reader_ptr != nullptr) {
+    *table_reader_ptr = table_reader;
+  }
+
+  if (for_compaction) {
+    table_reader->SetupForCompaction();
+  }
+
+  return result;
+}
+
+Status TableCache::Get(const ReadOptions& options,
+                       const InternalKeyComparator& internal_comparator,
+                       const FileMetaData& file_meta, const Slice& k, void* arg,
+                       bool (*saver)(void*, const ParsedInternalKey&,
+                                     const Slice&, bool),
+                       bool* table_io, void (*mark_key_may_exist)(void*)) {
+  TableReader* t = file_meta.table_reader;
+  Status s;
+  Cache::Handle* handle = nullptr;
+  if (!t) {
+    s = FindTable(storage_options_, internal_comparator, file_meta.number,
+                  file_meta.file_size, &handle, table_io,
+                  options.read_tier == kBlockCacheTier);
+    if (s.ok()) {
+      t = GetTableReaderFromHandle(handle);
+    }
+  }
+  if (s.ok()) {
+    s = t->Get(options, k, arg, saver, mark_key_may_exist);
+    if (handle != nullptr) {
+      ReleaseHandle(handle);
+    }
+  } else if (options.read_tier && s.IsIncomplete()) {
+    // Couldnt find Table in cache but treat as kFound if no_io set
+    (*mark_key_may_exist)(arg);
+    return Status::OK();
+  }
+  return s;
+}
+Status TableCache::GetTableProperties(
+    const EnvOptions& toptions,
+    const InternalKeyComparator& internal_comparator,
+    const FileMetaData& file_meta,
+    std::shared_ptr<const TableProperties>* properties, bool no_io) {
+  Status s;
+  auto table_reader = file_meta.table_reader;
+  // table already been pre-loaded?
+  if (table_reader) {
+    *properties = table_reader->GetTableProperties();
+
+    return s;
+  }
+
+  bool table_io;
+  Cache::Handle* table_handle = nullptr;
+  s = FindTable(toptions, internal_comparator, file_meta.number,
+                file_meta.file_size, &table_handle, &table_io, no_io);
+  if (!s.ok()) {
+    return s;
+  }
+  assert(table_handle);
+  auto table = GetTableReaderFromHandle(table_handle);
+  *properties = table->GetTableProperties();
+  ReleaseHandle(table_handle);
+  return s;
+}
+
+void TableCache::Evict(Cache* cache, uint64_t file_number) {
+  cache->Erase(GetSliceForFileNumber(&file_number));
+}
+
+}  // namespace rocksdb
--- a/db/table_cache.h
+++ b/db/table_cache.h
@@ -0,0 +1,95 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Thread-safe (provides internal synchronization)
+
+#pragma once
+#include <string>
+#include <stdint.h>
+
+#include "db/dbformat.h"
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/env.h"
+#include "rocksdb/table.h"
+#include "table/table_reader.h"
+
+namespace rocksdb {
+
+class Env;
+class Arena;
+struct FileMetaData;
+
+// TODO(sdong): try to come up with a better API to pass the file information
+//              other than simply passing FileMetaData.
+class TableCache {
+ public:
+  TableCache(const std::string& dbname, const Options* options,
+             const EnvOptions& storage_options, Cache* cache);
+  ~TableCache();
+
+  // Return an iterator for the specified file number (the corresponding
+  // file length must be exactly "file_size" bytes).  If "tableptr" is
+  // non-nullptr, also sets "*tableptr" to point to the Table object
+  // underlying the returned iterator, or nullptr if no Table object underlies
+  // the returned iterator.  The returned "*tableptr" object is owned by
+  // the cache and should not be deleted, and is valid for as long as the
+  // returned iterator is live.
+  Iterator* NewIterator(const ReadOptions& options, const EnvOptions& toptions,
+                        const InternalKeyComparator& internal_comparator,
+                        const FileMetaData& file_meta,
+                        TableReader** table_reader_ptr = nullptr,
+                        bool for_compaction = false, Arena* arena = nullptr);
+
+  // If a seek to internal key "k" in specified file finds an entry,
+  // call (*handle_result)(arg, found_key, found_value) repeatedly until
+  // it returns false.
+  Status Get(const ReadOptions& options,
+             const InternalKeyComparator& internal_comparator,
+             const FileMetaData& file_meta, const Slice& k, void* arg,
+             bool (*handle_result)(void*, const ParsedInternalKey&,
+                                   const Slice&, bool),
+             bool* table_io, void (*mark_key_may_exist)(void*) = nullptr);
+
+  // Evict any entry for the specified file number
+  static void Evict(Cache* cache, uint64_t file_number);
+
+  // Find table reader
+  Status FindTable(const EnvOptions& toptions,
+                   const InternalKeyComparator& internal_comparator,
+                   uint64_t file_number, uint64_t file_size, Cache::Handle**,
+                   bool* table_io = nullptr, const bool no_io = false);
+
+  // Get TableReader from a cache handle.
+  TableReader* GetTableReaderFromHandle(Cache::Handle* handle);
+
+  // Get the table properties of a given table.
+  // @no_io: indicates if we should load table to the cache if it is not present
+  //         in table cache yet.
+  // @returns: `properties` will be reset on success. Please note that we will
+  //            return Status::Incomplete() if table is not present in cache and
+  //            we set `no_io` to be true.
+  Status GetTableProperties(const EnvOptions& toptions,
+                            const InternalKeyComparator& internal_comparator,
+                            const FileMetaData& file_meta,
+                            std::shared_ptr<const TableProperties>* properties,
+                            bool no_io = false);
+
+  // Release the handle from a cache
+  void ReleaseHandle(Cache::Handle* handle);
+
+ private:
+  Env* const env_;
+  const std::string dbname_;
+  const Options* options_;
+  const EnvOptions& storage_options_;
+  Cache* const cache_;
+};
+
+}  // namespace rocksdb
--- a/db/table_properties_collector.cc
+++ b/db/table_properties_collector.cc
@@ -0,0 +1,83 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "db/table_properties_collector.h"
+
+#include "db/dbformat.h"
+#include "util/coding.h"
+
+namespace rocksdb {
+
+Status InternalKeyPropertiesCollector::Add(
+    const Slice& key, const Slice& value) {
+  ParsedInternalKey ikey;
+  if (!ParseInternalKey(key, &ikey)) {
+    return Status::InvalidArgument("Invalid internal key");
+  }
+
+  if (ikey.type == ValueType::kTypeDeletion) {
+    ++deleted_keys_;
+  }
+
+  return Status::OK();
+}
+
+Status InternalKeyPropertiesCollector::Finish(
+    UserCollectedProperties* properties) {
+  assert(properties);
+  assert(properties->find(
+        InternalKeyTablePropertiesNames::kDeletedKeys) == properties->end());
+  std::string val;
+
+  PutVarint64(&val, deleted_keys_);
+  properties->insert({ InternalKeyTablePropertiesNames::kDeletedKeys, val });
+
+  return Status::OK();
+}
+
+UserCollectedProperties
+InternalKeyPropertiesCollector::GetReadableProperties() const {
+  return {
+    { "kDeletedKeys", std::to_string(deleted_keys_) }
+  };
+}
+
+
+Status UserKeyTablePropertiesCollector::Add(
+    const Slice& key, const Slice& value) {
+  ParsedInternalKey ikey;
+  if (!ParseInternalKey(key, &ikey)) {
+    return Status::InvalidArgument("Invalid internal key");
+  }
+
+  return collector_->Add(ikey.user_key, value);
+}
+
+Status UserKeyTablePropertiesCollector::Finish(
+    UserCollectedProperties* properties) {
+  return collector_->Finish(properties);
+}
+
+UserCollectedProperties
+UserKeyTablePropertiesCollector::GetReadableProperties() const {
+  return collector_->GetReadableProperties();
+}
+
+
+const std::string InternalKeyTablePropertiesNames::kDeletedKeys
+  = "rocksdb.deleted.keys";
+
+uint64_t GetDeletedKeys(
+    const UserCollectedProperties& props) {
+  auto pos = props.find(InternalKeyTablePropertiesNames::kDeletedKeys);
+  if (pos == props.end()) {
+    return 0;
+  }
+  Slice raw = pos->second;
+  uint64_t val = 0;
+  return GetVarint64(&raw, &val) ? val : 0;
+}
+
+}  // namespace rocksdb
--- a/db/table_properties_collector.h
+++ b/db/table_properties_collector.h
@@ -0,0 +1,95 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file defines a collection of statistics collectors.
+#pragma once
+
+#include "rocksdb/table_properties.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace rocksdb {
+
+struct InternalKeyTablePropertiesNames {
+  static const std::string kDeletedKeys;
+};
+
+// Collecting the statistics for internal keys. Visible only by internal
+// rocksdb modules.
+class InternalKeyPropertiesCollector : public TablePropertiesCollector {
+ public:
+  virtual Status Add(const Slice& key, const Slice& value) override;
+
+  virtual Status Finish(UserCollectedProperties* properties) override;
+
+  virtual const char* Name() const override {
+    return "InternalKeyPropertiesCollector";
+  }
+
+  UserCollectedProperties GetReadableProperties() const override;
+
+ private:
+  uint64_t deleted_keys_ = 0;
+};
+
+class InternalKeyPropertiesCollectorFactory
+    : public TablePropertiesCollectorFactory {
+ public:
+  virtual TablePropertiesCollector* CreateTablePropertiesCollector() {
+    return new InternalKeyPropertiesCollector();
+  }
+
+  virtual const char* Name() const override {
+    return "InternalKeyPropertiesCollectorFactory";
+  }
+};
+
+// When rocksdb creates a new table, it will encode all "user keys" into
+// "internal keys", which contains meta information of a given entry.
+//
+// This class extracts user key from the encoded internal key when Add() is
+// invoked.
+class UserKeyTablePropertiesCollector : public TablePropertiesCollector {
+ public:
+  // transfer of ownership
+  explicit UserKeyTablePropertiesCollector(TablePropertiesCollector* collector)
+      : collector_(collector) {}
+
+  virtual ~UserKeyTablePropertiesCollector() {}
+
+  virtual Status Add(const Slice& key, const Slice& value) override;
+
+  virtual Status Finish(UserCollectedProperties* properties) override;
+
+  virtual const char* Name() const override { return collector_->Name(); }
+
+  UserCollectedProperties GetReadableProperties() const override;
+
+ protected:
+  std::unique_ptr<TablePropertiesCollector> collector_;
+};
+
+class UserKeyTablePropertiesCollectorFactory
+    : public TablePropertiesCollectorFactory {
+ public:
+  explicit UserKeyTablePropertiesCollectorFactory(
+      std::shared_ptr<TablePropertiesCollectorFactory> user_collector_factory)
+      : user_collector_factory_(user_collector_factory) {}
+  virtual TablePropertiesCollector* CreateTablePropertiesCollector() {
+    return new UserKeyTablePropertiesCollector(
+        user_collector_factory_->CreateTablePropertiesCollector());
+  }
+
+  virtual const char* Name() const override {
+    return user_collector_factory_->Name();
+  }
+
+ private:
+  std::shared_ptr<TablePropertiesCollectorFactory> user_collector_factory_;
+};
+
+}  // namespace rocksdb
--- a/db/table_properties_collector_test.cc
+++ b/db/table_properties_collector_test.cc
@@ -0,0 +1,313 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include <map>
+#include <memory>
+#include <string>
+
+#include "db/db_impl.h"
+#include "db/dbformat.h"
+#include "db/table_properties_collector.h"
+#include "rocksdb/table.h"
+#include "table/block_based_table_factory.h"
+#include "table/meta_blocks.h"
+#include "table/plain_table_factory.h"
+#include "table/table_builder.h"
+#include "util/coding.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+class TablePropertiesTest {
+};
+
+// TODO(kailiu) the following classes should be moved to some more general
+// places, so that other tests can also make use of them.
+// `FakeWritableFile` and `FakeRandomeAccessFile` bypass the real file system
+// and therefore enable us to quickly setup the tests.
+class FakeWritableFile : public WritableFile {
+ public:
+  ~FakeWritableFile() { }
+
+  const std::string& contents() const { return contents_; }
+
+  virtual Status Close() { return Status::OK(); }
+  virtual Status Flush() { return Status::OK(); }
+  virtual Status Sync() { return Status::OK(); }
+
+  virtual Status Append(const Slice& data) {
+    contents_.append(data.data(), data.size());
+    return Status::OK();
+  }
+
+ private:
+  std::string contents_;
+};
+
+
+class FakeRandomeAccessFile : public RandomAccessFile {
+ public:
+  explicit FakeRandomeAccessFile(const Slice& contents)
+      : contents_(contents.data(), contents.size()) {
+  }
+
+  virtual ~FakeRandomeAccessFile() { }
+
+  uint64_t Size() const { return contents_.size(); }
+
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                       char* scratch) const {
+    if (offset > contents_.size()) {
+      return Status::InvalidArgument("invalid Read offset");
+    }
+    if (offset + n > contents_.size()) {
+      n = contents_.size() - offset;
+    }
+    memcpy(scratch, &contents_[offset], n);
+    *result = Slice(scratch, n);
+    return Status::OK();
+  }
+
+ private:
+  std::string contents_;
+};
+
+
+class DumbLogger : public Logger {
+ public:
+  virtual void Logv(const char* format, va_list ap) { }
+  virtual size_t GetLogFileSize() const { return 0; }
+};
+
+// Utilities test functions
+namespace {
+void MakeBuilder(const Options& options,
+                 const InternalKeyComparator& internal_comparator,
+                 std::unique_ptr<FakeWritableFile>* writable,
+                 std::unique_ptr<TableBuilder>* builder) {
+  writable->reset(new FakeWritableFile);
+  builder->reset(options.table_factory->NewTableBuilder(
+      options, internal_comparator, writable->get(), options.compression));
+}
+}  // namespace
+
+// Collects keys that starts with "A" in a table.
+class RegularKeysStartWithA: public TablePropertiesCollector {
+ public:
+   const char* Name() const { return "RegularKeysStartWithA"; }
+
+   Status Finish(UserCollectedProperties* properties) {
+     std::string encoded;
+     PutVarint32(&encoded, count_);
+     *properties = UserCollectedProperties {
+       { "TablePropertiesTest", "Rocksdb" },
+       { "Count", encoded }
+     };
+     return Status::OK();
+   }
+
+   Status Add(const Slice& user_key, const Slice& value) {
+     // simply asssume all user keys are not empty.
+     if (user_key.data()[0] == 'A') {
+       ++count_;
+     }
+     return Status::OK();
+   }
+
+  virtual UserCollectedProperties GetReadableProperties() const {
+    return UserCollectedProperties{};
+  }
+
+ private:
+  uint32_t count_ = 0;
+};
+
+class RegularKeysStartWithAFactory : public TablePropertiesCollectorFactory {
+ public:
+  virtual TablePropertiesCollector* CreateTablePropertiesCollector() {
+    return new RegularKeysStartWithA();
+  }
+  const char* Name() const { return "RegularKeysStartWithA"; }
+};
+
+extern uint64_t kBlockBasedTableMagicNumber;
+extern uint64_t kPlainTableMagicNumber;
+namespace {
+void TestCustomizedTablePropertiesCollector(
+    uint64_t magic_number, bool encode_as_internal, const Options& options,
+    const InternalKeyComparator& internal_comparator) {
+  // make sure the entries will be inserted with order.
+  std::map<std::string, std::string> kvs = {
+    {"About   ", "val5"},  // starts with 'A'
+    {"Abstract", "val2"},  // starts with 'A'
+    {"Around  ", "val7"},  // starts with 'A'
+    {"Beyond  ", "val3"},
+    {"Builder ", "val1"},
+    {"Cancel  ", "val4"},
+    {"Find    ", "val6"},
+  };
+
+  // -- Step 1: build table
+  std::unique_ptr<TableBuilder> builder;
+  std::unique_ptr<FakeWritableFile> writable;
+  MakeBuilder(options, internal_comparator, &writable, &builder);
+
+  for (const auto& kv : kvs) {
+    if (encode_as_internal) {
+      InternalKey ikey(kv.first, 0, ValueType::kTypeValue);
+      builder->Add(ikey.Encode(), kv.second);
+    } else {
+      builder->Add(kv.first, kv.second);
+    }
+  }
+  ASSERT_OK(builder->Finish());
+
+  // -- Step 2: Read properties
+  FakeRandomeAccessFile readable(writable->contents());
+  TableProperties* props;
+  Status s = ReadTableProperties(
+      &readable,
+      writable->contents().size(),
+      magic_number,
+      Env::Default(),
+      nullptr,
+      &props
+  );
+  std::unique_ptr<TableProperties> props_guard(props);
+  ASSERT_OK(s);
+
+  auto user_collected = props->user_collected_properties;
+
+  ASSERT_EQ("Rocksdb", user_collected.at("TablePropertiesTest"));
+
+  uint32_t starts_with_A = 0;
+  Slice key(user_collected.at("Count"));
+  ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
+  ASSERT_EQ(3u, starts_with_A);
+}
+}  // namespace
+
+TEST(TablePropertiesTest, CustomizedTablePropertiesCollector) {
+  // Test properties collectors with internal keys or regular keys
+  // for block based table
+  for (bool encode_as_internal : { true, false }) {
+    Options options;
+    std::shared_ptr<TablePropertiesCollectorFactory> collector_factory(
+        new RegularKeysStartWithAFactory());
+    if (encode_as_internal) {
+      options.table_properties_collector_factories.emplace_back(
+          new UserKeyTablePropertiesCollectorFactory(collector_factory));
+    } else {
+      options.table_properties_collector_factories.resize(1);
+      options.table_properties_collector_factories[0] = collector_factory;
+    }
+    test::PlainInternalKeyComparator ikc(options.comparator);
+    TestCustomizedTablePropertiesCollector(kBlockBasedTableMagicNumber,
+                                           encode_as_internal, options, ikc);
+  }
+
+  // test plain table
+  Options options;
+  options.table_properties_collector_factories.emplace_back(
+      new RegularKeysStartWithAFactory());
+  options.table_factory = std::make_shared<PlainTableFactory>(8, 8, 0);
+  test::PlainInternalKeyComparator ikc(options.comparator);
+  TestCustomizedTablePropertiesCollector(kPlainTableMagicNumber, true, options,
+                                         ikc);
+}
+
+namespace {
+void TestInternalKeyPropertiesCollector(
+    uint64_t magic_number,
+    bool sanitized,
+    std::shared_ptr<TableFactory> table_factory) {
+  InternalKey keys[] = {
+    InternalKey("A       ", 0, ValueType::kTypeValue),
+    InternalKey("B       ", 0, ValueType::kTypeValue),
+    InternalKey("C       ", 0, ValueType::kTypeValue),
+    InternalKey("W       ", 0, ValueType::kTypeDeletion),
+    InternalKey("X       ", 0, ValueType::kTypeDeletion),
+    InternalKey("Y       ", 0, ValueType::kTypeDeletion),
+    InternalKey("Z       ", 0, ValueType::kTypeDeletion),
+  };
+
+  std::unique_ptr<TableBuilder> builder;
+  std::unique_ptr<FakeWritableFile> writable;
+  Options options;
+  test::PlainInternalKeyComparator pikc(options.comparator);
+
+  options.table_factory = table_factory;
+  if (sanitized) {
+    options.table_properties_collector_factories.emplace_back(
+        new RegularKeysStartWithAFactory());
+    // with sanitization, even regular properties collector will be able to
+    // handle internal keys.
+    auto comparator = options.comparator;
+    // HACK: Set options.info_log to avoid writing log in
+    // SanitizeOptions().
+    options.info_log = std::make_shared<DumbLogger>();
+    options = SanitizeOptions("db",            // just a place holder
+                              &pikc, nullptr,  // don't care filter policy
+                              options);
+    options.comparator = comparator;
+  } else {
+    options.table_properties_collector_factories = {
+        std::make_shared<InternalKeyPropertiesCollectorFactory>()};
+  }
+
+  for (int iter = 0; iter < 2; ++iter) {
+    MakeBuilder(options, pikc, &writable, &builder);
+    for (const auto& k : keys) {
+      builder->Add(k.Encode(), "val");
+    }
+
+    ASSERT_OK(builder->Finish());
+
+    FakeRandomeAccessFile readable(writable->contents());
+    TableProperties* props;
+    Status s =
+        ReadTableProperties(&readable, writable->contents().size(),
+                            magic_number, Env::Default(), nullptr, &props);
+    ASSERT_OK(s);
+
+    std::unique_ptr<TableProperties> props_guard(props);
+    auto user_collected = props->user_collected_properties;
+    uint64_t deleted = GetDeletedKeys(user_collected);
+    ASSERT_EQ(4u, deleted);
+
+    if (sanitized) {
+      uint32_t starts_with_A = 0;
+      Slice key(user_collected.at("Count"));
+      ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
+      ASSERT_EQ(1u, starts_with_A);
+    }
+  }
+}
+}  // namespace
+
+TEST(TablePropertiesTest, InternalKeyPropertiesCollector) {
+  TestInternalKeyPropertiesCollector(
+      kBlockBasedTableMagicNumber,
+      true /* sanitize */,
+      std::make_shared<BlockBasedTableFactory>()
+  );
+  TestInternalKeyPropertiesCollector(
+      kBlockBasedTableMagicNumber,
+      true /* not sanitize */,
+      std::make_shared<BlockBasedTableFactory>()
+  );
+  TestInternalKeyPropertiesCollector(
+      kPlainTableMagicNumber,
+      false /* not sanitize */,
+      std::make_shared<PlainTableFactory>(8, 8, 0)
+  );
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
--- a/db/tailing_iter.cc
+++ b/db/tailing_iter.cc
@@ -0,0 +1,221 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+#include "db/tailing_iter.h"
+
+#include <string>
+#include <utility>
+#include <vector>
+#include "db/db_impl.h"
+#include "db/db_iter.h"
+#include "db/column_family.h"
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "table/merger.h"
+
+namespace rocksdb {
+
+TailingIterator::TailingIterator(Env* const env, DBImpl* db,
+    const ReadOptions& read_options, ColumnFamilyData* cfd)
+    : env_(env),
+      db_(db),
+      read_options_(read_options),
+      cfd_(cfd),
+      super_version_(nullptr),
+      current_(nullptr),
+      status_(Status::InvalidArgument("Seek() not called on this iterator")) {}
+
+TailingIterator::~TailingIterator() {
+  Cleanup();
+}
+
+bool TailingIterator::Valid() const {
+  return current_ != nullptr;
+}
+
+void TailingIterator::SeekToFirst() {
+  if (!IsCurrentVersion()) {
+    CreateIterators();
+  }
+
+  mutable_->SeekToFirst();
+  immutable_->SeekToFirst();
+  UpdateCurrent();
+}
+
+void TailingIterator::Seek(const Slice& target) {
+  if (!IsCurrentVersion()) {
+    CreateIterators();
+  }
+
+  mutable_->Seek(target);
+
+  // We maintain the interval (prev_key_, immutable_->key()] such that there
+  // are no records with keys within that range in immutable_ other than
+  // immutable_->key(). Since immutable_ can't change in this version, we don't
+  // need to do a seek if 'target' belongs to that interval (i.e. immutable_ is
+  // already at the correct position)!
+  //
+  // If prefix seek is used and immutable_ is not valid, seek if target has a
+  // different prefix than prev_key.
+  //
+  // prev_key_ is updated by Next(). SeekImmutable() sets prev_key_ to
+  // 'target' -- in this case, prev_key_ is included in the interval, so
+  // prev_inclusive_ has to be set.
+
+  const Comparator* cmp = cfd_->user_comparator();
+  if (!is_prev_set_ || cmp->Compare(prev_key_, target) >= !is_prev_inclusive_ ||
+      (immutable_->Valid() && cmp->Compare(target, immutable_->key()) > 0) ||
+      (cfd_->options()->prefix_extractor != nullptr && !IsSamePrefix(target))) {
+    SeekImmutable(target);
+  }
+
+  UpdateCurrent();
+}
+
+void TailingIterator::Next() {
+  assert(Valid());
+
+  if (!IsCurrentVersion()) {
+    // save the current key, create new iterators and then seek
+    std::string current_key = key().ToString();
+    Slice key_slice(current_key.data(), current_key.size());
+
+    CreateIterators();
+    Seek(key_slice);
+
+    if (!Valid() || key().compare(key_slice) != 0) {
+      // record with current_key no longer exists
+      return;
+    }
+
+  } else if (current_ == immutable_.get()) {
+    // immutable iterator is advanced -- update prev_key_
+    prev_key_ = key().ToString();
+    is_prev_inclusive_ = false;
+    is_prev_set_ = true;
+  }
+
+  current_->Next();
+  UpdateCurrent();
+}
+
+Slice TailingIterator::key() const {
+  assert(Valid());
+  return current_->key();
+}
+
+Slice TailingIterator::value() const {
+  assert(Valid());
+  return current_->value();
+}
+
+Status TailingIterator::status() const {
+  if (!status_.ok()) {
+    return status_;
+  } else if (!mutable_->status().ok()) {
+    return mutable_->status();
+  } else {
+    return immutable_->status();
+  }
+}
+
+void TailingIterator::Prev() {
+  status_ = Status::NotSupported("This iterator doesn't support Prev()");
+}
+
+void TailingIterator::SeekToLast() {
+  status_ = Status::NotSupported("This iterator doesn't support SeekToLast()");
+}
+
+void TailingIterator::Cleanup() {
+  // Release old super version if necessary
+  mutable_.reset();
+  immutable_.reset();
+  if (super_version_ != nullptr && super_version_->Unref()) {
+    DBImpl::DeletionState deletion_state;
+    db_->mutex_.Lock();
+    super_version_->Cleanup();
+    db_->FindObsoleteFiles(deletion_state, false, true);
+    db_->mutex_.Unlock();
+    delete super_version_;
+    if (deletion_state.HaveSomethingToDelete()) {
+      db_->PurgeObsoleteFiles(deletion_state);
+    }
+  }
+}
+
+void TailingIterator::CreateIterators() {
+  Cleanup();
+  super_version_= cfd_->GetReferencedSuperVersion(&(db_->mutex_));
+
+  Iterator* mutable_iter = super_version_->mem->NewIterator(read_options_);
+  // create a DBIter that only uses memtable content; see NewIterator()
+  mutable_.reset(
+      NewDBIterator(env_, *cfd_->options(), cfd_->user_comparator(),
+                    mutable_iter, kMaxSequenceNumber));
+
+  std::vector<Iterator*> list;
+  super_version_->imm->AddIterators(read_options_, &list);
+  super_version_->current->AddIterators(
+      read_options_, *cfd_->soptions(), &list);
+  Iterator* immutable_iter =
+      NewMergingIterator(&cfd_->internal_comparator(), &list[0], list.size());
+
+  // create a DBIter that only uses memtable content; see NewIterator()
+  immutable_.reset(
+      NewDBIterator(env_, *cfd_->options(), cfd_->user_comparator(),
+                    immutable_iter, kMaxSequenceNumber));
+
+  current_ = nullptr;
+  is_prev_set_ = false;
+}
+
+void TailingIterator::UpdateCurrent() {
+  current_ = nullptr;
+
+  if (mutable_->Valid()) {
+    current_ = mutable_.get();
+  }
+  const Comparator* cmp = cfd_->user_comparator();
+  if (immutable_->Valid() &&
+      (current_ == nullptr ||
+       cmp->Compare(immutable_->key(), current_->key()) < 0)) {
+    current_ = immutable_.get();
+  }
+
+  if (!status_.ok()) {
+    // reset status that was set by Prev() or SeekToLast()
+    status_ = Status::OK();
+  }
+}
+
+bool TailingIterator::IsCurrentVersion() const {
+  return super_version_ != nullptr &&
+         super_version_->version_number == cfd_->GetSuperVersionNumber();
+}
+
+bool TailingIterator::IsSamePrefix(const Slice& target) const {
+  const SliceTransform* extractor = cfd_->options()->prefix_extractor.get();
+
+  assert(extractor);
+  assert(is_prev_set_);
+
+  return extractor->Transform(target)
+    .compare(extractor->Transform(prev_key_)) == 0;
+}
+
+void TailingIterator::SeekImmutable(const Slice& target) {
+  prev_key_ = target.ToString();
+  is_prev_inclusive_ = true;
+  is_prev_set_ = true;
+
+  immutable_->Seek(target);
+}
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
--- a/db/tailing_iter.h
+++ b/db/tailing_iter.h
@@ -0,0 +1,97 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+
+namespace rocksdb {
+
+class DBImpl;
+class Env;
+struct SuperVersion;
+class ColumnFamilyData;
+
+/**
+ * TailingIterator is a special type of iterator that doesn't use an (implicit)
+ * snapshot. In other words, it can be used to read data that was added to the
+ * db after the iterator had been created.
+ *
+ * TailingIterator is optimized for sequential reading. It doesn't support
+ * Prev() and SeekToLast() operations.
+ */
+class TailingIterator : public Iterator {
+ public:
+  TailingIterator(Env* const env, DBImpl* db, const ReadOptions& read_options,
+                  ColumnFamilyData* cfd);
+  virtual ~TailingIterator();
+
+  virtual bool Valid() const override;
+  virtual void SeekToFirst() override;
+  virtual void SeekToLast() override;
+  virtual void Seek(const Slice& target) override;
+  virtual void Next() override;
+  virtual void Prev() override;
+  virtual Slice key() const override;
+  virtual Slice value() const override;
+  virtual Status status() const override;
+
+ private:
+  void Cleanup();
+
+  Env* const env_;
+  DBImpl* const db_;
+  const ReadOptions read_options_;
+  ColumnFamilyData* const cfd_;
+  SuperVersion* super_version_;
+
+  // TailingIterator merges the contents of the two iterators below (one using
+  // mutable memtable contents only, other over SSTs and immutable memtables).
+  // See DBIter::GetTailingIteratorPair().
+  std::unique_ptr<Iterator> mutable_;
+  std::unique_ptr<Iterator> immutable_;
+
+  // points to either mutable_ or immutable_
+  Iterator* current_;
+
+  // key that precedes immutable iterator's current key
+  std::string prev_key_;
+
+  // unless prev_set is true, prev_key/prev_head is not valid and shouldn't be
+  // used; reset by createIterators()
+  bool is_prev_set_;
+
+  // prev_key_ was set by SeekImmutable(), which means that the interval of
+  // keys covered by immutable_ is [prev_key_, current], i.e. it includes the
+  // left endpoint
+  bool is_prev_inclusive_;
+
+  // internal iterator status
+  Status status_;
+
+  // check if this iterator's version matches DB's version
+  bool IsCurrentVersion() const;
+
+  // check if SeekImmutable() is needed due to target having a different prefix
+  // than prev_key_ (used when in prefix seek mode)
+  bool IsSamePrefix(const Slice& target) const;
+
+  // creates mutable_ and immutable_ iterators and updates version_number_
+  void CreateIterators();
+
+  // set current_ to be one of the iterators with the smallest key
+  void UpdateCurrent();
+
+  // seek on immutable_ and update prev_key
+  void SeekImmutable(const Slice& target);
+};
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
--- a/db/transaction_log_impl.cc
+++ b/db/transaction_log_impl.cc
@@ -0,0 +1,262 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+#include "db/transaction_log_impl.h"
+#include "db/write_batch_internal.h"
+
+namespace rocksdb {
+
+TransactionLogIteratorImpl::TransactionLogIteratorImpl(
+    const std::string& dir, const DBOptions* options,
+    const TransactionLogIterator::ReadOptions& read_options,
+    const EnvOptions& soptions, const SequenceNumber seq,
+    std::unique_ptr<VectorLogPtr> files, DBImpl const* const dbimpl)
+    : dir_(dir),
+      options_(options),
+      read_options_(read_options),
+      soptions_(soptions),
+      startingSequenceNumber_(seq),
+      files_(std::move(files)),
+      started_(false),
+      isValid_(false),
+      currentFileIndex_(0),
+      currentBatchSeq_(0),
+      currentLastSeq_(0),
+      dbimpl_(dbimpl) {
+  assert(files_ != nullptr);
+  assert(dbimpl_ != nullptr);
+
+  reporter_.env = options_->env;
+  reporter_.info_log = options_->info_log.get();
+  SeekToStartSequence(); // Seek till starting sequence
+}
+
+Status TransactionLogIteratorImpl::OpenLogFile(
+    const LogFile* logFile,
+    unique_ptr<SequentialFile>* file) {
+  Env* env = options_->env;
+  if (logFile->Type() == kArchivedLogFile) {
+    std::string fname = ArchivedLogFileName(dir_, logFile->LogNumber());
+    return env->NewSequentialFile(fname, file, soptions_);
+  } else {
+    std::string fname = LogFileName(dir_, logFile->LogNumber());
+    Status status = env->NewSequentialFile(fname, file, soptions_);
+    if (!status.ok()) {
+      //  If cannot open file in DB directory.
+      //  Try the archive dir, as it could have moved in the meanwhile.
+      fname = ArchivedLogFileName(dir_, logFile->LogNumber());
+      status = env->NewSequentialFile(fname, file, soptions_);
+    }
+    return status;
+  }
+}
+
+BatchResult TransactionLogIteratorImpl::GetBatch()  {
+  assert(isValid_);  //  cannot call in a non valid state.
+  BatchResult result;
+  result.sequence = currentBatchSeq_;
+  result.writeBatchPtr = std::move(currentBatch_);
+  return result;
+}
+
+Status TransactionLogIteratorImpl::status() {
+  return currentStatus_;
+}
+
+bool TransactionLogIteratorImpl::Valid() {
+  return started_ && isValid_;
+}
+
+bool TransactionLogIteratorImpl::RestrictedRead(
+    Slice* record,
+    std::string* scratch) {
+  // Don't read if no more complete entries to read from logs
+  if (currentLastSeq_ >= dbimpl_->GetLatestSequenceNumber()) {
+    return false;
+  }
+  return currentLogReader_->ReadRecord(record, scratch);
+}
+
+void TransactionLogIteratorImpl::SeekToStartSequence(
+    uint64_t startFileIndex,
+    bool strict) {
+  std::string scratch;
+  Slice record;
+  started_ = false;
+  isValid_ = false;
+  if (files_->size() <= startFileIndex) {
+    return;
+  }
+  Status s = OpenLogReader(files_->at(startFileIndex).get());
+  if (!s.ok()) {
+    currentStatus_ = s;
+    reporter_.Info(currentStatus_.ToString().c_str());
+    return;
+  }
+  while (RestrictedRead(&record, &scratch)) {
+    if (record.size() < 12) {
+      reporter_.Corruption(
+        record.size(), Status::Corruption("very small log record"));
+      continue;
+    }
+    UpdateCurrentWriteBatch(record);
+    if (currentLastSeq_ >= startingSequenceNumber_) {
+      if (strict && currentBatchSeq_ != startingSequenceNumber_) {
+        currentStatus_ = Status::Corruption("Gap in sequence number. Could not "
+                                            "seek to required sequence number");
+        reporter_.Info(currentStatus_.ToString().c_str());
+        return;
+      } else if (strict) {
+        reporter_.Info("Could seek required sequence number. Iterator will "
+                       "continue.");
+      }
+      isValid_ = true;
+      started_ = true; // set started_ as we could seek till starting sequence
+      return;
+    } else {
+      isValid_ = false;
+    }
+  }
+
+  // Could not find start sequence in first file. Normally this must be the
+  // only file. Otherwise log the error and let the iterator return next entry
+  // If strict is set, we want to seek exactly till the start sequence and it
+  // should have been present in the file we scanned above
+  if (strict) {
+    currentStatus_ = Status::Corruption("Gap in sequence number. Could not "
+                                        "seek to required sequence number");
+    reporter_.Info(currentStatus_.ToString().c_str());
+  } else if (files_->size() != 1) {
+    currentStatus_ = Status::Corruption("Start sequence was not found, "
+                                        "skipping to the next available");
+    reporter_.Info(currentStatus_.ToString().c_str());
+    // Let NextImpl find the next available entry. started_ remains false
+    // because we don't want to check for gaps while moving to start sequence
+    NextImpl(true);
+  }
+}
+
+void TransactionLogIteratorImpl::Next() {
+  return NextImpl(false);
+}
+
+void TransactionLogIteratorImpl::NextImpl(bool internal) {
+  std::string scratch;
+  Slice record;
+  isValid_ = false;
+  if (!internal && !started_) {
+    // Runs every time until we can seek to the start sequence
+    return SeekToStartSequence();
+  }
+  while(true) {
+    assert(currentLogReader_);
+    if (currentLogReader_->IsEOF()) {
+      currentLogReader_->UnmarkEOF();
+    }
+    while (RestrictedRead(&record, &scratch)) {
+      if (record.size() < 12) {
+        reporter_.Corruption(
+          record.size(), Status::Corruption("very small log record"));
+        continue;
+      } else {
+        // started_ should be true if called by application
+        assert(internal || started_);
+        // started_ should be false if called internally
+        assert(!internal || !started_);
+        UpdateCurrentWriteBatch(record);
+        if (internal && !started_) {
+          started_ = true;
+        }
+        return;
+      }
+    }
+
+    // Open the next file
+    if (currentFileIndex_ < files_->size() - 1) {
+      ++currentFileIndex_;
+      Status status =OpenLogReader(files_->at(currentFileIndex_).get());
+      if (!status.ok()) {
+        isValid_ = false;
+        currentStatus_ = status;
+        return;
+      }
+    } else {
+      isValid_ = false;
+      if (currentLastSeq_ == dbimpl_->GetLatestSequenceNumber()) {
+        currentStatus_ = Status::OK();
+      } else {
+        currentStatus_ = Status::Corruption("NO MORE DATA LEFT");
+      }
+      return;
+    }
+  }
+}
+
+bool TransactionLogIteratorImpl::IsBatchExpected(
+    const WriteBatch* batch,
+    const SequenceNumber expectedSeq) {
+  assert(batch);
+  SequenceNumber batchSeq = WriteBatchInternal::Sequence(batch);
+  if (batchSeq != expectedSeq) {
+    char buf[200];
+    snprintf(buf, sizeof(buf),
+             "Discontinuity in log records. Got seq=%lu, Expected seq=%lu, "
+             "Last flushed seq=%lu.Log iterator will reseek the correct "
+             "batch.",
+             (unsigned long)batchSeq,
+             (unsigned long)expectedSeq,
+             (unsigned long)dbimpl_->GetLatestSequenceNumber());
+    reporter_.Info(buf);
+    return false;
+  }
+  return true;
+}
+
+void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) {
+  std::unique_ptr<WriteBatch> batch(new WriteBatch());
+  WriteBatchInternal::SetContents(batch.get(), record);
+
+  SequenceNumber expectedSeq = currentLastSeq_ + 1;
+  // If the iterator has started, then confirm that we get continuous batches
+  if (started_ && !IsBatchExpected(batch.get(), expectedSeq)) {
+    // Seek to the batch having expected sequence number
+    if (expectedSeq < files_->at(currentFileIndex_)->StartSequence()) {
+      // Expected batch must lie in the previous log file
+      // Avoid underflow.
+      if (currentFileIndex_ != 0) {
+        currentFileIndex_--;
+      }
+    }
+    startingSequenceNumber_ = expectedSeq;
+    // currentStatus_ will be set to Ok if reseek succeeds
+    currentStatus_ = Status::NotFound("Gap in sequence numbers");
+    return SeekToStartSequence(currentFileIndex_, true);
+  }
+
+  currentBatchSeq_ = WriteBatchInternal::Sequence(batch.get());
+  currentLastSeq_ = currentBatchSeq_ +
+                    WriteBatchInternal::Count(batch.get()) - 1;
+  // currentBatchSeq_ can only change here
+  assert(currentLastSeq_ <= dbimpl_->GetLatestSequenceNumber());
+
+  currentBatch_ = move(batch);
+  isValid_ = true;
+  currentStatus_ = Status::OK();
+}
+
+Status TransactionLogIteratorImpl::OpenLogReader(const LogFile* logFile) {
+  unique_ptr<SequentialFile> file;
+  Status status = OpenLogFile(logFile, &file);
+  if (!status.ok()) {
+    return status;
+  }
+  assert(file);
+  currentLogReader_.reset(new log::Reader(std::move(file), &reporter_,
+                                          read_options_.verify_checksums_, 0));
+  return Status::OK();
+}
+}  //  namespace rocksdb
+#endif  // ROCKSDB_LITE
--- a/db/transaction_log_impl.h
+++ b/db/transaction_log_impl.h
@@ -0,0 +1,120 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+#pragma once
+#include <vector>
+
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/types.h"
+#include "rocksdb/transaction_log.h"
+#include "db/db_impl.h"
+#include "db/log_reader.h"
+#include "db/filename.h"
+
+namespace rocksdb {
+
+struct LogReporter : public log::Reader::Reporter {
+  Env* env;
+  Logger* info_log;
+  virtual void Corruption(size_t bytes, const Status& s) {
+    Log(info_log, "dropping %zu bytes; %s", bytes, s.ToString().c_str());
+  }
+  virtual void Info(const char* s) {
+    Log(info_log, "%s", s);
+  }
+};
+
+class LogFileImpl : public LogFile {
+ public:
+  LogFileImpl(uint64_t logNum, WalFileType logType, SequenceNumber startSeq,
+              uint64_t sizeBytes) :
+    logNumber_(logNum),
+    type_(logType),
+    startSequence_(startSeq),
+    sizeFileBytes_(sizeBytes) {
+  }
+
+  std::string PathName() const {
+    if (type_ == kArchivedLogFile) {
+      return ArchivedLogFileName("", logNumber_);
+    }
+    return LogFileName("", logNumber_);
+  }
+
+  uint64_t LogNumber() const { return logNumber_; }
+
+  WalFileType Type() const { return type_; }
+
+  SequenceNumber StartSequence() const { return startSequence_; }
+
+  uint64_t SizeFileBytes() const { return sizeFileBytes_; }
+
+  bool operator < (const LogFile& that) const {
+    return LogNumber() < that.LogNumber();
+  }
+
+ private:
+  uint64_t logNumber_;
+  WalFileType type_;
+  SequenceNumber startSequence_;
+  uint64_t sizeFileBytes_;
+
+};
+
+class TransactionLogIteratorImpl : public TransactionLogIterator {
+ public:
+  TransactionLogIteratorImpl(
+      const std::string& dir, const DBOptions* options,
+      const TransactionLogIterator::ReadOptions& read_options,
+      const EnvOptions& soptions, const SequenceNumber seqNum,
+      std::unique_ptr<VectorLogPtr> files, DBImpl const* const dbimpl);
+
+  virtual bool Valid();
+
+  virtual void Next();
+
+  virtual Status status();
+
+  virtual BatchResult GetBatch();
+
+ private:
+  const std::string& dir_;
+  const DBOptions* options_;
+  const TransactionLogIterator::ReadOptions read_options_;
+  const EnvOptions& soptions_;
+  SequenceNumber startingSequenceNumber_;
+  std::unique_ptr<VectorLogPtr> files_;
+  bool started_;
+  bool isValid_;  // not valid when it starts of.
+  Status currentStatus_;
+  size_t currentFileIndex_;
+  std::unique_ptr<WriteBatch> currentBatch_;
+  unique_ptr<log::Reader> currentLogReader_;
+  Status OpenLogFile(const LogFile* logFile, unique_ptr<SequentialFile>* file);
+  LogReporter reporter_;
+  SequenceNumber currentBatchSeq_; // sequence number at start of current batch
+  SequenceNumber currentLastSeq_; // last sequence in the current batch
+  DBImpl const * const dbimpl_; // The db on whose log files this iterates
+
+  // Reads from transaction log only if the writebatch record has been written
+  bool RestrictedRead(Slice* record, std::string* scratch);
+  // Seeks to startingSequenceNumber reading from startFileIndex in files_.
+  // If strict is set,then must get a batch starting with startingSequenceNumber
+  void SeekToStartSequence(uint64_t startFileIndex = 0, bool strict = false);
+  // Implementation of Next. SeekToStartSequence calls it internally with
+  // internal=true to let it find next entry even if it has to jump gaps because
+  // the iterator may start off from the first available entry but promises to
+  // be continuous after that
+  void NextImpl(bool internal = false);
+  // Check if batch is expected, else return false
+  bool IsBatchExpected(const WriteBatch* batch, SequenceNumber expectedSeq);
+  // Update current batch if a continuous batch is found, else return false
+  void UpdateCurrentWriteBatch(const Slice& record);
+  Status OpenLogReader(const LogFile* file);
+};
+}  //  namespace rocksdb
+#endif  // ROCKSDB_LITE
--- a/db/version_edit.cc
+++ b/db/version_edit.cc
@@ -0,0 +1,364 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_edit.h"
+
+#include "db/version_set.h"
+#include "util/coding.h"
+#include "rocksdb/slice.h"
+
+namespace rocksdb {
+
+// Tag numbers for serialized VersionEdit.  These numbers are written to
+// disk and should not be changed.
+enum Tag {
+  kComparator           = 1,
+  kLogNumber            = 2,
+  kNextFileNumber       = 3,
+  kLastSequence         = 4,
+  kCompactPointer       = 5,
+  kDeletedFile          = 6,
+  kNewFile              = 7,
+  // 8 was used for large value refs
+  kPrevLogNumber        = 9,
+
+  // these are new formats divergent from open source leveldb
+  kNewFile2             = 100,  // store smallest & largest seqno
+
+  kColumnFamily         = 200,  // specify column family for version edit
+  kColumnFamilyAdd      = 201,
+  kColumnFamilyDrop     = 202,
+  kMaxColumnFamily      = 203,
+};
+
+void VersionEdit::Clear() {
+  comparator_.clear();
+  max_level_ = 0;
+  log_number_ = 0;
+  prev_log_number_ = 0;
+  last_sequence_ = 0;
+  next_file_number_ = 0;
+  max_column_family_ = 0;
+  has_comparator_ = false;
+  has_log_number_ = false;
+  has_prev_log_number_ = false;
+  has_next_file_number_ = false;
+  has_last_sequence_ = false;
+  has_max_column_family_ = false;
+  deleted_files_.clear();
+  new_files_.clear();
+  column_family_ = 0;
+  is_column_family_add_ = 0;
+  is_column_family_drop_ = 0;
+  column_family_name_.clear();
+}
+
+void VersionEdit::EncodeTo(std::string* dst) const {
+  if (has_comparator_) {
+    PutVarint32(dst, kComparator);
+    PutLengthPrefixedSlice(dst, comparator_);
+  }
+  if (has_log_number_) {
+    PutVarint32(dst, kLogNumber);
+    PutVarint64(dst, log_number_);
+  }
+  if (has_prev_log_number_) {
+    PutVarint32(dst, kPrevLogNumber);
+    PutVarint64(dst, prev_log_number_);
+  }
+  if (has_next_file_number_) {
+    PutVarint32(dst, kNextFileNumber);
+    PutVarint64(dst, next_file_number_);
+  }
+  if (has_last_sequence_) {
+    PutVarint32(dst, kLastSequence);
+    PutVarint64(dst, last_sequence_);
+  }
+  if (has_max_column_family_) {
+    PutVarint32(dst, kMaxColumnFamily);
+    PutVarint32(dst, max_column_family_);
+  }
+
+  for (const auto& deleted : deleted_files_) {
+    PutVarint32(dst, kDeletedFile);
+    PutVarint32(dst, deleted.first /* level */);
+    PutVarint64(dst, deleted.second /* file number */);
+  }
+
+  for (size_t i = 0; i < new_files_.size(); i++) {
+    const FileMetaData& f = new_files_[i].second;
+    PutVarint32(dst, kNewFile2);
+    PutVarint32(dst, new_files_[i].first);  // level
+    PutVarint64(dst, f.number);
+    PutVarint64(dst, f.file_size);
+    PutLengthPrefixedSlice(dst, f.smallest.Encode());
+    PutLengthPrefixedSlice(dst, f.largest.Encode());
+    PutVarint64(dst, f.smallest_seqno);
+    PutVarint64(dst, f.largest_seqno);
+  }
+
+  // 0 is default and does not need to be explicitly written
+  if (column_family_ != 0) {
+    PutVarint32(dst, kColumnFamily);
+    PutVarint32(dst, column_family_);
+  }
+
+  if (is_column_family_add_) {
+    PutVarint32(dst, kColumnFamilyAdd);
+    PutLengthPrefixedSlice(dst, Slice(column_family_name_));
+  }
+
+  if (is_column_family_drop_) {
+    PutVarint32(dst, kColumnFamilyDrop);
+  }
+}
+
+static bool GetInternalKey(Slice* input, InternalKey* dst) {
+  Slice str;
+  if (GetLengthPrefixedSlice(input, &str)) {
+    dst->DecodeFrom(str);
+    return dst->Valid();
+  } else {
+    return false;
+  }
+}
+
+bool VersionEdit::GetLevel(Slice* input, int* level, const char** msg) {
+  uint32_t v;
+  if (GetVarint32(input, &v)) {
+    *level = v;
+    if (max_level_ < *level) {
+      max_level_ = *level;
+    }
+    return true;
+  } else {
+    return false;
+  }
+}
+
+Status VersionEdit::DecodeFrom(const Slice& src) {
+  Clear();
+  Slice input = src;
+  const char* msg = nullptr;
+  uint32_t tag;
+
+  // Temporary storage for parsing
+  int level;
+  uint64_t number;
+  FileMetaData f;
+  Slice str;
+  InternalKey key;
+
+  while (msg == nullptr && GetVarint32(&input, &tag)) {
+    switch (tag) {
+      case kComparator:
+        if (GetLengthPrefixedSlice(&input, &str)) {
+          comparator_ = str.ToString();
+          has_comparator_ = true;
+        } else {
+          msg = "comparator name";
+        }
+        break;
+
+      case kLogNumber:
+        if (GetVarint64(&input, &log_number_)) {
+          has_log_number_ = true;
+        } else {
+          msg = "log number";
+        }
+        break;
+
+      case kPrevLogNumber:
+        if (GetVarint64(&input, &prev_log_number_)) {
+          has_prev_log_number_ = true;
+        } else {
+          msg = "previous log number";
+        }
+        break;
+
+      case kNextFileNumber:
+        if (GetVarint64(&input, &next_file_number_)) {
+          has_next_file_number_ = true;
+        } else {
+          msg = "next file number";
+        }
+        break;
+
+      case kLastSequence:
+        if (GetVarint64(&input, &last_sequence_)) {
+          has_last_sequence_ = true;
+        } else {
+          msg = "last sequence number";
+        }
+        break;
+
+      case kMaxColumnFamily:
+        if (GetVarint32(&input, &max_column_family_)) {
+          has_max_column_family_ = true;
+        } else {
+          msg = "max column family";
+        }
+        break;
+
+      case kCompactPointer:
+        if (GetLevel(&input, &level, &msg) &&
+            GetInternalKey(&input, &key)) {
+          // we don't use compact pointers anymore,
+          // but we should not fail if they are still
+          // in manifest
+        } else {
+          if (!msg) {
+            msg = "compaction pointer";
+          }
+        }
+        break;
+
+      case kDeletedFile:
+        if (GetLevel(&input, &level, &msg) &&
+            GetVarint64(&input, &number)) {
+          deleted_files_.insert(std::make_pair(level, number));
+        } else {
+          if (!msg) {
+            msg = "deleted file";
+          }
+        }
+        break;
+
+      case kNewFile:
+        if (GetLevel(&input, &level, &msg) &&
+            GetVarint64(&input, &f.number) &&
+            GetVarint64(&input, &f.file_size) &&
+            GetInternalKey(&input, &f.smallest) &&
+            GetInternalKey(&input, &f.largest)) {
+          new_files_.push_back(std::make_pair(level, f));
+        } else {
+          if (!msg) {
+            msg = "new-file entry";
+          }
+        }
+        break;
+
+      case kNewFile2:
+        if (GetLevel(&input, &level, &msg) &&
+            GetVarint64(&input, &f.number) &&
+            GetVarint64(&input, &f.file_size) &&
+            GetInternalKey(&input, &f.smallest) &&
+            GetInternalKey(&input, &f.largest) &&
+            GetVarint64(&input, &f.smallest_seqno) &&
+            GetVarint64(&input, &f.largest_seqno) ) {
+          new_files_.push_back(std::make_pair(level, f));
+        } else {
+          if (!msg) {
+            msg = "new-file2 entry";
+          }
+        }
+        break;
+
+      case kColumnFamily:
+        if (!GetVarint32(&input, &column_family_)) {
+          if (!msg) {
+            msg = "set column family id";
+          }
+        }
+        break;
+
+      case kColumnFamilyAdd:
+        if (GetLengthPrefixedSlice(&input, &str)) {
+          is_column_family_add_ = true;
+          column_family_name_ = str.ToString();
+        } else {
+          if (!msg) {
+            msg = "column family add";
+          }
+        }
+        break;
+
+      case kColumnFamilyDrop:
+        is_column_family_drop_ = true;
+        break;
+
+      default:
+        msg = "unknown tag";
+        break;
+    }
+  }
+
+  if (msg == nullptr && !input.empty()) {
+    msg = "invalid tag";
+  }
+
+  Status result;
+  if (msg != nullptr) {
+    result = Status::Corruption("VersionEdit", msg);
+  }
+  return result;
+}
+
+std::string VersionEdit::DebugString(bool hex_key) const {
+  std::string r;
+  r.append("VersionEdit {");
+  if (has_comparator_) {
+    r.append("\n  Comparator: ");
+    r.append(comparator_);
+  }
+  if (has_log_number_) {
+    r.append("\n  LogNumber: ");
+    AppendNumberTo(&r, log_number_);
+  }
+  if (has_prev_log_number_) {
+    r.append("\n  PrevLogNumber: ");
+    AppendNumberTo(&r, prev_log_number_);
+  }
+  if (has_next_file_number_) {
+    r.append("\n  NextFile: ");
+    AppendNumberTo(&r, next_file_number_);
+  }
+  if (has_last_sequence_) {
+    r.append("\n  LastSeq: ");
+    AppendNumberTo(&r, last_sequence_);
+  }
+  for (DeletedFileSet::const_iterator iter = deleted_files_.begin();
+       iter != deleted_files_.end();
+       ++iter) {
+    r.append("\n  DeleteFile: ");
+    AppendNumberTo(&r, iter->first);
+    r.append(" ");
+    AppendNumberTo(&r, iter->second);
+  }
+  for (size_t i = 0; i < new_files_.size(); i++) {
+    const FileMetaData& f = new_files_[i].second;
+    r.append("\n  AddFile: ");
+    AppendNumberTo(&r, new_files_[i].first);
+    r.append(" ");
+    AppendNumberTo(&r, f.number);
+    r.append(" ");
+    AppendNumberTo(&r, f.file_size);
+    r.append(" ");
+    r.append(f.smallest.DebugString(hex_key));
+    r.append(" .. ");
+    r.append(f.largest.DebugString(hex_key));
+  }
+  r.append("\n  ColumnFamily: ");
+  AppendNumberTo(&r, column_family_);
+  if (is_column_family_add_) {
+    r.append("\n  ColumnFamilyAdd: ");
+    r.append(column_family_name_);
+  }
+  if (is_column_family_drop_) {
+    r.append("\n  ColumnFamilyDrop");
+  }
+  if (has_max_column_family_) {
+    r.append("\n  MaxColumnFamily: ");
+    AppendNumberTo(&r, max_column_family_);
+  }
+  r.append("\n}\n");
+  return r;
+}
+
+}  // namespace rocksdb
--- a/db/version_edit.h
+++ b/db/version_edit.h
@@ -0,0 +1,176 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <set>
+#include <utility>
+#include <vector>
+#include <string>
+#include "rocksdb/cache.h"
+#include "db/dbformat.h"
+
+namespace rocksdb {
+
+class VersionSet;
+
+struct FileMetaData {
+  int refs;
+  int allowed_seeks;          // Seeks allowed until compaction
+  uint64_t number;
+  uint64_t file_size;         // File size in bytes
+  InternalKey smallest;       // Smallest internal key served by table
+  InternalKey largest;        // Largest internal key served by table
+  bool being_compacted;       // Is this file undergoing compaction?
+  SequenceNumber smallest_seqno;// The smallest seqno in this file
+  SequenceNumber largest_seqno; // The largest seqno in this file
+
+  // Needs to be disposed when refs becomes 0.
+  Cache::Handle* table_reader_handle;
+  // Table reader in table_reader_handle
+  TableReader* table_reader;
+
+  FileMetaData(uint64_t number, uint64_t file_size)
+      : refs(0),
+        allowed_seeks(1 << 30),
+        number(number),
+        file_size(file_size),
+        being_compacted(false),
+        table_reader_handle(nullptr),
+        table_reader(nullptr) {}
+  FileMetaData() : FileMetaData(0, 0) {}
+};
+
+class VersionEdit {
+ public:
+  VersionEdit() { Clear(); }
+  ~VersionEdit() { }
+
+  void Clear();
+
+  void SetComparatorName(const Slice& name) {
+    has_comparator_ = true;
+    comparator_ = name.ToString();
+  }
+  void SetLogNumber(uint64_t num) {
+    has_log_number_ = true;
+    log_number_ = num;
+  }
+  void SetPrevLogNumber(uint64_t num) {
+    has_prev_log_number_ = true;
+    prev_log_number_ = num;
+  }
+  void SetNextFile(uint64_t num) {
+    has_next_file_number_ = true;
+    next_file_number_ = num;
+  }
+  void SetLastSequence(SequenceNumber seq) {
+    has_last_sequence_ = true;
+    last_sequence_ = seq;
+  }
+  void SetMaxColumnFamily(uint32_t max_column_family) {
+    has_max_column_family_ = true;
+    max_column_family_ = max_column_family;
+  }
+
+  // Add the specified file at the specified number.
+  // REQUIRES: This version has not been saved (see VersionSet::SaveTo)
+  // REQUIRES: "smallest" and "largest" are smallest and largest keys in file
+  void AddFile(int level, uint64_t file,
+               uint64_t file_size,
+               const InternalKey& smallest,
+               const InternalKey& largest,
+               const SequenceNumber& smallest_seqno,
+               const SequenceNumber& largest_seqno) {
+    assert(smallest_seqno <= largest_seqno);
+    FileMetaData f;
+    f.number = file;
+    f.file_size = file_size;
+    f.smallest = smallest;
+    f.largest = largest;
+    f.smallest_seqno = smallest_seqno;
+    f.largest_seqno = largest_seqno;
+    new_files_.push_back(std::make_pair(level, f));
+  }
+
+  // Delete the specified "file" from the specified "level".
+  void DeleteFile(int level, uint64_t file) {
+    deleted_files_.insert({level, file});
+  }
+
+  // Number of edits
+  int NumEntries() {
+    return new_files_.size() + deleted_files_.size();
+  }
+
+  bool IsColumnFamilyManipulation() {
+    return is_column_family_add_ || is_column_family_drop_;
+  }
+
+  void SetColumnFamily(uint32_t column_family_id) {
+    column_family_ = column_family_id;
+  }
+
+  // set column family ID by calling SetColumnFamily()
+  void AddColumnFamily(const std::string& name) {
+    assert(!is_column_family_drop_);
+    assert(!is_column_family_add_);
+    assert(NumEntries() == 0);
+    is_column_family_add_ = true;
+    column_family_name_ = name;
+  }
+
+  // set column family ID by calling SetColumnFamily()
+  void DropColumnFamily() {
+    assert(!is_column_family_drop_);
+    assert(!is_column_family_add_);
+    assert(NumEntries() == 0);
+    is_column_family_drop_ = true;
+  }
+
+  void EncodeTo(std::string* dst) const;
+  Status DecodeFrom(const Slice& src);
+
+  std::string DebugString(bool hex_key = false) const;
+
+ private:
+  friend class VersionSet;
+
+  typedef std::set< std::pair<int, uint64_t>> DeletedFileSet;
+
+  bool GetLevel(Slice* input, int* level, const char** msg);
+
+  int max_level_;
+  std::string comparator_;
+  uint64_t log_number_;
+  uint64_t prev_log_number_;
+  uint64_t next_file_number_;
+  uint32_t max_column_family_;
+  SequenceNumber last_sequence_;
+  bool has_comparator_;
+  bool has_log_number_;
+  bool has_prev_log_number_;
+  bool has_next_file_number_;
+  bool has_last_sequence_;
+  bool has_max_column_family_;
+
+  DeletedFileSet deleted_files_;
+  std::vector<std::pair<int, FileMetaData>> new_files_;
+
+  // Each version edit record should have column_family_id set
+  // If it's not set, it is default (0)
+  uint32_t column_family_;
+  // a version edit can be either column_family add or
+  // column_family drop. If it's column family add,
+  // it also includes column family name.
+  bool is_column_family_drop_;
+  bool is_column_family_add_;
+  std::string column_family_name_;
+};
+
+}  // namespace rocksdb
--- a/db/version_edit_test.cc
+++ b/db/version_edit_test.cc
@@ -0,0 +1,65 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_edit.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+static void TestEncodeDecode(const VersionEdit& edit) {
+  std::string encoded, encoded2;
+  edit.EncodeTo(&encoded);
+  VersionEdit parsed;
+  Status s = parsed.DecodeFrom(encoded);
+  ASSERT_TRUE(s.ok()) << s.ToString();
+  parsed.EncodeTo(&encoded2);
+  ASSERT_EQ(encoded, encoded2);
+}
+
+class VersionEditTest { };
+
+TEST(VersionEditTest, EncodeDecode) {
+  static const uint64_t kBig = 1ull << 50;
+
+  VersionEdit edit;
+  for (int i = 0; i < 4; i++) {
+    TestEncodeDecode(edit);
+    edit.AddFile(3, kBig + 300 + i, kBig + 400 + i,
+                 InternalKey("foo", kBig + 500 + i, kTypeValue),
+                 InternalKey("zoo", kBig + 600 + i, kTypeDeletion),
+                 kBig + 500 + i,
+                 kBig + 600 + i);
+    edit.DeleteFile(4, kBig + 700 + i);
+  }
+
+  edit.SetComparatorName("foo");
+  edit.SetLogNumber(kBig + 100);
+  edit.SetNextFile(kBig + 200);
+  edit.SetLastSequence(kBig + 1000);
+  TestEncodeDecode(edit);
+}
+
+TEST(VersionEditTest, ColumnFamilyTest) {
+  VersionEdit edit;
+  edit.SetColumnFamily(2);
+  edit.AddColumnFamily("column_family");
+  edit.SetMaxColumnFamily(5);
+  TestEncodeDecode(edit);
+
+  edit.Clear();
+  edit.SetColumnFamily(3);
+  edit.DropColumnFamily();
+  TestEncodeDecode(edit);
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
--- a/db/version_set.cc
+++ b/db/version_set.cc
--- a/db/version_set.h
+++ b/db/version_set.h
@@ -0,0 +1,499 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// The representation of a DBImpl consists of a set of Versions.  The
+// newest version is called "current".  Older versions may be kept
+// around to provide a consistent view to live iterators.
+//
+// Each Version keeps track of a set of Table files per level.  The
+// entire set of versions is maintained in a VersionSet.
+//
+// Version,VersionSet are thread-compatible, but require external
+// synchronization on all accesses.
+
+#pragma once
+#include <map>
+#include <memory>
+#include <set>
+#include <vector>
+#include <deque>
+#include <atomic>
+#include <limits>
+#include "db/dbformat.h"
+#include "db/version_edit.h"
+#include "port/port.h"
+#include "db/table_cache.h"
+#include "db/compaction.h"
+#include "db/compaction_picker.h"
+#include "db/column_family.h"
+#include "db/log_reader.h"
+#include "db/file_indexer.h"
+
+namespace rocksdb {
+
+namespace log { class Writer; }
+
+class Compaction;
+class CompactionPicker;
+class Iterator;
+class LogBuffer;
+class LookupKey;
+class MemTable;
+class Version;
+class VersionSet;
+class MergeContext;
+class ColumnFamilyData;
+class ColumnFamilySet;
+class TableCache;
+class MergeIteratorBuilder;
+
+// Return the smallest index i such that files[i]->largest >= key.
+// Return files.size() if there is no such file.
+// REQUIRES: "files" contains a sorted list of non-overlapping files.
+extern int FindFile(const InternalKeyComparator& icmp,
+                    const std::vector<FileMetaData*>& files,
+                    const Slice& key);
+
+// Returns true iff some file in "files" overlaps the user key range
+// [*smallest,*largest].
+// smallest==nullptr represents a key smaller than all keys in the DB.
+// largest==nullptr represents a key largest than all keys in the DB.
+// REQUIRES: If disjoint_sorted_files, files[] contains disjoint ranges
+//           in sorted order.
+extern bool SomeFileOverlapsRange(
+    const InternalKeyComparator& icmp,
+    bool disjoint_sorted_files,
+    const std::vector<FileMetaData*>& files,
+    const Slice* smallest_user_key,
+    const Slice* largest_user_key);
+
+class Version {
+ public:
+  // Append to *iters a sequence of iterators that will
+  // yield the contents of this Version when merged together.
+  // REQUIRES: This version has been saved (see VersionSet::SaveTo)
+  void AddIterators(const ReadOptions&, const EnvOptions& soptions,
+                    std::vector<Iterator*>* iters);
+
+  void AddIterators(const ReadOptions&, const EnvOptions& soptions,
+                    MergeIteratorBuilder* merger_iter_builder);
+
+  // Lookup the value for key.  If found, store it in *val and
+  // return OK.  Else return a non-OK status.  Fills *stats.
+  // Uses *operands to store merge_operator operations to apply later
+  // REQUIRES: lock is not held
+  struct GetStats {
+    FileMetaData* seek_file;
+    int seek_file_level;
+  };
+  void Get(const ReadOptions&, const LookupKey& key, std::string* val,
+           Status* status, MergeContext* merge_context, GetStats* stats,
+           bool* value_found = nullptr);
+
+  // Adds "stats" into the current state.  Returns true if a new
+  // compaction may need to be triggered, false otherwise.
+  // REQUIRES: lock is held
+  bool UpdateStats(const GetStats& stats);
+
+  // Updates internal structures that keep track of compaction scores
+  // We use compaction scores to figure out which compaction to do next
+  // REQUIRES: If Version is not yet saved to current_, it can be called without
+  // a lock. Once a version is saved to current_, call only with mutex held
+  void ComputeCompactionScore(std::vector<uint64_t>& size_being_compacted);
+
+  // Reference count management (so Versions do not disappear out from
+  // under live iterators)
+  void Ref();
+  // Decrease reference count. Delete the object if no reference left
+  // and return true. Otherwise, return false.
+  bool Unref();
+
+  // Returns true iff some level needs a compaction.
+  bool NeedsCompaction() const;
+
+  // Returns the maxmimum compaction score for levels 1 to max
+  double MaxCompactionScore() const { return max_compaction_score_; }
+
+  // See field declaration
+  int MaxCompactionScoreLevel() const { return max_compaction_score_level_; }
+
+  void GetOverlappingInputs(
+      int level,
+      const InternalKey* begin,         // nullptr means before all keys
+      const InternalKey* end,           // nullptr means after all keys
+      std::vector<FileMetaData*>* inputs,
+      int hint_index = -1,              // index of overlap file
+      int* file_index = nullptr);          // return index of overlap file
+
+  void GetOverlappingInputsBinarySearch(
+      int level,
+      const Slice& begin,         // nullptr means before all keys
+      const Slice& end,           // nullptr means after all keys
+      std::vector<FileMetaData*>* inputs,
+      int hint_index,             // index of overlap file
+      int* file_index);           // return index of overlap file
+
+  void ExtendOverlappingInputs(
+      int level,
+      const Slice& begin,         // nullptr means before all keys
+      const Slice& end,           // nullptr means after all keys
+      std::vector<FileMetaData*>* inputs,
+      unsigned int index);                 // start extending from this index
+
+  // Returns true iff some file in the specified level overlaps
+  // some part of [*smallest_user_key,*largest_user_key].
+  // smallest_user_key==NULL represents a key smaller than all keys in the DB.
+  // largest_user_key==NULL represents a key largest than all keys in the DB.
+  bool OverlapInLevel(int level,
+                      const Slice* smallest_user_key,
+                      const Slice* largest_user_key);
+
+  // Returns true iff the first or last file in inputs contains
+  // an overlapping user key to the file "just outside" of it (i.e.
+  // just after the last file, or just before the first file)
+  // REQUIRES: "*inputs" is a sorted list of non-overlapping files
+  bool HasOverlappingUserKey(const std::vector<FileMetaData*>* inputs,
+                             int level);
+
+
+  // Return the level at which we should place a new memtable compaction
+  // result that covers the range [smallest_user_key,largest_user_key].
+  int PickLevelForMemTableOutput(const Slice& smallest_user_key,
+                                 const Slice& largest_user_key);
+
+  int NumberLevels() const { return num_levels_; }
+
+  // REQUIRES: lock is held
+  int NumLevelFiles(int level) const { return files_[level].size(); }
+
+  // Return the combined file size of all files at the specified level.
+  int64_t NumLevelBytes(int level) const;
+
+  // Return a human-readable short (single-line) summary of the number
+  // of files per level.  Uses *scratch as backing store.
+  struct LevelSummaryStorage {
+    char buffer[100];
+  };
+  struct FileSummaryStorage {
+    char buffer[1000];
+  };
+  const char* LevelSummary(LevelSummaryStorage* scratch) const;
+  // Return a human-readable short (single-line) summary of files
+  // in a specified level.  Uses *scratch as backing store.
+  const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const;
+
+  // Return the maximum overlapping data (in bytes) at next level for any
+  // file at a level >= 1.
+  int64_t MaxNextLevelOverlappingBytes();
+
+  // Add all files listed in the current version to *live.
+  void AddLiveFiles(std::set<uint64_t>* live);
+
+  // Return a human readable string that describes this version's contents.
+  std::string DebugString(bool hex = false) const;
+
+  // Returns the version nuber of this version
+  uint64_t GetVersionNumber() const { return version_number_; }
+
+  // REQUIRES: lock is held
+  // On success, *props will be populated with all SSTables' table properties.
+  // The keys of `props` are the sst file name, the values of `props` are the
+  // tables' propertis, represented as shared_ptr.
+  Status GetPropertiesOfAllTables(TablePropertiesCollection* props);
+
+  // used to sort files by size
+  struct Fsize {
+    int index;
+    FileMetaData* file;
+  };
+
+ private:
+  friend class Compaction;
+  friend class VersionSet;
+  friend class DBImpl;
+  friend class ColumnFamilyData;
+  friend class CompactionPicker;
+  friend class LevelCompactionPicker;
+  friend class UniversalCompactionPicker;
+  friend class FIFOCompactionPicker;
+  friend class ForwardIterator;
+
+  class LevelFileNumIterator;
+  class LevelFileIteratorState;
+
+  bool PrefixMayMatch(const ReadOptions& options, Iterator* level_iter,
+                      const Slice& internal_prefix) const;
+
+  // Sort all files for this version based on their file size and
+  // record results in files_by_size_. The largest files are listed first.
+  void UpdateFilesBySize();
+
+  ColumnFamilyData* cfd_;  // ColumnFamilyData to which this Version belongs
+  const InternalKeyComparator* internal_comparator_;
+  const Comparator* user_comparator_;
+  TableCache* table_cache_;
+  const MergeOperator* merge_operator_;
+  Logger* info_log_;
+  Statistics* db_statistics_;
+  VersionSet* vset_;            // VersionSet to which this Version belongs
+  Version* next_;               // Next version in linked list
+  Version* prev_;               // Previous version in linked list
+  int refs_;                    // Number of live refs to this version
+  int num_levels_;              // Number of levels
+
+  // List of files per level, files in each level are arranged
+  // in increasing order of keys
+  std::vector<FileMetaData*>* files_;
+
+  // A list for the same set of files that are stored in files_,
+  // but files in each level are now sorted based on file
+  // size. The file with the largest size is at the front.
+  // This vector stores the index of the file from files_.
+  std::vector<std::vector<int>> files_by_size_;
+
+  // An index into files_by_size_ that specifies the first
+  // file that is not yet compacted
+  std::vector<int> next_file_to_compact_by_size_;
+
+  // Only the first few entries of files_by_size_ are sorted.
+  // There is no need to sort all the files because it is likely
+  // that on a running system, we need to look at only the first
+  // few largest files because a new version is created every few
+  // seconds/minutes (because of concurrent compactions).
+  static const int number_of_files_to_sort_ = 50;
+
+  // Next file to compact based on seek stats.
+  FileMetaData* file_to_compact_;
+  int file_to_compact_level_;
+
+  // Level that should be compacted next and its compaction score.
+  // Score < 1 means compaction is not strictly needed.  These fields
+  // are initialized by Finalize().
+  // The most critical level to be compacted is listed first
+  // These are used to pick the best compaction level
+  std::vector<double> compaction_score_;
+  std::vector<int> compaction_level_;
+  double max_compaction_score_; // max score in l1 to ln-1
+  int max_compaction_score_level_; // level on which max score occurs
+
+  // A version number that uniquely represents this version. This is
+  // used for debugging and logging purposes only.
+  uint64_t version_number_;
+
+  Version(ColumnFamilyData* cfd, VersionSet* vset, uint64_t version_number = 0);
+  FileIndexer file_indexer_;
+
+  ~Version();
+
+  // re-initializes the index that is used to offset into files_by_size_
+  // to find the next compaction candidate file.
+  void ResetNextCompactionIndex(int level) {
+    next_file_to_compact_by_size_[level] = 0;
+  }
+
+  // No copying allowed
+  Version(const Version&);
+  void operator=(const Version&);
+};
+
+class VersionSet {
+ public:
+  VersionSet(const std::string& dbname, const DBOptions* options,
+             const EnvOptions& storage_options, Cache* table_cache);
+  ~VersionSet();
+
+  // Apply *edit to the current version to form a new descriptor that
+  // is both saved to persistent state and installed as the new
+  // current version.  Will release *mu while actually writing to the file.
+  // column_family_options has to be set if edit is column family add
+  // REQUIRES: *mu is held on entry.
+  // REQUIRES: no other thread concurrently calls LogAndApply()
+  Status LogAndApply(ColumnFamilyData* column_family_data, VersionEdit* edit,
+                     port::Mutex* mu, Directory* db_directory = nullptr,
+                     bool new_descriptor_log = false,
+                     const ColumnFamilyOptions* column_family_options =
+                         nullptr);
+
+  // Recover the last saved descriptor from persistent storage.
+  // If read_only == true, Recover() will not complain if some column families
+  // are not opened
+  Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
+                 bool read_only = false);
+
+  // Reads a manifest file and returns a list of column families in
+  // column_families.
+  static Status ListColumnFamilies(std::vector<std::string>* column_families,
+                                   const std::string& dbname, Env* env);
+
+#ifndef ROCKSDB_LITE
+  // Try to reduce the number of levels. This call is valid when
+  // only one level from the new max level to the old
+  // max level containing files.
+  // The call is static, since number of levels is immutable during
+  // the lifetime of a RocksDB instance. It reduces number of levels
+  // in a DB by applying changes to manifest.
+  // For example, a db currently has 7 levels [0-6], and a call to
+  // to reduce to 5 [0-4] can only be executed when only one level
+  // among [4-6] contains files.
+  static Status ReduceNumberOfLevels(const std::string& dbname,
+                                     const Options* options,
+                                     const EnvOptions& storage_options,
+                                     int new_levels);
+
+  // printf contents (for debugging)
+  Status DumpManifest(Options& options, std::string& manifestFileName,
+                      bool verbose, bool hex = false);
+
+#endif  // ROCKSDB_LITE
+
+  // Return the current manifest file number
+  uint64_t ManifestFileNumber() const { return manifest_file_number_; }
+
+  uint64_t PendingManifestFileNumber() const {
+    return pending_manifest_file_number_;
+  }
+
+  // Allocate and return a new file number
+  uint64_t NewFileNumber() { return next_file_number_++; }
+
+  // Arrange to reuse "file_number" unless a newer file number has
+  // already been allocated.
+  // REQUIRES: "file_number" was returned by a call to NewFileNumber().
+  void ReuseFileNumber(uint64_t file_number) {
+    if (next_file_number_ == file_number + 1) {
+      next_file_number_ = file_number;
+    }
+  }
+
+  // Return the last sequence number.
+  uint64_t LastSequence() const {
+    return last_sequence_.load(std::memory_order_acquire);
+  }
+
+  // Set the last sequence number to s.
+  void SetLastSequence(uint64_t s) {
+    assert(s >= last_sequence_);
+    last_sequence_.store(s, std::memory_order_release);
+  }
+
+  // Mark the specified file number as used.
+  void MarkFileNumberUsed(uint64_t number);
+
+  // Return the log file number for the log file that is currently
+  // being compacted, or zero if there is no such log file.
+  uint64_t PrevLogNumber() const { return prev_log_number_; }
+
+  // Returns the minimum log number such that all
+  // log numbers less than or equal to it can be deleted
+  uint64_t MinLogNumber() const {
+    uint64_t min_log_num = std::numeric_limits<uint64_t>::max();
+    for (auto cfd : *column_family_set_) {
+      if (min_log_num > cfd->GetLogNumber()) {
+        min_log_num = cfd->GetLogNumber();
+      }
+    }
+    return min_log_num;
+  }
+
+  // Create an iterator that reads over the compaction inputs for "*c".
+  // The caller should delete the iterator when no longer needed.
+  Iterator* MakeInputIterator(Compaction* c);
+
+  // Add all files listed in any live version to *live.
+  void AddLiveFiles(std::vector<uint64_t>* live_list);
+
+  // Return the approximate offset in the database of the data for
+  // "key" as of version "v".
+  uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key);
+
+  // Return the size of the current manifest file
+  uint64_t ManifestFileSize() const { return manifest_file_size_; }
+
+  // verify that the files that we started with for a compaction
+  // still exist in the current version and in the same original level.
+  // This ensures that a concurrent compaction did not erroneously
+  // pick the same files to compact.
+  bool VerifyCompactionFileConsistency(Compaction* c);
+
+  Status GetMetadataForFile(uint64_t number, int* filelevel,
+                            FileMetaData** metadata, ColumnFamilyData** cfd);
+
+  void GetLiveFilesMetaData(
+    std::vector<LiveFileMetaData> *metadata);
+
+  void GetObsoleteFiles(std::vector<FileMetaData*>* files);
+
+  ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); }
+
+ private:
+  class Builder;
+  struct ManifestWriter;
+
+  friend class Version;
+
+  struct LogReporter : public log::Reader::Reporter {
+    Status* status;
+    virtual void Corruption(size_t bytes, const Status& s) {
+      if (this->status->ok()) *this->status = s;
+    }
+  };
+
+  // Save current contents to *log
+  Status WriteSnapshot(log::Writer* log);
+
+  void AppendVersion(ColumnFamilyData* column_family_data, Version* v);
+
+  bool ManifestContains(uint64_t manifest_file_number,
+                        const std::string& record) const;
+
+  ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& options,
+                                       VersionEdit* edit);
+
+  std::unique_ptr<ColumnFamilySet> column_family_set_;
+
+  Env* const env_;
+  const std::string dbname_;
+  const DBOptions* const options_;
+  uint64_t next_file_number_;
+  uint64_t manifest_file_number_;
+  uint64_t pending_manifest_file_number_;
+  std::atomic<uint64_t> last_sequence_;
+  uint64_t prev_log_number_;  // 0 or backing store for memtable being compacted
+
+  // Opened lazily
+  unique_ptr<log::Writer> descriptor_log_;
+
+  // generates a increasing version number for every new version
+  uint64_t current_version_number_;
+
+  // Queue of writers to the manifest file
+  std::deque<ManifestWriter*> manifest_writers_;
+
+  // Current size of manifest file
+  uint64_t manifest_file_size_;
+
+  std::vector<FileMetaData*> obsolete_files_;
+
+  // storage options for all reads and writes except compactions
+  const EnvOptions& storage_options_;
+
+  // storage options used for compactions. This is a copy of
+  // storage_options_ but with readaheads set to readahead_compactions_.
+  const EnvOptions storage_options_compactions_;
+
+  // No copying allowed
+  VersionSet(const VersionSet&);
+  void operator=(const VersionSet&);
+
+  void LogAndApplyCFHelper(VersionEdit* edit);
+  void LogAndApplyHelper(ColumnFamilyData* cfd, Builder* b, Version* v,
+                         VersionEdit* edit, port::Mutex* mu);
+};
+
+}  // namespace rocksdb
--- a/db/version_set_test.cc
+++ b/db/version_set_test.cc
@@ -0,0 +1,184 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_set.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+class FindFileTest {
+ public:
+  std::vector<FileMetaData*> files_;
+  bool disjoint_sorted_files_;
+
+  FindFileTest() : disjoint_sorted_files_(true) { }
+
+  ~FindFileTest() {
+    for (unsigned int i = 0; i < files_.size(); i++) {
+      delete files_[i];
+    }
+  }
+
+  void Add(const char* smallest, const char* largest,
+           SequenceNumber smallest_seq = 100,
+           SequenceNumber largest_seq = 100) {
+    FileMetaData* f = new FileMetaData;
+    f->number = files_.size() + 1;
+    f->smallest = InternalKey(smallest, smallest_seq, kTypeValue);
+    f->largest = InternalKey(largest, largest_seq, kTypeValue);
+    files_.push_back(f);
+  }
+
+  int Find(const char* key) {
+    InternalKey target(key, 100, kTypeValue);
+    InternalKeyComparator cmp(BytewiseComparator());
+    return FindFile(cmp, files_, target.Encode());
+  }
+
+  bool Overlaps(const char* smallest, const char* largest) {
+    InternalKeyComparator cmp(BytewiseComparator());
+    Slice s(smallest != nullptr ? smallest : "");
+    Slice l(largest != nullptr ? largest : "");
+    return SomeFileOverlapsRange(cmp, disjoint_sorted_files_, files_,
+                                 (smallest != nullptr ? &s : nullptr),
+                                 (largest != nullptr ? &l : nullptr));
+  }
+};
+
+TEST(FindFileTest, Empty) {
+  ASSERT_EQ(0, Find("foo"));
+  ASSERT_TRUE(! Overlaps("a", "z"));
+  ASSERT_TRUE(! Overlaps(nullptr, "z"));
+  ASSERT_TRUE(! Overlaps("a", nullptr));
+  ASSERT_TRUE(! Overlaps(nullptr, nullptr));
+}
+
+TEST(FindFileTest, Single) {
+  Add("p", "q");
+  ASSERT_EQ(0, Find("a"));
+  ASSERT_EQ(0, Find("p"));
+  ASSERT_EQ(0, Find("p1"));
+  ASSERT_EQ(0, Find("q"));
+  ASSERT_EQ(1, Find("q1"));
+  ASSERT_EQ(1, Find("z"));
+
+  ASSERT_TRUE(! Overlaps("a", "b"));
+  ASSERT_TRUE(! Overlaps("z1", "z2"));
+  ASSERT_TRUE(Overlaps("a", "p"));
+  ASSERT_TRUE(Overlaps("a", "q"));
+  ASSERT_TRUE(Overlaps("a", "z"));
+  ASSERT_TRUE(Overlaps("p", "p1"));
+  ASSERT_TRUE(Overlaps("p", "q"));
+  ASSERT_TRUE(Overlaps("p", "z"));
+  ASSERT_TRUE(Overlaps("p1", "p2"));
+  ASSERT_TRUE(Overlaps("p1", "z"));
+  ASSERT_TRUE(Overlaps("q", "q"));
+  ASSERT_TRUE(Overlaps("q", "q1"));
+
+  ASSERT_TRUE(! Overlaps(nullptr, "j"));
+  ASSERT_TRUE(! Overlaps("r", nullptr));
+  ASSERT_TRUE(Overlaps(nullptr, "p"));
+  ASSERT_TRUE(Overlaps(nullptr, "p1"));
+  ASSERT_TRUE(Overlaps("q", nullptr));
+  ASSERT_TRUE(Overlaps(nullptr, nullptr));
+}
+
+
+TEST(FindFileTest, Multiple) {
+  Add("150", "200");
+  Add("200", "250");
+  Add("300", "350");
+  Add("400", "450");
+  ASSERT_EQ(0, Find("100"));
+  ASSERT_EQ(0, Find("150"));
+  ASSERT_EQ(0, Find("151"));
+  ASSERT_EQ(0, Find("199"));
+  ASSERT_EQ(0, Find("200"));
+  ASSERT_EQ(1, Find("201"));
+  ASSERT_EQ(1, Find("249"));
+  ASSERT_EQ(1, Find("250"));
+  ASSERT_EQ(2, Find("251"));
+  ASSERT_EQ(2, Find("299"));
+  ASSERT_EQ(2, Find("300"));
+  ASSERT_EQ(2, Find("349"));
+  ASSERT_EQ(2, Find("350"));
+  ASSERT_EQ(3, Find("351"));
+  ASSERT_EQ(3, Find("400"));
+  ASSERT_EQ(3, Find("450"));
+  ASSERT_EQ(4, Find("451"));
+
+  ASSERT_TRUE(! Overlaps("100", "149"));
+  ASSERT_TRUE(! Overlaps("251", "299"));
+  ASSERT_TRUE(! Overlaps("451", "500"));
+  ASSERT_TRUE(! Overlaps("351", "399"));
+
+  ASSERT_TRUE(Overlaps("100", "150"));
+  ASSERT_TRUE(Overlaps("100", "200"));
+  ASSERT_TRUE(Overlaps("100", "300"));
+  ASSERT_TRUE(Overlaps("100", "400"));
+  ASSERT_TRUE(Overlaps("100", "500"));
+  ASSERT_TRUE(Overlaps("375", "400"));
+  ASSERT_TRUE(Overlaps("450", "450"));
+  ASSERT_TRUE(Overlaps("450", "500"));
+}
+
+TEST(FindFileTest, MultipleNullBoundaries) {
+  Add("150", "200");
+  Add("200", "250");
+  Add("300", "350");
+  Add("400", "450");
+  ASSERT_TRUE(! Overlaps(nullptr, "149"));
+  ASSERT_TRUE(! Overlaps("451", nullptr));
+  ASSERT_TRUE(Overlaps(nullptr, nullptr));
+  ASSERT_TRUE(Overlaps(nullptr, "150"));
+  ASSERT_TRUE(Overlaps(nullptr, "199"));
+  ASSERT_TRUE(Overlaps(nullptr, "200"));
+  ASSERT_TRUE(Overlaps(nullptr, "201"));
+  ASSERT_TRUE(Overlaps(nullptr, "400"));
+  ASSERT_TRUE(Overlaps(nullptr, "800"));
+  ASSERT_TRUE(Overlaps("100", nullptr));
+  ASSERT_TRUE(Overlaps("200", nullptr));
+  ASSERT_TRUE(Overlaps("449", nullptr));
+  ASSERT_TRUE(Overlaps("450", nullptr));
+}
+
+TEST(FindFileTest, OverlapSequenceChecks) {
+  Add("200", "200", 5000, 3000);
+  ASSERT_TRUE(! Overlaps("199", "199"));
+  ASSERT_TRUE(! Overlaps("201", "300"));
+  ASSERT_TRUE(Overlaps("200", "200"));
+  ASSERT_TRUE(Overlaps("190", "200"));
+  ASSERT_TRUE(Overlaps("200", "210"));
+}
+
+TEST(FindFileTest, OverlappingFiles) {
+  Add("150", "600");
+  Add("400", "500");
+  disjoint_sorted_files_ = false;
+  ASSERT_TRUE(! Overlaps("100", "149"));
+  ASSERT_TRUE(! Overlaps("601", "700"));
+  ASSERT_TRUE(Overlaps("100", "150"));
+  ASSERT_TRUE(Overlaps("100", "200"));
+  ASSERT_TRUE(Overlaps("100", "300"));
+  ASSERT_TRUE(Overlaps("100", "400"));
+  ASSERT_TRUE(Overlaps("100", "500"));
+  ASSERT_TRUE(Overlaps("375", "400"));
+  ASSERT_TRUE(Overlaps("450", "450"));
+  ASSERT_TRUE(Overlaps("450", "500"));
+  ASSERT_TRUE(Overlaps("450", "700"));
+  ASSERT_TRUE(Overlaps("600", "700"));
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
--- a/Show More
+++ b/Show More