Squashed 'src/ripple/rocksdb/rocksdb/' content from commit 56589ab

git-subtree-dir: src/ripple/rocksdb/rocksdb git-subtree-split: 56589ab81f6827ff7402e31b24a6d548f29a524f
2025-12-06 17:27:52 +00:00 · 2013-11-21 16:24:10 -08:00
commit b156a49cff
262 changed files with 63141 additions and 0 deletions
--- a/include/rocksdb/arena.h
+++ b/include/rocksdb/arena.h
@@ -0,0 +1,45 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Arena class defines memory allocation methods. It's used by memtable and
+// skiplist.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_ARENA_H_
+#define STORAGE_ROCKSDB_INCLUDE_ARENA_H_
+
+#include <limits>
+#include <memory>
+
+namespace rocksdb {
+
+class Arena {
+ public:
+  Arena() {}
+  virtual ~Arena() {}
+
+  // Return a pointer to a newly allocated memory block of "bytes" bytes.
+  virtual char* Allocate(size_t bytes) = 0;
+
+  // Allocate memory with the normal alignment guarantees provided by malloc.
+  virtual char* AllocateAligned(size_t bytes) = 0;
+
+  // Returns an estimate of the total memory used by arena.
+  virtual const size_t ApproximateMemoryUsage() = 0;
+
+  // Returns the total number of bytes in all blocks allocated so far.
+  virtual const size_t MemoryAllocatedBytes() = 0;
+
+ private:
+  // No copying allowed (deleted, so misuse is rejected at compile time)
+  Arena(const Arena&) = delete;
+  void operator=(const Arena&) = delete;
+};
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_ARENA_H_
--- a/include/rocksdb/c.h
+++ b/include/rocksdb/c.h
@@ -0,0 +1,285 @@
+/*  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+  This source code is licensed under the BSD-style license found in the
+  LICENSE file in the root directory of this source tree. An additional grant
+  of patent rights can be found in the PATENTS file in the same directory.
+ Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+  Use of this source code is governed by a BSD-style license that can be
+  found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+  C bindings for leveldb.  May be useful as a stable ABI that can be
+  used by programs that keep leveldb in a shared library, or for
+  a JNI api.
+
+  Does not support:
+  . getters for the option types
+  . custom comparators that implement key shortening
+  . capturing post-write-snapshot
+  . custom iter, db, env, cache implementations using just the C bindings
+
+  Some conventions:
+
+  (1) We expose just opaque struct pointers and functions to clients.
+  This allows us to change internal representations without having to
+  recompile clients.
+
+  (2) For simplicity, there is no equivalent to the Slice type.  Instead,
+  the caller has to pass the pointer and length as separate
+  arguments.
+
+  (3) Errors are represented by a null-terminated c string.  NULL
+  means no error.  All operations that can raise an error are passed
+  a "char** errptr" as the last argument.  One of the following must
+  be true on entry:
+     *errptr == NULL
+     *errptr points to a malloc()ed null-terminated error message
+  On success, a leveldb routine leaves *errptr unchanged.
+  On failure, leveldb frees the old value of *errptr and
+  set *errptr to a malloc()ed error message.
+
+  (4) Bools have the type unsigned char (0 == false; rest == true)
+
+  (5) All of the pointer arguments must be non-NULL.
+*/
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_C_H_
+#define STORAGE_ROCKSDB_INCLUDE_C_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdint.h>
+
+/* Exported types */
+
+typedef struct leveldb_t               leveldb_t;
+typedef struct leveldb_cache_t         leveldb_cache_t;
+typedef struct leveldb_comparator_t    leveldb_comparator_t;
+typedef struct leveldb_env_t           leveldb_env_t;
+typedef struct leveldb_filelock_t      leveldb_filelock_t;
+typedef struct leveldb_filterpolicy_t  leveldb_filterpolicy_t;
+typedef struct leveldb_iterator_t      leveldb_iterator_t;
+typedef struct leveldb_logger_t        leveldb_logger_t;
+typedef struct leveldb_options_t       leveldb_options_t;
+typedef struct leveldb_randomfile_t    leveldb_randomfile_t;
+typedef struct leveldb_readoptions_t   leveldb_readoptions_t;
+typedef struct leveldb_seqfile_t       leveldb_seqfile_t;
+typedef struct leveldb_snapshot_t      leveldb_snapshot_t;
+typedef struct leveldb_writablefile_t  leveldb_writablefile_t;
+typedef struct leveldb_writebatch_t    leveldb_writebatch_t;
+typedef struct leveldb_writeoptions_t  leveldb_writeoptions_t;
+
+/* DB operations */
+
+extern leveldb_t* leveldb_open(
+    const leveldb_options_t* options,
+    const char* name,
+    char** errptr);
+
+extern void leveldb_close(leveldb_t* db);
+
+extern void leveldb_put(
+    leveldb_t* db,
+    const leveldb_writeoptions_t* options,
+    const char* key, size_t keylen,
+    const char* val, size_t vallen,
+    char** errptr);
+
+extern void leveldb_delete(
+    leveldb_t* db,
+    const leveldb_writeoptions_t* options,
+    const char* key, size_t keylen,
+    char** errptr);
+
+extern void leveldb_write(
+    leveldb_t* db,
+    const leveldb_writeoptions_t* options,
+    leveldb_writebatch_t* batch,
+    char** errptr);
+
+/* Returns NULL if not found.  A malloc()ed array otherwise.
+   Stores the length of the array in *vallen. */
+extern char* leveldb_get(
+    leveldb_t* db,
+    const leveldb_readoptions_t* options,
+    const char* key, size_t keylen,
+    size_t* vallen,
+    char** errptr);
+
+extern leveldb_iterator_t* leveldb_create_iterator(
+    leveldb_t* db,
+    const leveldb_readoptions_t* options);
+
+extern const leveldb_snapshot_t* leveldb_create_snapshot(
+    leveldb_t* db);
+
+extern void leveldb_release_snapshot(
+    leveldb_t* db,
+    const leveldb_snapshot_t* snapshot);
+
+/* Returns NULL if property name is unknown.
+   Else returns a pointer to a malloc()-ed null-terminated value. */
+extern char* leveldb_property_value(
+    leveldb_t* db,
+    const char* propname);
+
+extern void leveldb_approximate_sizes(
+    leveldb_t* db,
+    int num_ranges,
+    const char* const* range_start_key, const size_t* range_start_key_len,
+    const char* const* range_limit_key, const size_t* range_limit_key_len,
+    uint64_t* sizes);
+
+extern void leveldb_compact_range(
+    leveldb_t* db,
+    const char* start_key, size_t start_key_len,
+    const char* limit_key, size_t limit_key_len);
+
+/* Management operations */
+
+extern void leveldb_destroy_db(
+    const leveldb_options_t* options,
+    const char* name,
+    char** errptr);
+
+extern void leveldb_repair_db(
+    const leveldb_options_t* options,
+    const char* name,
+    char** errptr);
+
+/* Iterator */
+
+extern void leveldb_iter_destroy(leveldb_iterator_t*);
+extern unsigned char leveldb_iter_valid(const leveldb_iterator_t*);
+extern void leveldb_iter_seek_to_first(leveldb_iterator_t*);
+extern void leveldb_iter_seek_to_last(leveldb_iterator_t*);
+extern void leveldb_iter_seek(leveldb_iterator_t*, const char* k, size_t klen);
+extern void leveldb_iter_next(leveldb_iterator_t*);
+extern void leveldb_iter_prev(leveldb_iterator_t*);
+extern const char* leveldb_iter_key(const leveldb_iterator_t*, size_t* klen);
+extern const char* leveldb_iter_value(const leveldb_iterator_t*, size_t* vlen);
+extern void leveldb_iter_get_error(const leveldb_iterator_t*, char** errptr);
+
+/* Write batch */
+
+extern leveldb_writebatch_t* leveldb_writebatch_create(void);  /* no params */
+extern void leveldb_writebatch_destroy(leveldb_writebatch_t*);
+extern void leveldb_writebatch_clear(leveldb_writebatch_t*);
+extern void leveldb_writebatch_put(
+    leveldb_writebatch_t*,
+    const char* key, size_t klen,
+    const char* val, size_t vlen);
+extern void leveldb_writebatch_delete(
+    leveldb_writebatch_t*,
+    const char* key, size_t klen);
+extern void leveldb_writebatch_iterate(
+    leveldb_writebatch_t*,
+    void* state,
+    void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
+    void (*deleted)(void*, const char* k, size_t klen));
+
+/* Options */
+
+extern leveldb_options_t* leveldb_options_create(void);  /* no params */
+extern void leveldb_options_destroy(leveldb_options_t*);
+extern void leveldb_options_set_comparator(
+    leveldb_options_t*,
+    leveldb_comparator_t*);
+extern void leveldb_options_set_compression_per_level(
+  leveldb_options_t* opt,
+  int* level_values,
+  size_t num_levels);
+extern void leveldb_options_set_filter_policy(
+    leveldb_options_t*,
+    leveldb_filterpolicy_t*);
+extern void leveldb_options_set_create_if_missing(
+    leveldb_options_t*, unsigned char);
+extern void leveldb_options_set_error_if_exists(
+    leveldb_options_t*, unsigned char);
+extern void leveldb_options_set_paranoid_checks(
+    leveldb_options_t*, unsigned char);
+extern void leveldb_options_set_env(leveldb_options_t*, leveldb_env_t*);
+extern void leveldb_options_set_info_log(leveldb_options_t*, leveldb_logger_t*);
+extern void leveldb_options_set_write_buffer_size(leveldb_options_t*, size_t);
+extern void leveldb_options_set_max_open_files(leveldb_options_t*, int);
+extern void leveldb_options_set_cache(leveldb_options_t*, leveldb_cache_t*);
+extern void leveldb_options_set_block_size(leveldb_options_t*, size_t);
+extern void leveldb_options_set_block_restart_interval(leveldb_options_t*, int);
+extern void leveldb_options_set_compression_options(
+    leveldb_options_t* opt, int w_bits, int level, int strategy);
+
+enum {
+  leveldb_no_compression = 0,
+  leveldb_snappy_compression = 1
+};
+extern void leveldb_options_set_compression(leveldb_options_t*, int);
+
+/* Comparator */
+
+extern leveldb_comparator_t* leveldb_comparator_create(
+    void* state,
+    void (*destructor)(void*),
+    int (*compare)(
+        void*,
+        const char* a, size_t alen,
+        const char* b, size_t blen),
+    const char* (*name)(void*));
+extern void leveldb_comparator_destroy(leveldb_comparator_t*);
+
+/* Filter policy */
+
+extern leveldb_filterpolicy_t* leveldb_filterpolicy_create(
+    void* state,
+    void (*destructor)(void*),
+    char* (*create_filter)(
+        void*,
+        const char* const* key_array, const size_t* key_length_array,
+        int num_keys,
+        size_t* filter_length),
+    unsigned char (*key_may_match)(
+        void*,
+        const char* key, size_t length,
+        const char* filter, size_t filter_length),
+    const char* (*name)(void*));
+extern void leveldb_filterpolicy_destroy(leveldb_filterpolicy_t*);
+
+extern leveldb_filterpolicy_t* leveldb_filterpolicy_create_bloom(
+    int bits_per_key);
+
+/* Read options */
+
+extern leveldb_readoptions_t* leveldb_readoptions_create(void);  /* no params */
+extern void leveldb_readoptions_destroy(leveldb_readoptions_t*);
+extern void leveldb_readoptions_set_verify_checksums(
+    leveldb_readoptions_t*,
+    unsigned char);
+extern void leveldb_readoptions_set_fill_cache(
+    leveldb_readoptions_t*, unsigned char);
+extern void leveldb_readoptions_set_snapshot(
+    leveldb_readoptions_t*,
+    const leveldb_snapshot_t*);
+
+/* Write options */
+
+extern leveldb_writeoptions_t* leveldb_writeoptions_create(void);  /* no params */
+extern void leveldb_writeoptions_destroy(leveldb_writeoptions_t*);
+extern void leveldb_writeoptions_set_sync(
+    leveldb_writeoptions_t*, unsigned char);
+
+/* Cache */
+
+extern leveldb_cache_t* leveldb_cache_create_lru(size_t capacity);
+extern void leveldb_cache_destroy(leveldb_cache_t* cache);
+
+/* Env */
+
+extern leveldb_env_t* leveldb_create_default_env(void);  /* no params */
+extern void leveldb_env_destroy(leveldb_env_t*);
+
+#ifdef __cplusplus
+}  /* end extern "C" */
+#endif
+
+#endif  /* STORAGE_ROCKSDB_INCLUDE_C_H_ */
--- a/include/rocksdb/cache.h
+++ b/include/rocksdb/cache.h
@@ -0,0 +1,122 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A Cache is an interface that maps keys to values.  It has internal
+// synchronization and may be safely accessed concurrently from
+// multiple threads.  It may automatically evict entries to make room
+// for new entries.  Values have a specified charge against the cache
+// capacity.  For example, a cache where the values are variable
+// length strings, may use the length of the string as the charge for
+// the string.
+//
+// A builtin cache implementation with a least-recently-used eviction
+// policy is provided.  Clients may use their own implementations if
+// they want something more sophisticated (like scan-resistance, a
+// custom eviction policy, variable cache sizing, etc.)
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_CACHE_H_
+#define STORAGE_ROCKSDB_INCLUDE_CACHE_H_
+
+#include <memory>
+#include <stdint.h>
+#include "rocksdb/slice.h"
+
+namespace rocksdb {
+
+using std::shared_ptr;
+
+class Cache;
+
+// Create a new cache with a fixed size capacity. The cache is sharded
+// into 2^numShardBits shards, by hash of the key. The total capacity
+// is divided and evenly assigned to each shard. Inside each shard,
+// eviction is done in two passes: first, try to free space by evicting
+// entries that are among the removeScanCountLimit least-used entries
+// and are not referenced by anything other than the cache itself, in
+// least-used order. If not enough space is freed, continue freeing
+// entries in least-used order.
+//
+// The functions without parameter numShardBits and/or removeScanCountLimit
+// use default values. removeScanCountLimit's default value is 0, which
+// means a strict LRU order inside each shard.
+extern shared_ptr<Cache> NewLRUCache(size_t capacity);
+extern shared_ptr<Cache> NewLRUCache(size_t capacity, int numShardBits);
+extern shared_ptr<Cache> NewLRUCache(size_t capacity, int numShardBits,
+                                     int removeScanCountLimit);
+
+class Cache {
+ public:
+  Cache() : rep_(nullptr) { }
+
+  // Destroys all existing entries by calling the "deleter"
+  // function that was passed to Insert() along with each entry.
+  virtual ~Cache();
+
+  // Opaque handle to an entry stored in the cache.
+  struct Handle { };
+
+  // Insert a mapping from key->value into the cache and assign it
+  // the specified charge against the total cache capacity.
+  //
+  // Returns a handle that corresponds to the mapping.  The caller
+  // must call this->Release(handle) when the returned mapping is no
+  // longer needed.
+  //
+  // When the inserted entry is no longer needed, the key and
+  // value will be passed to "deleter".
+  virtual Handle* Insert(const Slice& key, void* value, size_t charge,
+                         void (*deleter)(const Slice& key, void* value)) = 0;
+
+  // If the cache has no mapping for "key", returns nullptr.
+  //
+  // Else return a handle that corresponds to the mapping.  The caller
+  // must call this->Release(handle) when the returned mapping is no
+  // longer needed.
+  virtual Handle* Lookup(const Slice& key) = 0;
+
+  // Release a mapping returned by a previous Insert() or Lookup().
+  // REQUIRES: handle must not have been released yet.
+  // REQUIRES: handle must have been returned by a method on *this.
+  virtual void Release(Handle* handle) = 0;
+
+  // Return the value encapsulated in a handle returned by a
+  // successful Insert() or Lookup().
+  // REQUIRES: handle must not have been released yet.
+  // REQUIRES: handle must have been returned by a method on *this.
+  virtual void* Value(Handle* handle) = 0;
+
+  // If the cache contains entry for key, erase it.  Note that the
+  // underlying entry will be kept around until all existing handles
+  // to it have been released.
+  virtual void Erase(const Slice& key) = 0;
+
+  // Return a new numeric id.  May be used by multiple clients who are
+  // sharing the same cache to partition the key space.  Typically the
+  // client will allocate a new id at startup and prepend the id to
+  // its cache keys.
+  virtual uint64_t NewId() = 0;
+
+  // returns the maximum configured capacity of the cache
+  virtual size_t GetCapacity() = 0;
+
+ private:
+  void LRU_Remove(Handle* e);
+  void LRU_Append(Handle* e);
+  void Unref(Handle* e);
+
+  struct Rep;
+  Rep* rep_;  // initialized to nullptr by the inline constructor above
+
+  // No copying allowed (deleted, so misuse is rejected at compile time)
+  Cache(const Cache&) = delete;
+  void operator=(const Cache&) = delete;
+};
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_CACHE_H_
--- a/include/rocksdb/compaction_filter.h
+++ b/include/rocksdb/compaction_filter.h
@@ -0,0 +1,83 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
+#define STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
+
+#include <memory>
+#include <string>
+
+namespace rocksdb {
+
+class Slice;
+
+// CompactionFilter allows an application to modify/delete a key-value at
+// the time of compaction.
+
+class CompactionFilter {
+ public:
+
+  // Context information of a compaction run
+  struct Context {
+    // Does this compaction run include all data files?
+    bool is_full_compaction;
+  };
+
+  virtual ~CompactionFilter() {}
+
+  // Invoked by the compaction process for a key-value (kv) that is
+  // being compacted.
+  //   Returns false: the kv should be preserved in the output of this
+  //                  compaction run.
+  //   Returns true:  the kv should be removed from the output of the
+  //                  compaction.
+  // The application can inspect the existing value of the key and make
+  // its decision based on it.
+  //
+  // When the value is to be preserved, the application has the option to
+  // pass a modified value back through new_value; value_changed needs to
+  // be set to true in this case.
+  virtual bool Filter(int level,
+                      const Slice& key,
+                      const Slice& existing_value,
+                      std::string* new_value,
+                      bool* value_changed) const = 0;
+
+  // Returns a name that identifies this compaction filter.
+  // The name will be printed to LOG file on start up for diagnosis.
+  virtual const char* Name() const = 0;
+};
+
+// Each compaction will create a new CompactionFilter allowing the
+// application to know about different compactions
+class CompactionFilterFactory {
+ public:
+  virtual ~CompactionFilterFactory() { };
+
+  virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+    const CompactionFilter::Context& context) = 0;
+
+  // Returns a name that identifies this compaction filter factory.
+  virtual const char* Name() const = 0;
+};
+
+// Default implementation of CompactionFilterFactory which does not
+// return any filter: CreateCompactionFilter() always yields a null pointer.
+class DefaultCompactionFilterFactory : public CompactionFilterFactory {
+ public:
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& /*context*/) override {
+    return nullptr;  // the context is intentionally ignored
+  }
+
+  const char* Name() const override {
+    return "DefaultCompactionFilterFactory";
+  }
+};
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
--- a/include/rocksdb/comparator.h
+++ b/include/rocksdb/comparator.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_
+#define STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_
+
+#include <string>
+
+namespace rocksdb {
+
+class Slice;
+
+// A Comparator object provides a total order across slices that are
+// used as keys in an sstable or a database.  A Comparator implementation
+// must be thread-safe since rocksdb may invoke its methods concurrently
+// from multiple threads.
+class Comparator {
+ public:
+  virtual ~Comparator();
+
+  // Three-way comparison.  Returns value:
+  //   < 0 iff "a" < "b",
+  //   == 0 iff "a" == "b",
+  //   > 0 iff "a" > "b"
+  virtual int Compare(const Slice& a, const Slice& b) const = 0;
+
+  // The name of the comparator.  Used to check for comparator
+  // mismatches (i.e., a DB created with one comparator is
+  // accessed using a different comparator).
+  //
+  // The client of this package should switch to a new name whenever
+  // the comparator implementation changes in a way that will cause
+  // the relative ordering of any two keys to change.
+  //
+  // Names starting with "rocksdb." are reserved and should not be used
+  // by any clients of this package.
+  virtual const char* Name() const = 0;
+
+  // Advanced functions: these are used to reduce the space requirements
+  // for internal data structures like index blocks.
+
+  // If *start < limit, changes *start to a short string in [start,limit).
+  // Simple comparator implementations may return with *start unchanged,
+  // i.e., an implementation of this method that does nothing is correct.
+  virtual void FindShortestSeparator(
+      std::string* start,
+      const Slice& limit) const = 0;
+
+  // Changes *key to a short string >= *key.
+  // Simple comparator implementations may return with *key unchanged,
+  // i.e., an implementation of this method that does nothing is correct.
+  virtual void FindShortSuccessor(std::string* key) const = 0;
+};
+
+// Return a builtin comparator that uses lexicographic byte-wise
+// ordering.  The result remains the property of this module and
+// must not be deleted.
+extern const Comparator* BytewiseComparator();
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_
--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@@ -0,0 +1,307 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_DB_H_
+#define STORAGE_ROCKSDB_INCLUDE_DB_H_
+
+#include <stdint.h>
+#include <stdio.h>
+#include <memory>
+#include <vector>
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/types.h"
+#include "rocksdb/transaction_log.h"
+
+namespace rocksdb {
+
+using std::unique_ptr;
+
+// Update Makefile if you change these
+static const int kMajorVersion = 2;
+static const int kMinorVersion = 0;
+
+struct Options;
+struct ReadOptions;
+struct WriteOptions;
+struct FlushOptions;
+class WriteBatch;
+
+// Metadata associated with each SST file.
+struct LiveFileMetaData {
+  std::string name;        // Name of the file
+  int level;               // Level at which this file resides.
+  size_t size;             // File size in bytes.
+  std::string smallestkey; // Smallest user defined key in the file.
+  std::string largestkey;  // Largest user defined key in the file.
+  SequenceNumber smallest_seqno; // smallest seqno in file
+  SequenceNumber largest_seqno;  // largest seqno in file
+};
+
+// Abstract handle to particular state of a DB.
+// A Snapshot is an immutable object and can therefore be safely
+// accessed from multiple threads without any external synchronization.
+class Snapshot {
+ protected:
+  virtual ~Snapshot();
+};
+
+// A range of keys
+struct Range {
+  Slice start;          // Included in the range
+  Slice limit;          // Not included in the range
+
+  Range() { }
+  Range(const Slice& s, const Slice& l) : start(s), limit(l) { }
+};
+
+// A DB is a persistent ordered map from keys to values.
+// A DB is safe for concurrent access from multiple threads without
+// any external synchronization.
+class DB {
+ public:
+  // Open the database with the specified "name".
+  // Stores a pointer to a heap-allocated database in *dbptr and returns
+  // OK on success.
+  // Stores nullptr in *dbptr and returns a non-OK status on error.
+  // Caller should delete *dbptr when it is no longer needed.
+  static Status Open(const Options& options,
+                     const std::string& name,
+                     DB** dbptr);
+
+  // Open the database for read only. All DB interfaces
+  // that modify data, like put/delete, will return error.
+  // If the db is opened in read only mode, then no compactions
+  // will happen.
+  static Status OpenForReadOnly(const Options& options,
+      const std::string& name, DB** dbptr,
+      bool error_if_log_file_exist = false);
+
+  DB() { }
+  virtual ~DB();
+
+  // Set the database entry for "key" to "value".
+  // Returns OK on success, and a non-OK status on error.
+  // Note: consider setting options.sync = true.
+  virtual Status Put(const WriteOptions& options,
+                     const Slice& key,
+                     const Slice& value) = 0;
+
+  // Remove the database entry (if any) for "key".  Returns OK on
+  // success, and a non-OK status on error.  It is not an error if "key"
+  // did not exist in the database.
+  // Note: consider setting options.sync = true.
+  virtual Status Delete(const WriteOptions& options, const Slice& key) = 0;
+
+  // Merge the database entry for "key" with "value".  Returns OK on success,
+  // and a non-OK status on error. The semantics of this operation is
+  // determined by the user provided merge_operator when opening DB.
+  // Note: consider setting options.sync = true.
+  virtual Status Merge(const WriteOptions& options,
+                       const Slice& key,
+                       const Slice& value) = 0;
+
+  // Apply the specified updates to the database.
+  // Returns OK on success, non-OK on failure.
+  // Note: consider setting options.sync = true.
+  virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0;
+
+  // If the database contains an entry for "key" store the
+  // corresponding value in *value and return OK.
+  //
+  // If there is no entry for "key" leave *value unchanged and return
+  // a status for which Status::IsNotFound() returns true.
+  //
+  // May return some other Status on an error.
+  virtual Status Get(const ReadOptions& options,
+                     const Slice& key,
+                     std::string* value) = 0;
+
+  // If keys[i] does not exist in the database, then the i'th returned
+  // status will be one for which Status::IsNotFound() is true, and
+  // (*values)[i] will be set to some arbitrary value (often ""). Otherwise,
+  // the i'th returned status will have Status::ok() true, and (*values)[i]
+  // will store the value associated with keys[i].
+  //
+  // (*values) will always be resized to be the same size as (keys).
+  // Similarly, the number of returned statuses will be the number of keys.
+  // Note: keys will not be "de-duplicated". Duplicate keys will return
+  // duplicate values in order.
+  virtual std::vector<Status> MultiGet(const ReadOptions& options,
+                                       const std::vector<Slice>& keys,
+                                       std::vector<std::string>* values) = 0;
+
+  // If the key definitely does not exist in the database, then this method
+  // returns false, else true. If the caller wants to obtain value when the key
+  // is found in memory, a bool for 'value_found' must be passed. 'value_found'
+  // will be true on return if value has been set properly.
+  // This check is potentially lighter-weight than invoking DB::Get(). One way
+  // to make this lighter weight is to avoid doing any IOs.
+  // Default implementation: returns true; sets *value_found = false if non-null.
+  virtual bool KeyMayExist(const ReadOptions& /*options*/,
+                           const Slice& /*key*/,
+                           std::string* /*value*/,
+                           bool* value_found = nullptr) {
+    if (value_found != nullptr) {
+      *value_found = false;
+    }
+    return true;
+  }
+
+  // Return a heap-allocated iterator over the contents of the database.
+  // The result of NewIterator() is initially invalid (caller must
+  // call one of the Seek methods on the iterator before using it).
+  //
+  // Caller should delete the iterator when it is no longer needed.
+  // The returned iterator should be deleted before this db is deleted.
+  virtual Iterator* NewIterator(const ReadOptions& options) = 0;
+
+  // Return a handle to the current DB state.  Iterators created with
+  // this handle will all observe a stable snapshot of the current DB
+  // state.  The caller must call ReleaseSnapshot(result) when the
+  // snapshot is no longer needed.
+  virtual const Snapshot* GetSnapshot() = 0;
+
+  // Release a previously acquired snapshot.  The caller must not
+  // use "snapshot" after this call.
+  virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0;
+
+  // DB implementations can export properties about their state
+  // via this method.  If "property" is a valid property understood by this
+  // DB implementation, fills "*value" with its current value and returns
+  // true.  Otherwise returns false.
+  //
+  //
+  // Valid property names include:
+  //
+  //  "rocksdb.num-files-at-level<N>" - return the number of files at level <N>,
+  //     where <N> is an ASCII representation of a level number (e.g. "0").
+  //  "rocksdb.stats" - returns a multi-line string that describes statistics
+  //     about the internal operation of the DB.
+  //  "rocksdb.sstables" - returns a multi-line string that describes all
+  //     of the sstables that make up the db contents.
+  virtual bool GetProperty(const Slice& property, std::string* value) = 0;
+
+  // For each i in [0,n-1], store in "sizes[i]", the approximate
+  // file system space used by keys in "[range[i].start .. range[i].limit)".
+  //
+  // Note that the returned sizes measure file system space usage, so
+  // if the user data compresses by a factor of ten, the returned
+  // sizes will be one-tenth the size of the corresponding user data size.
+  //
+  // The results may not include the sizes of recently written data.
+  virtual void GetApproximateSizes(const Range* range, int n,
+                                   uint64_t* sizes) = 0;
+
+  // Compact the underlying storage for the key range [*begin,*end].
+  // In particular, deleted and overwritten versions are discarded,
+  // and the data is rearranged to reduce the cost of operations
+  // needed to access the data.  This operation should typically only
+  // be invoked by users who understand the underlying implementation.
+  //
+  // begin==nullptr is treated as a key before all keys in the database.
+  // end==nullptr is treated as a key after all keys in the database.
+  // Therefore the following call will compact the entire database:
+  //    db->CompactRange(nullptr, nullptr);
+  // Note that after the entire database is compacted, all data are pushed
+  // down to the last level containing any data. If the total data size
+  // after compaction is reduced, that level might not be appropriate for
+  // hosting all the files. In this case, client could set reduce_level
+  // to true, to move the files back to the minimum level capable of holding
+  // the data set or a given level (specified by non-negative target_level).
+  virtual void CompactRange(const Slice* begin, const Slice* end,
+                            bool reduce_level = false,
+                            int target_level = -1) = 0;
+
+  // Number of levels used for this DB.
+  virtual int NumberLevels() = 0;
+
+  // Maximum level to which a new compacted memtable is pushed if it
+  // does not create overlap.
+  virtual int MaxMemCompactionLevel() = 0;
+
+  // Number of files in level-0 that would stop writes.
+  virtual int Level0StopWriteTrigger() = 0;
+
+  // Flush all mem-table data.
+  virtual Status Flush(const FlushOptions& options) = 0;
+
+  // Prevent file deletions. Compactions will continue to occur,
+  // but no obsolete files will be deleted. Calling this multiple
+  // times has the same effect as calling it once.
+  virtual Status DisableFileDeletions() = 0;
+
+  // Allow compactions to delete obsolete files.
+  virtual Status EnableFileDeletions() = 0;
+
+  // GetLiveFiles followed by GetSortedWalFiles can generate a lossless backup
+
+  // THIS METHOD IS DEPRECATED. Use GetLiveFilesMetaData to get more
+  // detailed information on the live files.
+  // Retrieve the list of all files in the database. The files are
+  // relative to the dbname and are not absolute paths. The valid size of the
+  // manifest file is returned in manifest_file_size. The manifest file is an
+  // ever growing file, but only the portion specified by manifest_file_size is
+  // valid for this snapshot.
+  // Setting flush_memtable to true does Flush before recording the live files.
+  // Setting flush_memtable to false is useful when we don't want to wait for
+  // flush which may have to wait for compaction to complete taking an
+  // indeterminate time. But this will have to use GetSortedWalFiles after
+  // GetLiveFiles to compensate for memtables missed in this snapshot due to the
+  // absence of Flush, by WAL files to recover the database consistently later
+  virtual Status GetLiveFiles(std::vector<std::string>&,
+                              uint64_t* manifest_file_size,
+                              bool flush_memtable = true) = 0;
+
+  // Retrieve the sorted list of all wal files with earliest file first
+  virtual Status GetSortedWalFiles(VectorLogPtr& files) = 0;
+
+  // The sequence number of the most recent transaction.
+  virtual SequenceNumber GetLatestSequenceNumber() const = 0;
+
+  // Sets iter to an iterator that is positioned at a write-batch containing
+  // seq_number. If the sequence number does not exist, it returns an iterator
+  // at the first available seq_no after the requested seq_no.
+  // Returns Status::OK if the iterator is valid.
+  // Must set WAL_ttl_seconds or WAL_size_limit_MB to large values to
+  // use this api, else the WAL files will get
+  // cleared aggressively and the iterator might keep becoming invalid before
+  // an update is read.
+  virtual Status GetUpdatesSince(SequenceNumber seq_number,
+                                 unique_ptr<TransactionLogIterator>* iter) = 0;
+
+  // Delete the file name from the db directory and update the internal state to
+  // reflect that. Supports deletion of sst and log files only. 'name' must be
+  // path relative to the db directory. eg. 000001.sst, /archive/000003.log
+  virtual Status DeleteFile(std::string name) = 0;
+
+  // Returns a list of all table files with their level, start key
+  // and end key
+  virtual void GetLiveFilesMetaData(
+    std::vector<LiveFileMetaData> *metadata) {
+  }
+
+ private:
+  // No copying allowed
+  DB(const DB&);
+  void operator=(const DB&);
+};
+
+// Destroy the contents of the specified database.
+// Be very careful using this method.
+Status DestroyDB(const std::string& name, const Options& options);
+
+// If a DB cannot be opened, you may attempt to call this method to
+// resurrect as much of the contents of the database as possible.
+// Some data may be lost, so be careful when calling this function
+// on a database that contains important information.
+Status RepairDB(const std::string& dbname, const Options& options);
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_DB_H_
--- a/include/rocksdb/env.h
+++ b/include/rocksdb/env.h
@@ -0,0 +1,649 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// An Env is an interface used by the rocksdb implementation to access
+// operating system functionality like the filesystem etc.  Callers
+// may wish to provide a custom Env object when opening a database to
+// get fine grain control; e.g., to rate limit file system operations.
+//
+// All Env implementations are safe for concurrent access from
+// multiple threads without any external synchronization.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_ENV_H_
+#define STORAGE_ROCKSDB_INCLUDE_ENV_H_
+
+#include <cstdarg>
+#include <string>
+#include <memory>
+#include <vector>
+#include <stdint.h>
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+class FileLock;
+class Logger;
+class RandomAccessFile;
+class SequentialFile;
+class Slice;
+class WritableFile;
+class RandomRWFile;
+struct Options;
+
+using std::unique_ptr;
+using std::shared_ptr;
+
+
+// Options while opening a file to read/write
+struct EnvOptions {
+
+  // Build an EnvOptions with default settings.
+  EnvOptions();
+
+  // Build an EnvOptions from the given Options.
+  explicit EnvOptions(const Options& options);
+
+  // When true, data may be cached in environment buffers.
+  bool use_os_buffer = true;
+
+  // When true, data is read through mmap.
+  bool use_mmap_reads = false;
+
+  // When true, data is written through mmap.
+  bool use_mmap_writes = true;
+
+  // When true, FD_CLOEXEC is set on the opened fd.
+  bool set_fd_cloexec = true;
+
+  // Lets the OS sync a file to disk incrementally, in the background, while
+  // it is being written: one request is issued for every bytes_per_sync
+  // bytes written. 0 turns it off.
+  // Default: 0
+  uint64_t bytes_per_sync = 0;
+};
+
+class Env {
+ public:
+  Env() { }
+  virtual ~Env();
+
+  // Return a default environment suitable for the current operating
+  // system.  Sophisticated users may wish to provide their own Env
+  // implementation instead of relying on this default environment.
+  //
+  // The result of Default() belongs to rocksdb and must never be deleted.
+  static Env* Default();
+
+  // Create a brand new sequentially-readable file with the specified name.
+  // On success, stores a pointer to the new file in *result and returns OK.
+  // On failure stores nullptr in *result and returns non-OK.  If the file does
+  // not exist, returns a non-OK status.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  virtual Status NewSequentialFile(const std::string& fname,
+                                   unique_ptr<SequentialFile>* result,
+                                   const EnvOptions& options)
+                                   = 0;
+
+  // Create a brand new random access read-only file with the
+  // specified name.  On success, stores a pointer to the new file in
+  // *result and returns OK.  On failure stores nullptr in *result and
+  // returns non-OK.  If the file does not exist, returns a non-OK
+  // status.
+  //
+  // The returned file may be concurrently accessed by multiple threads.
+  virtual Status NewRandomAccessFile(const std::string& fname,
+                                     unique_ptr<RandomAccessFile>* result,
+                                     const EnvOptions& options)
+                                     = 0;
+
+  // Create an object that writes to a new file with the specified
+  // name.  Deletes any existing file with the same name and creates a
+  // new file.  On success, stores a pointer to the new file in
+  // *result and returns OK.  On failure stores nullptr in *result and
+  // returns non-OK.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  virtual Status NewWritableFile(const std::string& fname,
+                                 unique_ptr<WritableFile>* result,
+                                 const EnvOptions& options) = 0;
+
+  // Create an object that both reads and writes to a file on
+  // specified offsets (random access). If file already exists,
+  // does not overwrite it. On success, stores a pointer to the
+  // new file in *result and returns OK. On failure stores nullptr
+  // in *result and returns non-OK.
+  virtual Status NewRandomRWFile(const std::string& fname,
+                                 unique_ptr<RandomRWFile>* result,
+                                 const EnvOptions& options) = 0;
+
+  // Returns true iff the named file exists.
+  virtual bool FileExists(const std::string& fname) = 0;
+
+  // Store in *result the names of the children of the specified directory.
+  // The names are relative to "dir".
+  // Original contents of *result are dropped.
+  virtual Status GetChildren(const std::string& dir,
+                             std::vector<std::string>* result) = 0;
+
+  // Delete the named file.
+  virtual Status DeleteFile(const std::string& fname) = 0;
+
+  // Create the specified directory. Returns error if directory exists.
+  virtual Status CreateDir(const std::string& dirname) = 0;
+
+  // Creates the directory if it is missing. Returns OK if it already exists
+  // or was successfully created.
+  virtual Status CreateDirIfMissing(const std::string& dirname) = 0;
+
+  // Delete the specified directory.
+  virtual Status DeleteDir(const std::string& dirname) = 0;
+
+  // Store the size of fname in *file_size.
+  virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) = 0;
+
+  // Store the last modification time of fname in *file_mtime.
+  virtual Status GetFileModificationTime(const std::string& fname,
+                                         uint64_t* file_mtime) = 0;
+  // Rename file src to target.
+  virtual Status RenameFile(const std::string& src,
+                            const std::string& target) = 0;
+
+  // Lock the specified file.  Used to prevent concurrent access to
+  // the same db by multiple processes.  On failure, stores nullptr in
+  // *lock and returns non-OK.
+  //
+  // On success, stores a pointer to the object that represents the
+  // acquired lock in *lock and returns OK.  The caller should call
+  // UnlockFile(*lock) to release the lock.  If the process exits,
+  // the lock will be automatically released.
+  //
+  // If somebody else already holds the lock, finishes immediately
+  // with a failure.  I.e., this call does not wait for existing locks
+  // to go away.
+  //
+  // May create the named file if it does not already exist.
+  virtual Status LockFile(const std::string& fname, FileLock** lock) = 0;
+
+  // Release the lock acquired by a previous successful call to LockFile.
+  // REQUIRES: lock was returned by a successful LockFile() call
+  // REQUIRES: lock has not already been unlocked.
+  virtual Status UnlockFile(FileLock* lock) = 0;
+
+  enum Priority { LOW, HIGH, TOTAL };  // thread pool selector for Schedule()
+
+  // Arrange to run "(*function)(arg)" once in a background thread, in
+  // the thread pool specified by pri. By default, jobs go to the 'LOW'
+  // priority thread pool.
+  //
+  // "function" may run in an unspecified thread.  Multiple functions
+  // added to the same Env may run concurrently in different threads.
+  // I.e., the caller may not assume that background work items are
+  // serialized.
+  virtual void Schedule(
+      void (*function)(void* arg),
+      void* arg,
+      Priority pri = LOW) = 0;
+
+  // Start a new thread, invoking "function(arg)" within the new thread.
+  // When "function(arg)" returns, the thread will be destroyed.
+  virtual void StartThread(void (*function)(void* arg), void* arg) = 0;
+
+  // *path is set to a temporary directory that can be used for testing. It may
+  // or may not have just been created. The directory may or may not differ
+  // between runs of the same process, but subsequent calls will return the
+  // same directory.
+  virtual Status GetTestDirectory(std::string* path) = 0;
+
+  // Create and return a log file for storing informational messages.
+  virtual Status NewLogger(const std::string& fname,
+                           shared_ptr<Logger>* result) = 0;
+
+  // Returns the number of micro-seconds since some fixed point in time. Only
+  // useful for computing deltas of time.
+  virtual uint64_t NowMicros() = 0;
+
+  // Returns the number of nano-seconds since some fixed point in time. Only
+  // useful for computing deltas of time in one run.
+  // Default implementation returns NowMicros() * 1000 (microsecond precision).
+  virtual uint64_t NowNanos() {
+    return NowMicros() * 1000;
+  }
+
+  // Sleep/delay the thread for the prescribed number of micro-seconds.
+  virtual void SleepForMicroseconds(int micros) = 0;
+
+  // Get the current host name.
+  virtual Status GetHostName(char* name, uint64_t len) = 0;
+
+  // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC).
+  virtual Status GetCurrentTime(int64_t* unix_time) = 0;
+
+  // Get full directory name for this db.
+  virtual Status GetAbsolutePath(const std::string& db_path,
+      std::string* output_path) = 0;
+
+  // The number of background worker threads of a specific thread pool
+  // for this environment. 'LOW' is the default pool.
+  // default number: 1
+  virtual void SetBackgroundThreads(int number, Priority pri = LOW) = 0;
+
+  // Converts seconds-since-Jan-01-1970 to a printable string
+  virtual std::string TimeToString(uint64_t time) = 0;
+
+  // Generates a unique id that can be used to identify a db
+  virtual std::string GenerateUniqueId();
+
+ private:
+  // No copying allowed
+  Env(const Env&);
+  void operator=(const Env&);
+};
+
+// A file abstraction for reading sequentially through a file
+class SequentialFile {
+ public:
+  SequentialFile() { }
+  virtual ~SequentialFile();
+
+  // Read up to "n" bytes from the file.  "scratch[0..n-1]" may be
+  // written by this routine.  Sets "*result" to the data that was
+  // read (including if fewer than "n" bytes were successfully read).
+  // May set "*result" to point at data in "scratch[0..n-1]", so
+  // "scratch[0..n-1]" must be live when "*result" is used.
+  // If an error was encountered, returns a non-OK status.
+  //
+  // REQUIRES: External synchronization
+  virtual Status Read(size_t n, Slice* result, char* scratch) = 0;
+
+  // Skip "n" bytes from the file. This is guaranteed to be no
+  // slower than reading the same data, but may be faster.
+  //
+  // If end of file is reached, skipping will stop at the end of the
+  // file, and Skip will return OK.
+  //
+  // REQUIRES: External synchronization
+  virtual Status Skip(uint64_t n) = 0;
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // A noop if nothing is cached. This default returns Status::NotSupported.
+  virtual Status InvalidateCache(size_t offset, size_t length) {
+    return Status::NotSupported("InvalidateCache not supported.");
+  }
+};
+
+// A file abstraction for randomly reading the contents of a file.
+class RandomAccessFile {
+ public:
+  RandomAccessFile() { }
+  virtual ~RandomAccessFile();
+
+  // Read up to "n" bytes from the file starting at "offset".
+  // "scratch[0..n-1]" may be written by this routine.  Sets "*result"
+  // to the data that was read (including if fewer than "n" bytes were
+  // successfully read).  May set "*result" to point at data in
+  // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
+  // "*result" is used.  If an error was encountered, returns a non-OK
+  // status.
+  //
+  // Safe for concurrent use by multiple threads.
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const = 0;
+
+  // Tries to get a unique ID for this file that will be the same each time
+  // the file is opened (and will stay the same while the file is open).
+  // Furthermore, it tries to make this ID at most "max_size" bytes. If such an
+  // ID can be created this function returns the length of the ID and places it
+  // in "id"; otherwise, this function returns 0, in which case "id"
+  // may not have been modified.
+  //
+  // This function guarantees, for IDs from a given environment, two unique ids
+  // cannot be made equal to each other by adding arbitrary bytes to one of
+  // them. That is, no unique ID is the prefix of another.
+  //
+  // This function guarantees that the returned ID will not be interpretable as
+  // a single varint.
+  //
+  // Note: these IDs are only valid for the duration of the process.
+  virtual size_t GetUniqueId(char* id, size_t max_size) const {
+    return 0; // Default implementation to prevent issues with backwards
+              // compatibility.
+  }
+
+  // Access patterns that a caller may pass to Hint().
+  enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED };
+  // Advise the file of the expected access pattern. This default ignores it.
+  virtual void Hint(AccessPattern pattern) {}
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // A noop if nothing is cached. This default returns Status::NotSupported.
+  virtual Status InvalidateCache(size_t offset, size_t length) {
+    return Status::NotSupported("InvalidateCache not supported.");
+  }
+};
+
+// A file abstraction for sequential writing.  The implementation
+// must provide buffering since callers may append small fragments
+// at a time to the file.
+class WritableFile {
+ public:
+  WritableFile() : last_preallocated_block_(0), preallocation_block_size_ (0) {
+  }
+  virtual ~WritableFile();
+
+  virtual Status Append(const Slice& data) = 0;
+  virtual Status Close() = 0;
+  virtual Status Flush() = 0;
+  virtual Status Sync() = 0; // sync data
+
+  /*
+   * Sync data and/or metadata as well.
+   * By default, sync only data.
+   * Override this method for environments where we need to sync
+   * metadata as well.
+   */
+  virtual Status Fsync() {
+    return Sync();
+  }
+
+  /*
+   * Get the size of valid data in the file. This default returns 0.
+   */
+  virtual uint64_t GetFileSize() {
+    return 0;
+  }
+
+  /*
+   * Get and set the default pre-allocation block size for writes to
+   * this file.  If non-zero, then Allocate will be used to extend the
+   * underlying storage of a file (generally via fallocate) if the Env
+   * instance supports it.
+   */
+  void SetPreallocationBlockSize(size_t size) {
+    preallocation_block_size_ = size;
+  }
+
+  virtual void GetPreallocationStatus(size_t* block_size,
+                                      size_t* last_allocated_block) {
+    *last_allocated_block = last_preallocated_block_;
+    *block_size = preallocation_block_size_;
+  }
+
+  // For documentation, refer to RandomAccessFile::GetUniqueId()
+  virtual size_t GetUniqueId(char* id, size_t max_size) const {
+    return 0; // Default implementation, for backwards compatibility.
+  }
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  // This call has no effect on dirty pages in the cache.
+  virtual Status InvalidateCache(size_t offset, size_t length) {
+    return Status::NotSupported("InvalidateCache not supported.");
+  }
+
+ protected:
+  // PrepareWrite performs any necessary preparation for a write
+  // before the write actually occurs.  This allows for pre-allocation
+  // of space on devices where it can result in less file
+  // fragmentation and/or less waste from over-zealous filesystem
+  // pre-allocation.
+  void PrepareWrite(size_t offset, size_t len) {
+    if (preallocation_block_size_ == 0) {
+      return;
+    }
+    // If this write would cross one or more preallocation blocks,
+    // determine what the last preallocation block necessary to
+    // cover this write would be and Allocate to that point.
+    const auto block_size = preallocation_block_size_;
+    size_t new_last_preallocated_block =
+      (offset + len + block_size - 1) / block_size;
+    if (new_last_preallocated_block > last_preallocated_block_) {
+      size_t num_spanned_blocks =
+        new_last_preallocated_block - last_preallocated_block_;
+      Allocate(block_size * last_preallocated_block_,
+               block_size * num_spanned_blocks);
+      last_preallocated_block_ = new_last_preallocated_block;
+    }
+  }
+
+  /*
+   * Pre-allocate space for a file. This default does nothing and returns OK.
+   */
+  virtual Status Allocate(off_t offset, off_t len) {
+    return Status::OK();
+  }
+
+  // Sync a file range with disk.
+  // offset is the starting byte of the file range to be synchronized.
+  // nbytes specifies the length of the range to be synchronized.
+  // This asks the OS to initiate flushing the cached data to disk,
+  // without waiting for completion.
+  // Default implementation does nothing.
+  virtual Status RangeSync(off_t offset, off_t nbytes) {
+    return Status::OK();
+  }
+
+ private:
+  size_t last_preallocated_block_;  // updated by PrepareWrite()
+  size_t preallocation_block_size_;  // 0 means PrepareWrite() does nothing
+  // No copying allowed
+  WritableFile(const WritableFile&);
+  void operator=(const WritableFile&);
+};
+
+// A file abstraction for random reading and writing.
+class RandomRWFile {
+ public:
+  RandomRWFile() {}
+  virtual ~RandomRWFile() {}
+
+  // Write the contents of "data" to the file starting at "offset".
+  // Returns IOError on failure, but does not guarantee
+  // atomicity of a write.  Returns OK status on success.
+  //
+  // Safe for concurrent use.
+  virtual Status Write(uint64_t offset, const Slice& data) = 0;
+  // Read up to "n" bytes from the file starting at "offset".
+  // "scratch[0..n-1]" may be written by this routine.  Sets "*result"
+  // to the data that was read (including if fewer than "n" bytes were
+  // successfully read).  May set "*result" to point at data in
+  // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
+  // "*result" is used.  If an error was encountered, returns a non-OK
+  // status.
+  //
+  // Safe for concurrent use by multiple threads.
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const = 0;
+  virtual Status Close() = 0; // closes the file
+  virtual Status Sync() = 0; // sync data
+
+  /*
+   * Sync data and/or metadata as well.
+   * By default, sync only data (this default simply calls Sync()).
+   * Override this method for environments where we need to sync
+   * metadata as well.
+   */
+  virtual Status Fsync() {
+    return Sync();
+  }
+
+  /*
+   * Pre-allocate space for a file. This default does nothing and returns OK.
+   */
+  virtual Status Allocate(off_t offset, off_t len) {
+    return Status::OK();
+  }
+
+ private:
+  // No copying allowed
+  RandomRWFile(const RandomRWFile&);
+  void operator=(const RandomRWFile&);
+};
+
+// An interface for writing log messages.
+class Logger {
+ public:
+  enum { DO_NOT_SUPPORT_GET_LOG_FILE_SIZE = -1 };
+  Logger() { }
+  virtual ~Logger();
+
+  // Write an entry to the log file with the specified format.
+  virtual void Logv(const char* format, va_list ap) = 0;
+  virtual size_t GetLogFileSize() const {
+    return DO_NOT_SUPPORT_GET_LOG_FILE_SIZE;  // -1, converted to size_t
+  }
+  // Flush to the OS buffers. This default implementation does nothing.
+  virtual void Flush() {}
+
+ private:
+  // No copying allowed
+  Logger(const Logger&);
+  void operator=(const Logger&);
+};
+
+
+// Identifies a locked file.
+class FileLock {
+ public:
+  FileLock() { }
+  virtual ~FileLock();
+ private:
+  // No copying allowed: the copy operations are private and not defined here.
+  FileLock(const FileLock&);
+  void operator=(const FileLock&);
+};
+
+
+extern void LogFlush(const shared_ptr<Logger>& info_log);
+
+// Log the specified data to *info_log if info_log is non-nullptr.
+extern void Log(const shared_ptr<Logger>& info_log, const char* format, ...)
+#   if defined(__GNUC__) || defined(__clang__)
+    __attribute__((__format__ (__printf__, 2, 3)))
+#   endif
+    ;
+
+extern void LogFlush(Logger *info_log);
+
+extern void Log(Logger* info_log, const char* format, ...)
+#   if defined(__GNUC__) || defined(__clang__)
+    __attribute__((__format__ (__printf__, 2, 3)))
+#   endif
+    ;
+
+// A utility routine: write "data" to the named file.
+extern Status WriteStringToFile(Env* env, const Slice& data,
+                                const std::string& fname);
+
+// A utility routine: read contents of named file into *data
+extern Status ReadFileToString(Env* env, const std::string& fname,
+                               std::string* data);
+
+// An implementation of Env that forwards all calls to another Env.
+// May be useful to clients who wish to override just part of the
+// functionality of another Env.
+class EnvWrapper : public Env {
+ public:
+  // Initialize an EnvWrapper that delegates all calls to *t
+  explicit EnvWrapper(Env* t) : target_(t) { }
+  virtual ~EnvWrapper();
+
+  // Return the target to which this Env forwards all calls
+  Env* target() const { return target_; }
+
+  // The following text is boilerplate that forwards every virtual method of
+  // Env to target(), including the ones Env gives a default implementation.
+  Status NewSequentialFile(const std::string& f,
+                           unique_ptr<SequentialFile>* r,
+                           const EnvOptions& options) {
+    return target_->NewSequentialFile(f, r, options);
+  }
+  Status NewRandomAccessFile(const std::string& f,
+                             unique_ptr<RandomAccessFile>* r,
+                             const EnvOptions& options) {
+    return target_->NewRandomAccessFile(f, r, options);
+  }
+  Status NewWritableFile(const std::string& f, unique_ptr<WritableFile>* r,
+                         const EnvOptions& options) {
+    return target_->NewWritableFile(f, r, options);
+  }
+  Status NewRandomRWFile(const std::string& f, unique_ptr<RandomRWFile>* r,
+                         const EnvOptions& options) {
+    return target_->NewRandomRWFile(f, r, options);
+  }
+  bool FileExists(const std::string& f) { return target_->FileExists(f); }
+  Status GetChildren(const std::string& dir, std::vector<std::string>* r) {
+    return target_->GetChildren(dir, r);
+  }
+  Status DeleteFile(const std::string& f) { return target_->DeleteFile(f); }
+  Status CreateDir(const std::string& d) { return target_->CreateDir(d); }
+  Status CreateDirIfMissing(const std::string& d) {
+    return target_->CreateDirIfMissing(d);
+  }
+  Status DeleteDir(const std::string& d) { return target_->DeleteDir(d); }
+  Status GetFileSize(const std::string& f, uint64_t* s) {
+    return target_->GetFileSize(f, s);
+  }
+  Status GetFileModificationTime(const std::string& fname,
+                                 uint64_t* file_mtime) {
+    return target_->GetFileModificationTime(fname, file_mtime);
+  }
+  Status RenameFile(const std::string& s, const std::string& t) {
+    return target_->RenameFile(s, t);
+  }
+  Status LockFile(const std::string& f, FileLock** l) {
+    return target_->LockFile(f, l);
+  }
+  Status UnlockFile(FileLock* l) { return target_->UnlockFile(l); }
+  void Schedule(void (*f)(void*), void* a, Priority pri) {
+    target_->Schedule(f, a, pri);
+  }
+  void StartThread(void (*f)(void*), void* a) { target_->StartThread(f, a); }
+  Status GetTestDirectory(std::string* path) {
+    return target_->GetTestDirectory(path);
+  }
+  Status NewLogger(const std::string& fname,
+                   shared_ptr<Logger>* result) {
+    return target_->NewLogger(fname, result);
+  }
+  uint64_t NowMicros() { return target_->NowMicros(); }
+  // Forward NowNanos() too: Env's default is NowMicros() * 1000, which would
+  // hide a finer-grained clock in target(). Override both to remap time.
+  uint64_t NowNanos() { return target_->NowNanos(); }
+  void SleepForMicroseconds(int micros) {
+    target_->SleepForMicroseconds(micros);
+  }
+  Status GetHostName(char* name, uint64_t len) {
+    return target_->GetHostName(name, len);
+  }
+  Status GetCurrentTime(int64_t* unix_time) {
+    return target_->GetCurrentTime(unix_time);
+  }
+  Status GetAbsolutePath(const std::string& db_path,
+                         std::string* output_path) {
+    return target_->GetAbsolutePath(db_path, output_path);
+  }
+  void SetBackgroundThreads(int num, Priority pri) {
+    target_->SetBackgroundThreads(num, pri);
+  }
+  std::string TimeToString(uint64_t time) {
+    return target_->TimeToString(time);
+  }
+  // Forwarded so that an id scheme overridden by target() is honored.
+  std::string GenerateUniqueId() { return target_->GenerateUniqueId(); }
+
+ private:
+  Env* target_;
+};
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_ENV_H_
--- a/include/rocksdb/filter_policy.h
+++ b/include/rocksdb/filter_policy.h
@@ -0,0 +1,74 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A database can be configured with a custom FilterPolicy object.
+// This object is responsible for creating a small filter from a set
+// of keys.  These filters are stored in rocksdb and are consulted
+// automatically by rocksdb to decide whether or not to read some
+// information from disk. In many cases, a filter can cut down the
+// number of disk seeks from a handful to a single disk seek per
+// DB::Get() call.
+//
+// Most people will want to use the builtin bloom filter support (see
+// NewBloomFilterPolicy() below).
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_
+#define STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_
+
+#include <string>
+
+namespace rocksdb {
+
+class Slice;
+
+class FilterPolicy {
+ public:
+  virtual ~FilterPolicy();
+
+  // Return the name of this policy.  Note that if the filter encoding
+  // changes in an incompatible way, the name returned by this method
+  // must be changed.  Otherwise, old incompatible filters may be
+  // passed to methods of this type.
+  virtual const char* Name() const = 0;
+
+  // keys[0,n-1] contains a list of keys (potentially with duplicates)
+  // that are ordered according to the user-supplied comparator.
+  // Append a filter that summarizes keys[0,n-1] to *dst.
+  //
+  // Warning: do not change the initial contents of *dst.  Instead,
+  // append the newly constructed filter to *dst.
+  virtual void CreateFilter(const Slice* keys, int n, std::string* dst)
+      const = 0;
+
+  // "filter" contains the data appended by a preceding call to
+  // CreateFilter() on this class.  This method must return true if
+  // the key was in the list of keys passed to CreateFilter().
+  // This method may return true or false if the key was not in the
+  // list, but it should aim to return false with a high probability.
+  virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0;
+};
+
+// Return a new filter policy that uses a bloom filter with approximately
+// the specified number of bits per key.  A good value for bits_per_key
+// is 10, which yields a filter with ~ 1% false positive rate.
+//
+// Callers must delete the result after any database that is using the
+// result has been closed.
+//
+// Note: if you are using a custom comparator that ignores some parts
+// of the keys being compared, you must not use NewBloomFilterPolicy()
+// and must provide your own FilterPolicy that also ignores the
+// corresponding parts of the keys.  For example, if the comparator
+// ignores trailing spaces, it would be incorrect to use a
+// FilterPolicy (like NewBloomFilterPolicy) that does not ignore
+// trailing spaces in keys.
+extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key);
+
+}
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_
--- a/include/rocksdb/flush_block_policy.h
+++ b/include/rocksdb/flush_block_policy.h
@@ -0,0 +1,64 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#include <string>
+
+namespace rocksdb {
+
+class Slice;
+class BlockBuilder;
+
+// FlushBlockPolicy provides a configurable way to determine when to flush a
+// data block in block-based tables.
+class FlushBlockPolicy {
+ public:
+  // Record the next key/value pair of the sequence. Returns true if the table
+  // builder should flush the current data block, false otherwise.
+  virtual bool Update(const Slice& key,
+                      const Slice& value) = 0;
+
+  virtual ~FlushBlockPolicy() { }  // Virtual: destroyed via FlushBlockPolicy*.
+};
+
+class FlushBlockPolicyFactory {
+ public:
+  // Return the name of the flush block policy.
+  virtual const char* Name() const = 0;
+
+  // Return a new flush block policy for data_block_builder. The policy may
+  // need to access the builder's metadata to determine when to flush. How it
+  // decides is up to the implementation (presumably by data size, originally).
+  //
+  // Callers must delete the result after any database that is using the
+  // result has been closed.
+  virtual FlushBlockPolicy* NewFlushBlockPolicy(
+      const BlockBuilder& data_block_builder) const = 0;
+
+  virtual ~FlushBlockPolicyFactory() { }
+};
+
+class FlushBlockBySizePolicyFactory : public FlushBlockPolicyFactory {
+ public:
+  FlushBlockBySizePolicyFactory(const uint64_t block_size,
+                                const uint64_t block_size_deviation) :
+      block_size_(block_size),
+      block_size_deviation_(block_size_deviation) {
+  }  // Both values are stored as const members; nothing else happens here.
+
+  virtual const char* Name() const override {
+    return "FlushBlockBySizePolicyFactory";
+  }
+
+  virtual FlushBlockPolicy* NewFlushBlockPolicy(
+      const BlockBuilder& data_block_builder) const override;  // Defined out of line.
+
+ private:  // NOTE(review): uint64_t is used, but this header only includes <string>.
+  const uint64_t block_size_;  // presumably the target block size -- confirm units
+  const uint64_t block_size_deviation_;  // presumably a tolerance -- confirm units
+};
+
+}  // namespace rocksdb
--- a/include/rocksdb/iterator.h
+++ b/include/rocksdb/iterator.h
@@ -0,0 +1,106 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// An iterator yields a sequence of key/value pairs from a source.
+// The following class defines the interface.  Multiple implementations
+// are provided by this library.  In particular, iterators are provided
+// to access the contents of a Table or a DB.
+//
+// Multiple threads can invoke const methods on an Iterator without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same Iterator must use
+// external synchronization.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_
+#define STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_
+
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+class Iterator {
+ public:
+  Iterator();
+  virtual ~Iterator();
+
+  // An iterator is either positioned at a key/value pair, or
+  // not valid.  This method returns true iff the iterator is valid.
+  virtual bool Valid() const = 0;
+
+  // Position at the first key in the source.  The iterator is Valid()
+  // after this call iff the source is not empty.
+  virtual void SeekToFirst() = 0;
+
+  // Position at the last key in the source.  The iterator is
+  // Valid() after this call iff the source is not empty.
+  virtual void SeekToLast() = 0;
+
+  // Position at the first key in the source that is at or past target.
+  // The iterator is Valid() after this call iff the source contains
+  // an entry that comes at or past target.
+  virtual void Seek(const Slice& target) = 0;
+
+  // Moves to the next entry in the source.  After this call, Valid() is
+  // true iff the iterator was not positioned at the last entry in the source.
+  // REQUIRES: Valid()
+  virtual void Next() = 0;
+
+  // Moves to the previous entry in the source.  After this call, Valid() is
+  // true iff the iterator was not positioned at the first entry in the source.
+  // REQUIRES: Valid()
+  virtual void Prev() = 0;
+
+  // Return the key for the current entry.  The underlying storage for
+  // the returned slice is valid only until the next modification of
+  // the iterator.
+  // REQUIRES: Valid()
+  virtual Slice key() const = 0;
+
+  // Return the value for the current entry.  The underlying storage for
+  // the returned slice is valid only until the next modification of
+  // the iterator.
+  // REQUIRES: Valid()
+  virtual Slice value() const = 0;
+
+  // If an error has occurred, return it.  Else return an ok status.
+  // If non-blocking IO is requested and this operation cannot be
+  // satisfied without doing some IO, then this returns Status::Incomplete().
+  virtual Status status() const = 0;
+
+  // Clients are allowed to register function/arg1/arg2 triples that
+  // will be invoked when this iterator is destroyed.
+  //
+  // Note that unlike all of the preceding methods, this method is
+  // not virtual and therefore clients cannot override it.
+  typedef void (*CleanupFunction)(void* arg1, void* arg2);
+  void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2);
+
+ private:
+  struct Cleanup {  // One registered function/arg1/arg2 triple.
+    CleanupFunction function;
+    void* arg1;
+    void* arg2;
+    Cleanup* next;  // Link to another Cleanup record.
+  };
+  Cleanup cleanup_;  // Held by value; presumably the list head -- confirm in .cc
+
+  // No copying allowed
+  Iterator(const Iterator&);
+  void operator=(const Iterator&);
+};
+
+// Return an empty iterator (yields nothing). NOTE(review): returns a raw
+extern Iterator* NewEmptyIterator();  // pointer; ownership is not stated here.
+
+// Return an empty iterator with the specified status (see Iterator::status()).
+extern Iterator* NewErrorIterator(const Status& status);
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_
--- a/include/rocksdb/ldb_tool.h
+++ b/include/rocksdb/ldb_tool.h
@@ -0,0 +1,18 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+#ifndef STORAGE_ROCKSDB_INCLUDE_LDB_TOOL_H
+#define STORAGE_ROCKSDB_INCLUDE_LDB_TOOL_H
+#include "rocksdb/options.h"
+
+namespace rocksdb {
+
+class LDBTool {  // presumably the entry point of the ldb command-line tool
+ public:  // argc/argv look like main()'s arguments -- confirm against the caller
+  void Run(int argc, char** argv, Options = Options());  // Options by value; defaults to Options()
+};
+
+} // namespace rocksdb
+
+#endif // STORAGE_ROCKSDB_INCLUDE_LDB_TOOL_H
--- a/include/rocksdb/memtablerep.h
+++ b/include/rocksdb/memtablerep.h
@@ -0,0 +1,276 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file contains the interface that must be implemented by any collection
+// to be used as the backing store for a MemTable. Such a collection must
+// satisfy the following properties:
+//  (1) It does not store duplicate items.
+//  (2) It uses MemTableRep::KeyComparator to compare items for iteration and
+//     equality.
+//  (3) It can be accessed concurrently by multiple readers and can support
+//     writes during reads. It needn't support multiple concurrent writes.
+//  (4) Items are never deleted.
+// The liberal use of assertions is encouraged to enforce (1).
+//
+// The factory will be passed an Arena object when a new MemTableRep is
+// requested. The API for this object is in rocksdb/arena.h.
+//
+// Users can implement their own memtable representations. We include five
+// types built in:
+//  - SkipListRep: This is the default; it is backed by a skip list.
+//  - TransformRep: This is backed by a custom hash map.
+// On construction, they are given a SliceTransform object. This
+// object is applied to the user key of stored items which indexes into the
+// hash map to yield a skiplist containing all records that share the same
+// user key under the transform function.
+//  - UnsortedRep: A subclass of TransformRep where the transform function is
+// the identity function. Optimized for point lookups.
+//  - PrefixHashRep: A subclass of TransformRep where the transform function is
+// a fixed-size prefix extractor. If you use PrefixHashRepFactory, the transform
+// must be identical to options.prefix_extractor, otherwise it will be discarded
+// and the default will be used. It is optimized for ranged scans over a
+// prefix.
+//  - VectorRep: This is backed by an unordered std::vector. On iteration, the
+// vector is sorted. It is intelligent about sorting; once the MarkReadOnly()
+// has been called, the vector will only be sorted once. It is optimized for
+// random-write-heavy workloads.
+//
+// The last four implementations are designed for situations in which
+// iteration over the entire collection is rare since doing so requires all the
+// keys to be copied into a sorted data structure.
+
+#ifndef STORAGE_ROCKSDB_DB_MEMTABLEREP_H_
+#define STORAGE_ROCKSDB_DB_MEMTABLEREP_H_
+
+#include <memory>
+#include "rocksdb/arena.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+
+namespace rocksdb {
+
+class MemTableRep {
+ public:
+  // KeyComparator provides a means to compare keys, which are internal keys
+  // concatenated with values.
+  class KeyComparator {
+   public:
+    // Compare a and b. Return a negative value if a is less than b, 0 if they
+    // are equal, and a positive value if a is greater than b.
+    virtual int operator()(const char* a, const char* b) const = 0;
+
+    virtual ~KeyComparator() { }
+  };
+
+  // Insert key into the collection. (The caller will pack key and value into a
+  // single buffer and pass that in as the parameter to Insert.)
+  // REQUIRES: nothing that compares equal to key is currently in the
+  // collection.
+  virtual void Insert(const char* key) = 0;
+
+  // Returns true iff an entry that compares equal to key is in the collection.
+  virtual bool Contains(const char* key) const = 0;
+
+  // Notify this table rep that it will no longer be added to. The default
+  // implementation does nothing.
+  virtual void MarkReadOnly() { }
+
+  // Report an approximation of how much memory has been used other than memory
+  // that was allocated through the arena.
+  virtual size_t ApproximateMemoryUsage() = 0;
+
+  virtual ~MemTableRep() { }
+
+  // Iteration over the contents of the collection.
+  class Iterator {
+   public:
+    // Initialize an iterator over the specified collection.
+    // The returned iterator is not valid.
+    // explicit Iterator(const MemTableRep* collection);
+    virtual ~Iterator() { };
+
+    // Returns true iff the iterator is positioned at a valid node.
+    virtual bool Valid() const = 0;
+
+    // Returns the key at the current position.
+    // REQUIRES: Valid()
+    virtual const char* key() const = 0;
+
+    // Advances to the next position.
+    // REQUIRES: Valid()
+    virtual void Next() = 0;
+
+    // Moves back to the previous position.
+    // REQUIRES: Valid()
+    virtual void Prev() = 0;
+
+    // Advance to the first entry with a key >= target.
+    virtual void Seek(const char* target) = 0;
+
+    // Position at the first entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    virtual void SeekToFirst() = 0;
+
+    // Position at the last entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    virtual void SeekToLast() = 0;
+  };
+
+  // Return an iterator over the keys in this representation.
+  virtual std::shared_ptr<Iterator> GetIterator() = 0;
+
+  // Return an iterator over at least the keys with the specified user key. The
+  // iterator may also allow access to other keys, but doesn't have to. The
+  // default ignores user_key and forwards to GetIterator().
+  virtual std::shared_ptr<Iterator> GetIterator(const Slice& user_key) {
+    return GetIterator();
+  }
+
+  // Return an iterator over at least the keys with the specified prefix. The
+  // iterator may also allow access to other keys, but doesn't have to. The
+  // default ignores prefix and forwards to GetIterator().
+  virtual std::shared_ptr<Iterator> GetPrefixIterator(const Slice& prefix) {
+    return GetIterator();
+  }
+
+  // Return an iterator that has a special Seek semantics. The result of
+  // a Seek might only include keys with the same prefix as the target key.
+  virtual std::shared_ptr<Iterator> GetDynamicPrefixIterator() {
+    return GetIterator();  // Default: the plain full iterator.
+  }
+
+ protected:
+  // When *key is an internal key concatenated with the value, returns the
+  // user key.
+  virtual Slice UserKey(const char* key) const;
+};
+
+// This is the base class for all factories that are used by RocksDB to create
+// new MemTableRep objects. Implementations receive a key comparator and an Arena.
+class MemTableRepFactory {
+ public:
+  virtual ~MemTableRepFactory() { };
+  virtual std::shared_ptr<MemTableRep> CreateMemTableRep(
+    MemTableRep::KeyComparator&, Arena*) = 0;  // Returns a new, shared rep.
+  virtual const char* Name() const = 0;  // Name identifying the factory.
+};
+
+// This creates MemTableReps that are backed by a std::vector. On iteration,
+// the vector is sorted. This is useful for workloads where iteration is very
+// rare and writes are generally not issued after reads begin.
+//
+// Parameters:
+//   count: Passed to the constructor of the underlying std::vector of each
+//     VectorRep, so that space is reserved up front. NOTE(review): originally
+//     documented as "count bytes"; it may mean elements -- confirm in the .cc.
+class VectorRepFactory : public MemTableRepFactory {
+  const size_t count_;  // Fixed at construction; defaults to 0.
+public:
+  explicit VectorRepFactory(size_t count = 0) : count_(count) { }
+  virtual std::shared_ptr<MemTableRep> CreateMemTableRep(
+    MemTableRep::KeyComparator&, Arena*) override;
+  virtual const char* Name() const override {
+    return "VectorRepFactory";
+  }
+};
+
+// This creates MemTableReps that use a skip list to store keys. It is the default.
+class SkipListFactory : public MemTableRepFactory {
+public:
+  virtual std::shared_ptr<MemTableRep> CreateMemTableRep(
+    MemTableRep::KeyComparator&, Arena*) override;  // Defined out of line.
+  virtual const char* Name() const override {
+    return "SkipListFactory";
+  }
+};
+
+// TransformReps are backed by an unordered map of buffers to buckets. When
+// looking up a key, the user key is extracted and a user-supplied transform
+// function (see rocksdb/slice_transform.h) is applied to get the key into the
+// unordered map. This allows the user to bin user keys based on arbitrary
+// criteria. Two example implementations are UnsortedRepFactory and
+// PrefixHashRepFactory.
+//
+// Iteration over the entire collection is implemented by dumping all the keys
+// into a std::set. Thus, these data structures are best used when iteration
+// over the entire collection is rare.
+//
+// Parameters:
+//   transform: The SliceTransform to bucket user keys on. TransformRepFactory
+//     owns the pointer and deletes it in its destructor; do not copy factories.
+//   bucket_count: Passed to the constructor of the underlying
+//     std::unordered_map of each TransformRep. On initialization, the
+//     underlying array will be at least bucket_count size.
+//   num_locks: Number of read-write locks to have for the rep. Each bucket is
+//     hashed onto a read-write lock which controls access to that bucket. More
+//     locks means finer-grained concurrency but more memory overhead.
+class TransformRepFactory : public MemTableRepFactory {
+ public:
+  explicit TransformRepFactory(const SliceTransform* transform,
+    size_t bucket_count, size_t num_locks = 1000)
+    : transform_(transform),
+      bucket_count_(bucket_count),
+      num_locks_(num_locks) { }
+
+  virtual ~TransformRepFactory() { delete transform_; }
+
+  virtual std::shared_ptr<MemTableRep> CreateMemTableRep(
+    MemTableRep::KeyComparator&, Arena*) override;
+
+  virtual const char* Name() const override {
+    return "TransformRepFactory";
+  }
+
+  const SliceTransform* GetTransform() const { return transform_; }
+
+ protected:
+  const SliceTransform* transform_;  // Owned; deleted by the destructor.
+  const size_t bucket_count_;
+  const size_t num_locks_;
+};
+
+// UnsortedReps bin user keys based on an identity function transform -- that
+// is, transform(key) = key. This optimizes for point look-ups.
+//
+// Parameters: See TransformRepFactory.
+class UnsortedRepFactory : public TransformRepFactory {
+public:
+  explicit UnsortedRepFactory(size_t bucket_count = 0, size_t num_locks = 1000)
+    : TransformRepFactory(NewNoopTransform(),  // Base class takes ownership.
+                          bucket_count,
+                          num_locks) { }
+  virtual const char* Name() const override {
+    return "UnsortedRepFactory";
+  }
+};
+
+// PrefixHashReps bin user keys based on a fixed-size prefix. This optimizes for
+// short ranged scans over a given prefix.
+//
+// Parameters: See TransformRepFactory (prefix_extractor is its transform).
+class PrefixHashRepFactory : public TransformRepFactory {
+public:
+  explicit PrefixHashRepFactory(const SliceTransform* prefix_extractor,
+    size_t bucket_count = 0, size_t num_locks = 1000)
+    : TransformRepFactory(prefix_extractor, bucket_count, num_locks)
+    { }  // All arguments are forwarded unchanged to the base class.
+
+  virtual std::shared_ptr<MemTableRep> CreateMemTableRep(
+    MemTableRep::KeyComparator&, Arena*) override;
+
+  virtual const char* Name() const override {
+    return "PrefixHashRepFactory";
+  }
+};
+
+// The same as TransformRepFactory except it doesn't use locks.
+// Experimental; will replace TransformRepFactory once we are sure
+// it performs better. bucket_count defaults to 1000000.
+extern MemTableRepFactory* NewHashSkipListRepFactory(
+    const SliceTransform* transform, size_t bucket_count = 1000000);
+
+}  // namespace rocksdb
+
+#endif // STORAGE_ROCKSDB_DB_MEMTABLEREP_H_
--- a/include/rocksdb/merge_operator.h
+++ b/include/rocksdb/merge_operator.h
@@ -0,0 +1,148 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_
+#define STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_
+
+#include <string>
+#include <deque>
+#include "rocksdb/slice.h"
+
+namespace rocksdb {
+
+class Slice;
+class Logger;
+
+// The Merge Operator
+//
+// Essentially, a MergeOperator specifies the SEMANTICS of a merge, which only
+// the client knows. It could be numeric addition, list append, string
+// concatenation, edit data structure, ... , anything.
+// The library, on the other hand, is concerned with the exercise of this
+// interface, at the right time (during get, iteration, compaction...)
+//
+// To use merge, the client needs to provide an object implementing one of
+// the following interfaces:
+//  a) AssociativeMergeOperator - for most simple semantics (always take
+//    two values, and merge them into one value, which is then put back
+//    into rocksdb); numeric addition and string concatenation are examples;
+//
+//  b) MergeOperator - the generic class for all the more abstract / complex
+//    operations; one method (FullMerge) to merge a Put/Delete value with a
+//    merge operand; and another method (PartialMerge) that merges two
+//    operands together. This is especially useful if your key values have a
+//    complex structure but you would still like to support client-specific
+//    incremental updates.
+//
+// AssociativeMergeOperator is simpler to implement. MergeOperator is simply
+// more powerful.
+//
+// Refer to rocksdb-merge wiki for more details and example implementations.
+//
+class MergeOperator {
+ public:
+  virtual ~MergeOperator() {}
+
+  // Gives the client a way to express the read -> modify -> write semantics
+  // key:      (IN)    The key that's associated with this merge operation.
+  //                   Client could multiplex the merge operator based on it
+  //                   if the key space is partitioned and different subspaces
+  //                   refer to different types of data which have different
+  //                   merge operation semantics
+  // existing_value:(IN) null indicates the key does not exist before this op
+  // operand_list:(IN) the sequence of merge operations to apply, front() first.
+  // new_value:(OUT)   Client is responsible for filling the merge result here
+  // logger:   (IN)    Client could use this to log errors during merge.
+  //
+  // Return true on success.
+  // All values passed in will be client-specific values. So if this method
+  // returns false, it is because client specified bad data or there was
+  // internal corruption. This will be treated as an error by the library.
+  //
+  // Also make use of the *logger for error messages.
+  virtual bool FullMerge(const Slice& key,
+                         const Slice* existing_value,
+                         const std::deque<std::string>& operand_list,
+                         std::string* new_value,
+                         Logger* logger) const = 0;
+
+  // This function performs merge(left_op, right_op)
+  // when both the operands are themselves merge operation types
+  // that you would have passed to a DB::Merge() call in the same order
+  // (i.e.: DB::Merge(key,left_op), followed by DB::Merge(key,right_op)).
+  //
+  // PartialMerge should combine them into a single merge operation that is
+  // saved into *new_value, and then it should return true.
+  // *new_value should be constructed such that a call to
+  // DB::Merge(key, *new_value) would yield the same result as a call
+  // to DB::Merge(key, left_op) followed by DB::Merge(key, right_op).
+  //
+  // If it is impossible or infeasible to combine the two operations,
+  // leave new_value unchanged and return false. The library will
+  // internally keep track of the operations, and apply them in the
+  // correct order once a base-value (a Put/Delete/End-of-Database) is seen.
+  //
+  // TODO: Presently there is no way to differentiate between error/corruption
+  // and simply "return false". For now, the client should simply return
+  // false in any case it cannot perform partial-merge, regardless of reason.
+  // If there is corruption in the data, handle it in the FullMerge() function,
+  // and return false there.
+  virtual bool PartialMerge(const Slice& key,
+                            const Slice& left_operand,
+                            const Slice& right_operand,
+                            std::string* new_value,
+                            Logger* logger) const = 0;
+
+  // The name of the MergeOperator. Used to check for MergeOperator
+  // mismatches (i.e., a DB created with one MergeOperator is
+  // accessed using a different MergeOperator)
+  // TODO: the name is currently not stored persistently and thus
+  //       no checking is enforced. Client is responsible for providing
+  //       consistent MergeOperator between DB opens.
+  virtual const char* Name() const = 0;
+};
+
+// The simpler, associative merge operator: clients implement only Merge().
+class AssociativeMergeOperator : public MergeOperator {
+ public:
+  virtual ~AssociativeMergeOperator() {}
+
+  // Gives the client a way to express the read -> modify -> write semantics
+  // key:           (IN) The key that's associated with this merge operation.
+  // existing_value:(IN) null indicates the key does not exist before this op
+  // value:         (IN) the value to update/merge the existing_value with
+  // new_value:    (OUT) Client is responsible for filling the merge result here
+  // logger:        (IN) Client could use this to log errors during merge.
+  //
+  // Return true on success.
+  // All values passed in will be client-specific values. So if this method
+  // returns false, it is because client specified bad data or there was
+  // internal corruption. The client should assume that this will be treated
+  // as an error by the library.
+  virtual bool Merge(const Slice& key,
+                     const Slice* existing_value,
+                     const Slice& value,
+                     std::string* new_value,
+                     Logger* logger) const = 0;
+
+
+ private:
+  // Default implementations of the MergeOperator functions. They are private
+  virtual bool FullMerge(const Slice& key,  // overrides, defined out of line.
+                         const Slice* existing_value,
+                         const std::deque<std::string>& operand_list,
+                         std::string* new_value,
+                         Logger* logger) const override;
+
+  virtual bool PartialMerge(const Slice& key,
+                            const Slice& left_operand,
+                            const Slice& right_operand,
+                            std::string* new_value,
+                            Logger* logger) const override;
+};
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -0,0 +1,742 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_
+#define STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_
+
+#include <stddef.h>
+#include <string>
+#include <memory>
+#include <vector>
+#include <stdint.h>
+
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/universal_compaction.h"
+
+namespace rocksdb {
+
+class Cache;
+class CompactionFilter;
+class CompactionFilterFactory;
+class Comparator;
+class Env;
+class FilterPolicy;
+class Logger;
+class MergeOperator;
+class Snapshot;
+class TableFactory;
+
+using std::shared_ptr;
+
+// DB contents are stored in a set of blocks, each of which holds a
+// sequence of key,value pairs.  Each block may be compressed before
+// being stored in a file.  The following enum describes which
+// compression method (if any) is used to compress a block.
+enum CompressionType : char {  // underlying type is a single char
+  // NOTE: do not change the values of existing entries, as these are
+  // part of the persistent format on disk.
+  kNoCompression     = 0x0,  // blocks are stored uncompressed
+  kSnappyCompression = 0x1,  // Snappy
+  kZlibCompression = 0x2,    // zlib
+  kBZip2Compression = 0x3    // bzip2
+};
+
+enum CompactionStyle : char {  // value type of Options::compaction_style
+  kCompactionStyleLevel       = 0x0, // level based compaction style
+  kCompactionStyleUniversal   = 0x1  // Universal compaction style
+};
+
+// Compression options for different compression algorithms like Zlib.
+struct CompressionOptions {  // NOTE(review): fields look like zlib knobs; confirm
+  int window_bits;  // default: -14
+  int level;        // default: -1
+  int strategy;     // default: 0
+  CompressionOptions():window_bits(-14),
+                       level(-1),
+                       strategy(0){}
+  CompressionOptions(int wbits, int lev, int strategy):window_bits(wbits),
+                                                       level(lev),
+                                                       strategy(strategy){}  // member(param): the parameter shadows the member
+};
+
+// Options to control the behavior of a database (passed to DB::Open)
+struct Options {
+  // -------------------
+  // Parameters that affect behavior
+
+  // Comparator used to define the order of keys in the table.
+  // Default: a comparator that uses lexicographic byte-wise ordering
+  //
+  // REQUIRES: The client must ensure that the comparator supplied
+  // here has the same name and orders keys *exactly* the same as the
+  // comparator provided to previous open calls on the same DB.
+  const Comparator* comparator;
+
+  // REQUIRES: The client must provide a merge operator if Merge operation
+  // needs to be accessed. Calling Merge on a DB without a merge operator
+  // would result in Status::NotSupported. The client must ensure that the
+  // merge operator supplied here has the same name and *exactly* the same
+  // semantics as the merge operator provided to previous open calls on
+  // the same DB. The only exception is reserved for upgrade, where a DB
+  // previously without a merge operator is introduced to Merge operation
+  // for the first time. It's necessary to specify a merge operator when
+  // opening the DB in this case.
+  // Default: nullptr
+  shared_ptr<MergeOperator> merge_operator;
+
+  // The client must provide compaction_filter_factory if it requires a new
+  // compaction filter to be used for different compaction processes.
+  // Allows an application to modify/delete a key-value during background
+  // compaction.
+  // Ideally, client should specify only one of filter or factory.
+  // compaction_filter takes precedence over compaction_filter_factory if
+  // client specifies both.
+  // Default: nullptr
+  const CompactionFilter* compaction_filter;
+
+  // If true, the database will be created if it is missing.
+  // Default: false
+  bool create_if_missing;
+
+  // If true, an error is raised if the database already exists.
+  // Default: false
+  bool error_if_exists;
+
+  // If true, the implementation will do aggressive checking of the
+  // data it is processing and will stop early if it detects any
+  // errors.  This may have unforeseen ramifications: for example, a
+  // corruption of one DB entry may cause a large number of entries to
+  // become unreadable or for the entire DB to become unopenable.
+  // If any of the writes to the database fails (Put, Delete, Merge, Write),
+  // the database will switch to read-only mode and fail all other
+  // Write operations.
+  // Default: false
+  bool paranoid_checks;
+
+  // Use the specified object to interact with the environment,
+  // e.g. to read/write files, schedule background work, etc.
+  // Default: Env::Default()
+  Env* env;
+
+  // Any internal progress/error information generated by the db will
+  // be written to info_log if it is non-nullptr, or to a file stored
+  // in the same directory as the DB contents if info_log is nullptr.
+  // Default: nullptr
+  shared_ptr<Logger> info_log;
+
+  // -------------------
+  // Parameters that affect performance
+
+  // Amount of data to build up in memory (backed by an unsorted log
+  // on disk) before converting to a sorted on-disk file.
+  //
+  // Larger values increase performance, especially during bulk loads.
+  // Up to max_write_buffer_number write buffers may be held in memory
+  // at the same time,
+  // so you may wish to adjust this parameter to control memory usage.
+  // Also, a larger write buffer will result in a longer recovery time
+  // the next time the database is opened.
+  //
+  // Default: 4MB
+  size_t write_buffer_size;
+
+  // The maximum number of write buffers that are built up in memory.
+  // The default is 2, so that when 1 write buffer is being flushed to
+  // storage, new writes can continue to the other write buffer.
+  // Default: 2
+  int max_write_buffer_number;
+
+  // The minimum number of write buffers that will be merged together
+  // before writing to storage.  If set to 1, then
+  // all write buffers are flushed to L0 as individual files and this increases
+  // read amplification because a get request has to check in all of these
+  // files. Also, an in-memory merge may result in writing less
+  // data to storage if there are duplicate records in each of these
+  // individual write buffers.  Default: 1
+  int min_write_buffer_number_to_merge;
+
+  // Number of open files that can be used by the DB.  You may need to
+  // increase this if your database has a large working set (budget
+  // one open file per 2MB of working set).
+  //
+  // Default: 1000
+  int max_open_files;
+
+  // Control over blocks (user data is stored in a set of blocks, and
+  // a block is the unit of reading from disk).
+
+  // If non-nullptr use the specified cache for blocks.
+  // If nullptr, rocksdb will automatically create and use an 8MB internal cache.
+  // Default: nullptr
+  shared_ptr<Cache> block_cache;
+
+  // If non-nullptr use the specified cache for compressed blocks.
+  // If nullptr, rocksdb will not use a compressed block cache.
+  // Default: nullptr
+  shared_ptr<Cache> block_cache_compressed;
+
+  // Approximate size of user data packed per block.  Note that the
+  // block size specified here corresponds to uncompressed data.  The
+  // actual size of the unit read from disk may be smaller if
+  // compression is enabled.  This parameter can be changed dynamically.
+  //
+  // Default: 4K
+  size_t block_size;
+
+  // Number of keys between restart points for delta encoding of keys.
+  // This parameter can be changed dynamically.  Most clients should
+  // leave this parameter alone.
+  //
+  // Default: 16
+  int block_restart_interval;
+
+
+  // Compress blocks using the specified compression algorithm.  This
+  // parameter can be changed dynamically.
+  //
+  // Default: kSnappyCompression, which gives lightweight but fast
+  // compression.
+  //
+  // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz:
+  //    ~200-500MB/s compression
+  //    ~400-800MB/s decompression
+  // Note that these speeds are significantly faster than most
+  // persistent storage speeds, and therefore it is typically never
+  // worth switching to kNoCompression.  Even if the input data is
+  // incompressible, the kSnappyCompression implementation will
+  // efficiently detect that and will switch to uncompressed mode.
+  CompressionType compression;
+
+  // Different levels can have different compression policies. There
+  // are cases where most lower levels would like to use a quick compression
+  // algorithm while the higher levels (which have more data) use
+  // compression algorithms that have better compression but could
+  // be slower. This vector (described as an array in earlier wording)
+  // should have an entry for each level of the database and, when
+  // supplied, overrides the value specified in the previous field
+  // 'compression'.
+  // NOTE(review): the earlier wording spoke of a caller-allocated array
+  // that could be nullptr and had to be freed by the caller. The field is
+  // declared as a std::vector held by value, so "non nullptr" presumably
+  // means "non-empty" and no manual freeing applies -- confirm.
+  std::vector<CompressionType> compression_per_level;
+
+  // Different options for compression algorithms
+  CompressionOptions compression_opts;
+
+  // If non-nullptr, use the specified filter policy to reduce disk reads.
+  // Many applications will benefit from passing the result of
+  // NewBloomFilterPolicy() here.
+  //
+  // Default: nullptr
+  const FilterPolicy* filter_policy;
+
+  // If non-nullptr, use the specified function to determine the
+  // prefixes for keys.  These prefixes will be placed in the filter.
+  // Depending on the workload, this can reduce the number of read-IOP
+  // cost for scans when a prefix is passed via ReadOptions to
+  // db.NewIterator().  For prefix filtering to work properly,
+  // "prefix_extractor" and "comparator" must be such that the following
+  // properties hold:
+  //
+  // 1) key.starts_with(prefix(key))
+  // 2) Compare(prefix(key), key) <= 0.
+  // 3) If Compare(k1, k2) <= 0, then Compare(prefix(k1), prefix(k2)) <= 0
+  // 4) prefix(prefix(key)) == prefix(key)
+  //
+  // Default: nullptr
+  const SliceTransform* prefix_extractor;
+
+  // If true, place whole keys in the filter (not just prefixes).
+  // This must generally be true for gets to be efficient.
+  //
+  // Default: true
+  bool whole_key_filtering;
+
+  // Number of levels for this database
+  int num_levels;
+
+  // Number of files to trigger level-0 compaction. A value <0 means that
+  // level-0 compaction will not be triggered by number of files at all.
+  int level0_file_num_compaction_trigger;
+
+  // Soft limit on number of level-0 files. We start slowing down writes at this
+  // point. A value <0 means that no writing slow down will be triggered by
+  // number of files in level-0.
+  int level0_slowdown_writes_trigger;
+
+  // Maximum number of level-0 files.  We stop writes at this point.
+  int level0_stop_writes_trigger;
+
+  // Maximum level to which a new compacted memtable is pushed if it
+  // does not create overlap.  We try to push to level 2 to avoid the
+  // relatively expensive level 0=>1 compactions and to avoid some
+  // expensive manifest file operations.  We do not push all the way to
+  // the largest level since that can generate a lot of wasted disk
+  // space if the same key space is being repeatedly overwritten.
+  int max_mem_compaction_level;
+
+  // Target file size for compaction.
+  // target_file_size_base is per-file size for level-1.
+  // Target file size for level L can be calculated by
+  // target_file_size_base * (target_file_size_multiplier ^ (L-1))
+  // For example, if target_file_size_base is 2MB and
+  // target_file_size_multiplier is 10, then each file on level-1 will
+  // be 2MB, and each file on level 2 will be 20MB,
+  // and each file on level-3 will be 200MB.
+
+  // by default target_file_size_base is 2MB.
+  int target_file_size_base;
+  // by default target_file_size_multiplier is 1, which means
+  // by default files in different levels will have similar size.
+  int target_file_size_multiplier;
+
+  // Control maximum total data size for a level.
+  // max_bytes_for_level_base is the max total for level-1.
+  // Maximum number of bytes for level L can be calculated as
+  // (max_bytes_for_level_base) * (max_bytes_for_level_multiplier ^ (L-1))
+  // For example, if max_bytes_for_level_base is 20MB, and if
+  // max_bytes_for_level_multiplier is 10, total data size for level-1
+  // will be 20MB, total file size for level-2 will be 200MB,
+  // and total file size for level-3 will be 2GB.
+
+
+  // by default 'max_bytes_for_level_base' is 10MB.
+  uint64_t max_bytes_for_level_base;
+  // by default 'max_bytes_for_level_multiplier' is 10.
+  int max_bytes_for_level_multiplier;
+
+  // Different max-size multipliers for different levels.
+  // These are multiplied by max_bytes_for_level_multiplier to arrive
+  // at the max-size of each level.
+  // Default: 1
+  std::vector<int> max_bytes_for_level_multiplier_additional;
+
+  // Maximum number of bytes in all compacted files.  We avoid expanding
+  // the lower level file set of a compaction if it would make the
+  // total compaction cover more than
+  // (expanded_compaction_factor * targetFileSizeLevel()) many bytes.
+  int expanded_compaction_factor;
+
+  // Maximum number of bytes in all source files to be compacted in a
+  // single compaction run. We avoid picking too many files in the
+  // source level so that we do not exceed the total source bytes
+  // for compaction to exceed
+  // (source_compaction_factor * targetFileSizeLevel()) many bytes.
+  // Default:1, i.e. pick maxfilesize amount of data as the source of
+  // a compaction.
+  int source_compaction_factor;
+
+  // Control maximum bytes of overlaps in grandparent (i.e., level+2) before we
+  // stop building a single file in a level->level+1 compaction.
+  int max_grandparent_overlap_factor;
+
+  // If non-null, then we should collect metrics about database operations
+  // Statistics objects should not be shared between DB instances as
+  // it does not use any locks to prevent concurrent updates.
+  shared_ptr<Statistics> statistics;
+
+  // If true, then the contents of data files are not synced
+  // to stable storage. Their contents remain in the OS buffers till the
+  // OS decides to flush them. This option is good for bulk-loading
+  // of data. Once the bulk-loading is complete, please issue a
+  // sync to the OS to flush all dirty buffers to stable storage.
+  // Default: false
+  bool disableDataSync;
+
+  // If true, then every store to stable storage will issue a fsync.
+  // If false, then every store to stable storage will issue a fdatasync.
+  // This parameter should be set to true while storing data to
+  // filesystem like ext3 that can lose files after a reboot.
+  // Default: false
+  bool use_fsync;
+
+  // This number controls how often a new scribe log about
+  // db deploy stats is written out.
+  // -1 indicates no logging at all.
+  // Default value is 1800 (half an hour).
+  int db_stats_log_interval;
+
+  // This specifies the info LOG dir.
+  // If it is empty, the log files will be in the same dir as data.
+  // If it is non empty, the log files will be in the specified dir,
+  // and the db data dir's absolute path will be used as the log file
+  // name's prefix.
+  std::string db_log_dir;
+
+  // This specifies the absolute dir path for write-ahead logs (WAL).
+  // If it is empty, the log files will be in the same dir as data,
+  //   dbname is used as the data dir by default
+  // If it is non empty, the log files will be kept in the specified dir.
+  // When destroying the db,
+  //   all log files in wal_dir and the dir itself are deleted
+  std::string wal_dir;
+
+  // Disable compaction triggered by seek.
+  // With bloomfilter and fast storage, a miss on one level
+  // is very cheap if the file handle is cached in table cache
+  // (which is true if max_open_files is large).
+  bool disable_seek_compaction;
+
+  // The periodicity when obsolete files get deleted. The default
+  // value is 6 hours. The files that get out of scope by compaction
+  // process will still get automatically deleted on every compaction,
+  // regardless of this setting
+  uint64_t delete_obsolete_files_period_micros;
+
+  // Maximum number of concurrent background jobs, submitted to
+  // the default LOW priority thread pool
+  // Default: 1
+  int max_background_compactions;
+
+  // Maximum number of concurrent background memtable flush jobs, submitted to
+  // the HIGH priority thread pool.
+  // By default, all background jobs (major compaction and memtable flush) go
+  // to the LOW priority pool. If this option is set to a positive number,
+  // memtable flush jobs will be submitted to the HIGH priority pool.
+  // It is important when the same Env is shared by multiple db instances.
+  // Without a separate pool, long running major compaction jobs could
+  // potentially block memtable flush jobs of other db instances, leading to
+  // unnecessary Put stalls.
+  // Default: 0
+  int max_background_flushes;
+
+  // Specify the maximal size of the info log file. If the log file
+  // is larger than `max_log_file_size`, a new info log file will
+  // be created.
+  // If max_log_file_size == 0, all logs will be written to one
+  // log file.
+  size_t max_log_file_size;
+
+  // Time for the info log file to roll (in seconds).
+  // If specified with non-zero value, log file will be rolled
+  // if it has been active longer than `log_file_time_to_roll`.
+  // Default: 0 (disabled)
+  size_t log_file_time_to_roll;
+
+  // Maximal info log files to be kept.
+  // Default: 1000
+  size_t keep_log_file_num;
+
+  // Puts are delayed 0-1 ms when any level has a compaction score that exceeds
+  // soft_rate_limit. This is ignored when == 0.0.
+  // CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does not
+  // hold, RocksDB will set soft_rate_limit = hard_rate_limit
+  // Default: 0 (disabled)
+  double soft_rate_limit;
+
+  // Puts are delayed 1ms at a time when any level has a compaction score that
+  // exceeds hard_rate_limit. This is ignored when <= 1.0.
+  // Default: 0 (disabled)
+  double hard_rate_limit;
+
+  // Max time a put will be stalled when hard_rate_limit is enforced. If 0, then
+  // there is no limit.
+  // Default: 1000
+  unsigned int rate_limit_delay_max_milliseconds;
+
+  // manifest file is rolled over on reaching this limit.
+  // The older manifest file will be deleted.
+  // The default value is MAX_INT so that roll-over does not take place.
+  uint64_t max_manifest_file_size;
+
+  // Disable block cache. If this is set to true,
+  // then no block cache should be used, and the block_cache should
+  // point to a nullptr object.
+  // Default: false
+  bool no_block_cache;
+
+  // Number of shards used for table cache.
+  int table_cache_numshardbits;
+
+  // During data eviction of table's LRU cache, it would be inefficient
+  // to strictly follow LRU because this piece of memory will not really
+  // be released unless its refcount falls to zero. Instead, make two
+  // passes: the first pass will release items with refcount = 1,
+  // and if not enough space releases after scanning the number of
+  // elements specified by this parameter, we will remove items in LRU
+  // order.
+  int table_cache_remove_scan_count_limit;
+
+  // size of one block in arena memory allocation.
+  // If 0 (the type is unsigned), a proper value is automatically calculated
+  // (usually 1/10 of write_buffer_size).
+  //
+  // Default: 0
+  size_t arena_block_size;
+
+  // Create an Options object with default values for all fields.
+  Options();
+
+  void Dump(Logger* log) const;
+
+  // Set appropriate parameters for bulk loading.
+  // The reason that this is a function that returns "this" instead of a
+  // constructor is to enable chaining of multiple similar calls in the future.
+  //
+  // All data will be in level 0 without any automatic compaction.
+  // It's recommended to manually call CompactRange(NULL, NULL) before reading
+  // from the database, because otherwise the read can be very slow.
+  Options* PrepareForBulkLoad();
+
+  // Disable automatic compactions. Manual compactions can still
+  // be issued on this database.
+  bool disable_auto_compactions;
+
+  // The following two fields affect how archived logs will be deleted.
+  // 1. If both set to 0, logs will be deleted asap and will not get into
+  //    the archive.
+  // 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
+  //    WAL files will be checked every 10 min and if total size is greater
+  //    than WAL_size_limit_MB, they will be deleted starting with the
+  //    earliest until size_limit is met. All empty files will be deleted.
+  // 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
+  //    WAL files will be checked every WAL_ttl_seconds / 2 and those that
+  //    are older than WAL_ttl_seconds will be deleted.
+  // 4. If both are not 0, WAL files will be checked every 10 min and both
+  //    checks will be performed with ttl being first.
+  uint64_t WAL_ttl_seconds;
+  uint64_t WAL_size_limit_MB;
+
+  // Number of bytes to preallocate (via fallocate) the manifest
+  // files.  Default is 4mb, which is reasonable to reduce random IO
+  // as well as prevent overallocation for mounts that preallocate
+  // large amounts of data (such as xfs's allocsize option).
+  size_t manifest_preallocation_size;
+
+  // Purge duplicate/deleted keys when a memtable is flushed to storage.
+  // Default: true
+  bool purge_redundant_kvs_while_flush;
+
+  // Data being read from file storage may be buffered in the OS
+  // Default: true
+  bool allow_os_buffer;
+
+  // Allow the OS to mmap file for reading sst tables. Default: false
+  bool allow_mmap_reads;
+
+  // Allow the OS to mmap file for writing. Default: true
+  bool allow_mmap_writes;
+
+  // Prevent child processes from inheriting open files. Default: true
+  bool is_fd_close_on_exec;
+
+  // Skip log corruption error on recovery (If client is ok with
+  // losing most recent changes)
+  // Default: false
+  bool skip_log_error_on_recovery;
+
+  // if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
+  // Default: 3600 (1 hour)
+  unsigned int stats_dump_period_sec;
+
+  // This is used to close a block before it reaches the configured
+  // 'block_size'. If the percentage of free space in the current block is less
+  // than this specified number and adding a new record to the block will
+  // exceed the configured block size, then this block will be closed and the
+  // new record will be written to the next block.
+  // Default is 10.
+  int block_size_deviation;
+
+  // If set true, will hint the underlying file system that the file
+  // access pattern is random, when a sst file is opened.
+  // Default: true
+  bool advise_random_on_open;
+
+  // Specify the file access pattern once a compaction is started.
+  // It will be applied to all input files of a compaction.
+  // Default: NORMAL
+  enum { NONE, NORMAL, SEQUENTIAL, WILLNEED } access_hint_on_compaction_start;
+
+  // Use adaptive mutex, which spins in the user space before resorting
+  // to kernel. This could reduce context switch when the mutex is not
+  // heavily contended. However, if the mutex is hot, we could end up
+  // wasting spin time.
+  // Default: false
+  bool use_adaptive_mutex;
+
+  // Allows OS to incrementally sync files to disk while they are being
+  // written, asynchronously, in the background.
+  // Issue one request for every bytes_per_sync written. 0 turns it off.
+  // Default: 0
+  uint64_t bytes_per_sync;
+
+  // The compaction style. Default: kCompactionStyleLevel
+  CompactionStyle compaction_style;
+
+  // The options needed to support Universal Style compactions
+  CompactionOptionsUniversal compaction_options_universal;
+
+  // Use KeyMayExist API to filter deletes when this is true.
+  // If KeyMayExist returns false, i.e. the key definitely does not exist, then
+  // the delete is a noop. KeyMayExist only incurs in-memory look up.
+  // This optimization avoids writing the delete to storage when appropriate.
+  // Default: false
+  bool filter_deletes;
+
+  // An iteration->Next() sequentially skips over keys with the same
+  // user-key unless this option is set. This number specifies the number
+  // of keys (with the same userkey) that will be sequentially
+  // skipped before a reseek is issued.
+  // Default: 8
+  uint64_t max_sequential_skip_in_iterations;
+
+  // This is a factory that provides MemTableRep objects.
+  // Default: a factory that provides a skip-list-based implementation of
+  // MemTableRep.
+  std::shared_ptr<MemTableRepFactory> memtable_factory;
+
+  // This is a factory that provides TableFactory objects.
+  // Default: a factory that provides a default implementation of
+  // Table and TableBuilder.
+  std::shared_ptr<TableFactory> table_factory;
+
+  // This is a factory that provides compaction filter objects which allow
+  // an application to modify/delete a key-value during background compaction.
+  // Default: a factory that doesn't provide any object
+  std::shared_ptr<CompactionFilterFactory> compaction_filter_factory;
+
+  // This option allows users to collect their own statistics of interest
+  // about the tables.
+  // Default: empty vector -- no user-defined statistics collection will be
+  // performed.
+  std::vector<std::shared_ptr<TablePropertiesCollector>>
+    table_properties_collectors;
+
+  // Allows thread-safe inplace updates. Requires Updates iff
+  // * key exists in current memtable
+  // * new sizeof(new_value) <= sizeof(old_value)
+  // * old_value for that key is a put i.e. kTypeValue
+  // Default: false.
+  bool inplace_update_support;
+
+  // Number of locks used for inplace update
+  // Default: 10000, if inplace_update_support = true, else 0.
+  size_t inplace_update_num_locks;
+};
+
+//
+// An application can issue a read request (via Get/Iterators) and specify
+// if that read should process data that ALREADY resides on a specified cache
+// level. For example, if an application specifies kBlockCacheTier then the
+// Get call will process data that is already resident in the memtable or
+// the block cache. It will not page in data from the OS cache or data that
+// resides in storage.
+enum ReadTier {
+  kReadAllTier    = 0x0, // data in memtable, block cache, OS cache or storage
+  kBlockCacheTier = 0x1  // data in memtable or block cache
+};
+
+// Options that control read operations
+struct ReadOptions {
+  // If true, all data read from underlying storage will be
+  // verified against corresponding checksums.
+  // Default: false
+  bool verify_checksums;
+
+  // Should the "data block"/"index block"/"filter block" read for this
+  // iteration be cached in memory?
+  // Callers may wish to set this field to false for bulk scans.
+  // Default: true
+  bool fill_cache;
+
+  // If this option is set and memtable implementation allows, Seek
+  // might only return keys with the same prefix as the seek-key. Default: false
+  bool prefix_seek;
+
+  // If "snapshot" is non-nullptr, read as of the supplied snapshot
+  // (which must belong to the DB that is being read and which must
+  // not have been released).  If "snapshot" is nullptr, use an implicit
+  // snapshot of the state at the beginning of this read operation.
+  // Default: nullptr
+  const Snapshot* snapshot;
+
+  // If "prefix" is non-nullptr, and ReadOptions is being passed to
+  // db.NewIterator, only return results when the key begins with this
+  // prefix.  This field is ignored by other calls (e.g., Get).
+  // Options.prefix_extractor must also be set, and
+  // prefix_extractor.InRange(prefix) must be true.  The iterator
+  // returned by NewIterator when this option is set will behave just
+  // as if the underlying store did not contain any non-matching keys,
+  // with two exceptions.  Seek() only accepts keys starting with the
+  // prefix, and SeekToLast() is not supported.  prefix filter with this
+  // option will sometimes reduce the number of read IOPs.
+  // Default: nullptr
+  const Slice* prefix;
+
+  // Specify if this read request should process data that ALREADY
+  // resides on a particular cache. If the required data is not
+  // found at the specified cache, then Status::Incomplete is returned.
+  // Default: kReadAllTier
+  ReadTier read_tier;
+
+  ReadOptions()  // every field takes the default documented above
+      : verify_checksums(false),
+        fill_cache(true),
+        prefix_seek(false),
+        snapshot(nullptr),
+        prefix(nullptr),
+        read_tier(kReadAllTier) {
+  }
+  ReadOptions(bool cksum, bool cache) :  // overrides only the first two fields
+              verify_checksums(cksum), fill_cache(cache),
+              prefix_seek(false), snapshot(nullptr), prefix(nullptr),
+              read_tier(kReadAllTier) {
+  }
+};
+
+// Options that control write operations
+struct WriteOptions {
+  // If true, the write will be flushed from the operating system
+  // buffer cache (by calling WritableFile::Sync()) before the write
+  // is considered complete.  If this flag is true, writes will be
+  // slower.
+  //
+  // If this flag is false, and the machine crashes, some recent
+  // writes may be lost.  Note that if it is just the process that
+  // crashes (i.e., the machine does not reboot), no writes will be
+  // lost even if sync==false.
+  //
+  // In other words, a DB write with sync==false has similar
+  // crash semantics as the "write()" system call.  A DB write
+  // with sync==true has similar crash semantics to a "write()"
+  // system call followed by "fdatasync()".
+  //
+  // Default: false
+  bool sync;
+
+  // If true, writes will not first go to the write ahead log,
+  // and the write may get lost after a crash. Default: false
+  bool disableWAL;
+
+  WriteOptions()  // both flags start out false
+      : sync(false),
+        disableWAL(false) {
+  }
+};
+
+// Options that control flush operations
+struct FlushOptions {
+  // If true, the flush call will wait until the flush is done.
+  // Default: true
+  bool wait;
+
+  FlushOptions()  // waits by default
+      : wait(true) {
+  }
+};
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_
--- a/include/rocksdb/perf_context.h
+++ b/include/rocksdb/perf_context.h
@@ -0,0 +1,48 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_PERF_CONTEXT_H
+#define STORAGE_ROCKSDB_INCLUDE_PERF_CONTEXT_H
+
+#include <stdint.h>
+
+namespace rocksdb {
+
+enum PerfLevel {  // argument type of SetPerfLevel()
+  kDisable        = 0,  // disable perf stats
+  kEnableCount    = 1,  // enable only count stats
+  kEnableTime     = 2   // enable time stats too
+};
+
+// Set the perf stats level to one of the PerfLevel values.
+void SetPerfLevel(PerfLevel level);
+
+// A thread-local context for gathering performance counters efficiently
+// and transparently.
+
+struct PerfContext {
+  // NOTE(review): the unit of the *_time fields is not stated here -- confirm.
+  void Reset(); // reset all performance counters to zero
+
+  uint64_t user_key_comparison_count; // total number of user key comparisons
+  uint64_t block_cache_hit_count;     // total number of block cache hits
+  uint64_t block_read_count;          // total number of block reads (with IO)
+  uint64_t block_read_byte;           // total number of bytes from block reads
+  uint64_t block_read_time;           // total time spent on block reads
+  uint64_t block_checksum_time;       // total time spent on block checksum
+  uint64_t block_decompress_time;     // total time spent on block decompression
+  // total number of internal keys skipped over during iteration (overwritten or
+  // deleted, to be more specific, hidden by a put or delete of the same key)
+  uint64_t internal_key_skipped_count;
+  // total number of deletes skipped over during iteration
+  uint64_t internal_delete_skipped_count;
+  uint64_t wal_write_time;            // total time spent on writing to WAL
+};
+
+extern __thread PerfContext perf_context;
+
+}
+
+#endif
--- a/include/rocksdb/slice.h
+++ b/include/rocksdb/slice.h
@@ -0,0 +1,136 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Slice is a simple structure containing a pointer into some external
+// storage and a size.  The user of a Slice must ensure that the slice
+// is not used after the corresponding external storage has been
+// deallocated.
+//
+// Multiple threads can invoke const methods on a Slice without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same Slice must use
+// external synchronization.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_SLICE_H_
+#define STORAGE_ROCKSDB_INCLUDE_SLICE_H_
+
+#include <assert.h>
+#include <stddef.h>
+#include <string.h>
+#include <string>
+
+namespace rocksdb {
+
+class Slice {
+ public:
+  // Create an empty slice.
+  Slice() : data_(""), size_(0) { }
+
+  // Create a slice that refers to d[0,n-1].
+  Slice(const char* d, size_t n) : data_(d), size_(n) { }
+
+  // Create a slice that refers to the contents of "s"
+  /* implicit */
+  Slice(const std::string& s) : data_(s.data()), size_(s.size()) { }
+
+  // Create a slice that refers to s[0,strlen(s)-1]
+  /* implicit */
+  Slice(const char* s) : data_(s), size_(strlen(s)) { }
+
+  // Return a pointer to the beginning of the referenced data
+  const char* data() const { return data_; }
+
+  // Return the length (in bytes) of the referenced data
+  size_t size() const { return size_; }
+
+  // Return true iff the length of the referenced data is zero
+  bool empty() const { return size_ == 0; }
+
+  // Return the nth byte in the referenced data.
+  // REQUIRES: n < size()
+  char operator[](size_t n) const {
+    assert(n < size());
+    return data_[n];
+  }
+
+  // Change this slice to refer to an empty array
+  void clear() { data_ = ""; size_ = 0; }
+
+  // Drop the first "n" bytes from this slice.
+  void remove_prefix(size_t n) {
+    assert(n <= size());
+    data_ += n;
+    size_ -= n;
+  }
+
+  // Return a copy of the referenced data, as uppercase hex if "hex" is true.
+  std::string ToString(bool hex = false) const {
+    if (!hex) return std::string(data_, size_);
+    // Table lookup rather than snprintf(): <stdio.h> is not included here.
+    static const char kHexDigits[] = "0123456789ABCDEF";
+    std::string result;
+    result.reserve(2 * size_);
+    for (size_t i = 0; i < size_; i++) {
+      const unsigned char byte = static_cast<unsigned char>(data_[i]);
+      result.push_back(kHexDigits[byte >> 4]);    // high nibble first
+      result.push_back(kHexDigits[byte & 0x0F]);  // then low nibble
+    }
+    return result;
+  }
+
+  // Three-way comparison.  Returns value:
+  //   <  0 iff "*this" <  "b",
+  //   == 0 iff "*this" == "b",
+  //   >  0 iff "*this" >  "b"
+  int compare(const Slice& b) const;
+
+  // Return true iff "x" is a prefix of "*this"
+  bool starts_with(const Slice& x) const {
+    return ((size_ >= x.size_) &&
+            (memcmp(data_, x.data_, x.size_) == 0));
+  }
+
+ // private: make these public for rocksdbjni access
+  const char* data_;
+  size_t size_;
+
+  // Intentionally copyable
+};
+
+// A set of Slices that are virtually concatenated together.  'parts' points
+// to an array of Slices, and 'num_parts' is the number of elements in it.
+struct SliceParts {
+  SliceParts(const Slice* parts, int num_parts) :
+      parts(parts), num_parts(num_parts) { }
+
+  const Slice* parts;  // pointer is stored as-is; the array is not copied
+  int num_parts;
+};
+
+// Slices are equal iff they have the same length and identical bytes.
+inline bool operator==(const Slice& x, const Slice& y) {
+  return x.size() == y.size() && memcmp(x.data(), y.data(), x.size()) == 0;
+}
+
+inline bool operator!=(const Slice& x, const Slice& y) {
+  return !operator==(x, y);
+}
+
+inline int Slice::compare(const Slice& b) const {
+  // size_t, not int: an int length would be truncated for slices >= 2GB.
+  const size_t min_len = (size_ < b.size_) ? size_ : b.size_;
+  int r = memcmp(data_, b.data_, min_len);
+  if (r != 0) return r;  // a differing byte in the common prefix decides
+  if (size_ < b.size_) return -1;  // equal prefix: the shorter slice is less
+  if (size_ > b.size_) return +1;
+  return 0;
+}
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_SLICE_H_
--- a/include/rocksdb/slice_transform.h
+++ b/include/rocksdb/slice_transform.h
@@ -0,0 +1,47 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Class for specifying user-defined functions which perform a
+// transformation on a slice.  It is not required that every slice
+// belong to the domain and/or range of a function.  Subclasses should
+// define InDomain and InRange to determine which slices are in either
+// of these sets respectively.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_
+#define STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_
+
+#include <string>
+
+namespace rocksdb {
+
+class Slice;
+
+class SliceTransform {
+ public:
+  virtual ~SliceTransform() {};
+
+  // Return the name of this transformation.
+  virtual const char* Name() const = 0;
+
+  // Transform a src in the domain to a dst in the range.
+  virtual Slice Transform(const Slice& src) const = 0;
+
+  // Determine whether src is a valid input that the function applies to.
+  virtual bool InDomain(const Slice& src) const = 0;
+
+  // Determine whether dst == Transform(src) for some src.
+  virtual bool InRange(const Slice& dst) const = 0;
+};
+
+extern const SliceTransform* NewFixedPrefixTransform(size_t prefix_len);
+
+extern const SliceTransform* NewNoopTransform();
+
+}
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_
--- a/include/rocksdb/statistics.h
+++ b/include/rocksdb/statistics.h
@@ -0,0 +1,302 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_
+#define STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_
+
+#include <atomic>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <string>
+#include <memory>
+#include <vector>
+
+namespace rocksdb {
+
+/**
+ * Keep adding tickers here.
+ * Any ticker should have a value less than TICKER_ENUM_MAX.
+ * Add a new ticker by assigning it the current value of TICKER_ENUM_MAX.
+ * Add a string representation in TickersNameMap below.
+ * And increment TICKER_ENUM_MAX.
+ */
+enum Tickers {
+  // total block cache misses
+  // REQUIRES: BLOCK_CACHE_MISS == BLOCK_CACHE_INDEX_MISS +
+  //                               BLOCK_CACHE_FILTER_MISS +
+  //                               BLOCK_CACHE_DATA_MISS;
+  BLOCK_CACHE_MISS,
+  // total block cache hits
+  // REQUIRES: BLOCK_CACHE_HIT == BLOCK_CACHE_INDEX_HIT +
+  //                              BLOCK_CACHE_FILTER_HIT +
+  //                              BLOCK_CACHE_DATA_HIT;
+  BLOCK_CACHE_HIT,
+  // # of blocks added to block cache.
+  BLOCK_CACHE_ADD,
+  // # of times cache miss when accessing index block from block cache.
+  BLOCK_CACHE_INDEX_MISS,
+  // # of times cache hit when accessing index block from block cache.
+  BLOCK_CACHE_INDEX_HIT,
+  // # of times cache miss when accessing filter block from block cache.
+  BLOCK_CACHE_FILTER_MISS,
+  // # of times cache hit when accessing filter block from block cache.
+  BLOCK_CACHE_FILTER_HIT,
+  // # of times cache miss when accessing data block from block cache.
+  BLOCK_CACHE_DATA_MISS,
+  // # of times cache hit when accessing data block from block cache.
+  BLOCK_CACHE_DATA_HIT,
+  // # of times bloom filter has avoided file reads.
+  BLOOM_FILTER_USEFUL,
+
+  /**
+   * COMPACTION_KEY_DROP_* count the reasons for key drop during compaction.
+   * There are 3 reasons currently.
+   */
+  COMPACTION_KEY_DROP_NEWER_ENTRY, // key was written with a newer value.
+  COMPACTION_KEY_DROP_OBSOLETE, // The key is obsolete.
+  COMPACTION_KEY_DROP_USER, // user compaction function has dropped the key.
+
+  // Number of keys written to the database via the Put and Write calls
+  NUMBER_KEYS_WRITTEN,
+  // Number of keys read
+  NUMBER_KEYS_READ,
+  // Number of keys updated, if in-place update is enabled
+  NUMBER_KEYS_UPDATED,
+  // Bytes written / read
+  BYTES_WRITTEN,
+  BYTES_READ,
+  NO_FILE_CLOSES,
+  NO_FILE_OPENS,
+  NO_FILE_ERRORS,
+  // Time system had to wait to do L0-L1 compactions
+  STALL_L0_SLOWDOWN_MICROS,
+  // Time system had to wait to move memtable to L1.
+  STALL_MEMTABLE_COMPACTION_MICROS,
+  // write throttle because of too many files in L0
+  STALL_L0_NUM_FILES_MICROS,
+  RATE_LIMIT_DELAY_MILLIS,
+
+  NO_ITERATORS, // number of iterators currently open
+
+  // Number of MultiGet calls, keys read, and bytes read
+  NUMBER_MULTIGET_CALLS,
+  NUMBER_MULTIGET_KEYS_READ,
+  NUMBER_MULTIGET_BYTES_READ,
+
+  // Number of delete records that were not required to be
+  // written to storage because the key does not exist
+  NUMBER_FILTERED_DELETES,
+  NUMBER_MERGE_FAILURES,
+  SEQUENCE_NUMBER,
+
+  // number of times bloom was checked before creating iterator on a
+  // file, and the number of times the check was useful in avoiding
+  // iterator creation (and thus likely IOPs).
+  BLOOM_FILTER_PREFIX_CHECKED,
+  BLOOM_FILTER_PREFIX_USEFUL,
+
+  // Number of times we had to reseek inside an iteration to skip
+  // over large number of keys with same userkey.
+  NUMBER_OF_RESEEKS_IN_ITERATION,
+
+  // Record the number of calls to GetUpdatesSince. Useful to keep track of
+  // transaction log iterator refreshes
+  GET_UPDATES_SINCE_CALLS,
+
+  BLOCK_CACHE_COMPRESSED_MISS, // miss in the compressed block cache
+  BLOCK_CACHE_COMPRESSED_HIT,  // hit in the compressed block cache
+
+  TICKER_ENUM_MAX
+};
+
+// Maps each ticker to its string name.  The entries must be listed in the
+// same order as the enumerators of Tickers.
+const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
+  { BLOCK_CACHE_MISS, "rocksdb.block.cache.miss" },
+  { BLOCK_CACHE_HIT, "rocksdb.block.cache.hit" },
+  { BLOCK_CACHE_ADD, "rocksdb.block.cache.add" },
+  { BLOCK_CACHE_INDEX_MISS, "rocksdb.block.cache.index.miss" },
+  { BLOCK_CACHE_INDEX_HIT, "rocksdb.block.cache.index.hit" },
+  { BLOCK_CACHE_FILTER_MISS, "rocksdb.block.cache.filter.miss" },
+  { BLOCK_CACHE_FILTER_HIT, "rocksdb.block.cache.filter.hit" },
+  { BLOCK_CACHE_DATA_MISS, "rocksdb.block.cache.data.miss" },
+  { BLOCK_CACHE_DATA_HIT, "rocksdb.block.cache.data.hit" },
+  { BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful" },
+  { COMPACTION_KEY_DROP_NEWER_ENTRY, "rocksdb.compaction.key.drop.new" },
+  { COMPACTION_KEY_DROP_OBSOLETE, "rocksdb.compaction.key.drop.obsolete" },
+  { COMPACTION_KEY_DROP_USER, "rocksdb.compaction.key.drop.user" },
+  { NUMBER_KEYS_WRITTEN, "rocksdb.number.keys.written" },
+  { NUMBER_KEYS_READ, "rocksdb.number.keys.read" },
+  { NUMBER_KEYS_UPDATED, "rocksdb.number.keys.updated" },
+  { BYTES_WRITTEN, "rocksdb.bytes.written" },
+  { BYTES_READ, "rocksdb.bytes.read" },
+  { NO_FILE_CLOSES, "rocksdb.no.file.closes" },
+  { NO_FILE_OPENS, "rocksdb.no.file.opens" },
+  { NO_FILE_ERRORS, "rocksdb.no.file.errors" },
+  { STALL_L0_SLOWDOWN_MICROS, "rocksdb.l0.slowdown.micros" },
+  { STALL_MEMTABLE_COMPACTION_MICROS, "rocksdb.memtable.compaction.micros" },
+  { STALL_L0_NUM_FILES_MICROS, "rocksdb.l0.num.files.stall.micros" },
+  { RATE_LIMIT_DELAY_MILLIS, "rocksdb.rate.limit.delay.millis" },
+  { NO_ITERATORS, "rocksdb.num.iterators" },
+  { NUMBER_MULTIGET_CALLS, "rocksdb.number.multiget.get" },
+  { NUMBER_MULTIGET_KEYS_READ, "rocksdb.number.multiget.keys.read" },
+  { NUMBER_MULTIGET_BYTES_READ, "rocksdb.number.multiget.bytes.read" },
+  { NUMBER_FILTERED_DELETES, "rocksdb.number.deletes.filtered" },
+  { NUMBER_MERGE_FAILURES, "rocksdb.number.merge.failures" },
+  { SEQUENCE_NUMBER, "rocksdb.sequence.number" },
+  { BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked" },
+  { BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful" },
+  { NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration" },
+  { GET_UPDATES_SINCE_CALLS, "rocksdb.getupdatessince.calls" },
+  { BLOCK_CACHE_COMPRESSED_MISS, "rocksdb.block.cachecompressed.miss" },
+  { BLOCK_CACHE_COMPRESSED_HIT, "rocksdb.block.cachecompressed.hit" }
+};
+
+/**
+ * Keep adding histograms here.
+ * Any histogram should have a value less than HISTOGRAM_ENUM_MAX.
+ * Add a new Histogram by assigning it the current value of HISTOGRAM_ENUM_MAX.
+ * Add a string representation in HistogramsNameMap below.
+ * And increment HISTOGRAM_ENUM_MAX.
+ */
+enum Histograms {
+  DB_GET,
+  DB_WRITE,
+  COMPACTION_TIME,
+  TABLE_SYNC_MICROS,
+  COMPACTION_OUTFILE_SYNC_MICROS,
+  WAL_FILE_SYNC_MICROS,
+  MANIFEST_FILE_SYNC_MICROS,
+  // Time spent in IO during table open
+  TABLE_OPEN_IO_MICROS,
+  DB_MULTIGET,
+  READ_BLOCK_COMPACTION_MICROS,
+  READ_BLOCK_GET_MICROS,
+  WRITE_RAW_BLOCK_MICROS,
+
+  STALL_L0_SLOWDOWN_COUNT,
+  STALL_MEMTABLE_COMPACTION_COUNT,
+  STALL_L0_NUM_FILES_COUNT,
+  HARD_RATE_LIMIT_DELAY_COUNT,
+  SOFT_RATE_LIMIT_DELAY_COUNT,
+  NUM_FILES_IN_SINGLE_COMPACTION,
+  HISTOGRAM_ENUM_MAX,
+};
+
+const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
+  { DB_GET, "rocksdb.db.get.micros" },
+  { DB_WRITE, "rocksdb.db.write.micros" },
+  { COMPACTION_TIME, "rocksdb.compaction.times.micros" },
+  { TABLE_SYNC_MICROS, "rocksdb.table.sync.micros" },
+  { COMPACTION_OUTFILE_SYNC_MICROS, "rocksdb.compaction.outfile.sync.micros" },
+  { WAL_FILE_SYNC_MICROS, "rocksdb.wal.file.sync.micros" },
+  { MANIFEST_FILE_SYNC_MICROS, "rocksdb.manifest.file.sync.micros" },
+  { TABLE_OPEN_IO_MICROS, "rocksdb.table.open.io.micros" },
+  { DB_MULTIGET, "rocksdb.db.multiget.micros" },
+  { READ_BLOCK_COMPACTION_MICROS, "rocksdb.read.block.compaction.micros" },
+  { READ_BLOCK_GET_MICROS, "rocksdb.read.block.get.micros" },
+  { WRITE_RAW_BLOCK_MICROS, "rocksdb.write.raw.block.micros" },
+  { STALL_L0_SLOWDOWN_COUNT, "rocksdb.l0.slowdown.count"},
+  { STALL_MEMTABLE_COMPACTION_COUNT, "rocksdb.memtable.compaction.count"},
+  { STALL_L0_NUM_FILES_COUNT, "rocksdb.num.files.stall.count"},
+  { HARD_RATE_LIMIT_DELAY_COUNT, "rocksdb.hard.rate.limit.delay.count"},
+  { SOFT_RATE_LIMIT_DELAY_COUNT, "rocksdb.soft.rate.limit.delay.count"},
+  { NUM_FILES_IN_SINGLE_COMPACTION, "rocksdb.numfiles.in.singlecompaction" },
+};  // one entry per Histograms value, listed in enum order
+
+struct HistogramData {  // summary values; out-parameter of Histogram::Data()
+  double median;
+  double percentile95;
+  double percentile99;
+  double average;
+  double standard_deviation;
+};
+
+
+class Histogram {
+ public:
+  // Clears the histogram.
+  virtual void Clear() = 0;
+  virtual ~Histogram();
+  // Add a value to be recorded in the histogram.
+  virtual void Add(uint64_t value) = 0;
+
+  virtual std::string ToString() const = 0;
+
+  // Get statistics.  Data() presumably fills *data with all of them at once.
+  virtual double Median() const = 0;
+  virtual double Percentile(double p) const = 0;
+  virtual double Average() const = 0;
+  virtual double StandardDeviation() const = 0;
+  virtual void Data(HistogramData * const data) const = 0;
+
+};
+
+/**
+ * A dumb ticker which keeps incrementing through its life time.
+ * Thread safe: each operation is a single atomic access to count_.
+ */
+class Ticker {
+ public:
+  Ticker() : count_(0) { }
+
+  void setTickerCount(uint64_t count) {
+    count_.store(count);
+  }
+
+  void recordTick(int count = 1) {
+    count_.fetch_add(count);
+  }
+
+  uint64_t getCount() {
+    return count_.load();
+  }
+
+ private:
+  std::atomic_uint_fast64_t count_;
+};
+
+// Analyze the performance of a db.  Interface; see CreateDBStatistics().
+class Statistics {
+ public:
+  virtual ~Statistics() {}  // polymorphic base: safe delete via Statistics*
+  virtual long getTickerCount(Tickers tickerType) = 0;
+  virtual void recordTick(Tickers tickerType, uint64_t count = 0) = 0;
+  virtual void setTickerCount(Tickers tickerType, uint64_t count) = 0;
+  virtual void measureTime(Histograms histogramType, uint64_t time) = 0;
+
+  virtual void histogramData(Histograms type, HistogramData * const data) = 0;
+  // String representation of the statistic object.
+  std::string ToString();
+};
+
+// Create a concrete DBStatistics object
+std::shared_ptr<Statistics> CreateDBStatistics();
+
+// Ease of Use functions.
+// Forwards to statistics->recordTick(); a null "statistics" is ignored.  Takes
+// the shared_ptr by const reference to avoid a ref-count update per call.
+inline void RecordTick(const std::shared_ptr<Statistics>& statistics,
+                       Tickers ticker,
+                       uint64_t count = 1) {
+  assert(HistogramsNameMap.size() == HISTOGRAM_ENUM_MAX);
+  assert(TickersNameMap.size() == TICKER_ENUM_MAX);
+  if (statistics) statistics->recordTick(ticker, count);
+}
+
+// Forwards to statistics->setTickerCount(); a null "statistics" is ignored.
+// Takes the shared_ptr by const reference to avoid a ref-count update per call.
+inline void SetTickerCount(const std::shared_ptr<Statistics>& statistics,
+                           Tickers ticker,
+                           uint64_t count) {
+  assert(HistogramsNameMap.size() == HISTOGRAM_ENUM_MAX);
+  assert(TickersNameMap.size() == TICKER_ENUM_MAX);
+  if (statistics) statistics->setTickerCount(ticker, count);
+}
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_
--- a/include/rocksdb/status.h
+++ b/include/rocksdb/status.h
@@ -0,0 +1,130 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A Status encapsulates the result of an operation.  It may indicate success,
+// or it may indicate an error with an associated error message.
+//
+// Multiple threads can invoke const methods on a Status without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same Status must use
+// external synchronization.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_STATUS_H_
+#define STORAGE_ROCKSDB_INCLUDE_STATUS_H_
+
+#include <string>
+#include "rocksdb/slice.h"
+
+namespace rocksdb {
+
+class Status {
+ public:
+  // Create a success status.
+  Status() : state_(nullptr) { }
+  ~Status() { delete[] state_; }
+
+  // Copy the specified status.
+  Status(const Status& s);
+  void operator=(const Status& s);  // returns void: assignment cannot chain
+
+  // Return a success status.
+  static Status OK() { return Status(); }
+
+  // Return an error status of the appropriate type.
+  static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kNotFound, msg, msg2);
+  }
+  static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kCorruption, msg, msg2);
+  }
+  static Status NotSupported(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kNotSupported, msg, msg2);
+  }
+  static Status InvalidArgument(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kInvalidArgument, msg, msg2);
+  }
+  static Status IOError(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kIOError, msg, msg2);
+  }
+  static Status MergeInProgress(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kMergeInProgress, msg, msg2);
+  }
+  static Status Incomplete(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kIncomplete, msg, msg2);
+  }
+
+  // Returns true iff the status indicates success.
+  bool ok() const { return (state_ == nullptr); }
+
+  // Returns true iff the status indicates a NotFound error.
+  bool IsNotFound() const { return code() == kNotFound; }
+
+  // Returns true iff the status indicates a Corruption error.
+  bool IsCorruption() const { return code() == kCorruption; }
+
+  // Returns true iff the status indicates a NotSupported error.
+  bool IsNotSupported() const { return code() == kNotSupported; }
+
+  // Returns true iff the status indicates an InvalidArgument error.
+  bool IsInvalidArgument() const { return code() == kInvalidArgument; }
+
+  // Returns true iff the status indicates an IOError.
+  bool IsIOError() const { return code() == kIOError; }
+
+  // Returns true iff the status indicates a MergeInProgress error.
+  bool IsMergeInProgress() const { return code() == kMergeInProgress; }
+
+  // Returns true iff the status indicates an Incomplete error.
+  bool IsIncomplete() const { return code() == kIncomplete; }
+
+  // Return a string representation of this status suitable for printing.
+  // Returns the string "OK" for success.
+  std::string ToString() const;
+
+ private:
+  // OK status has a nullptr state_.  Otherwise, state_ is a new[] array
+  // of the following form:
+  //    state_[0..3] == length of message
+  //    state_[4]    == code
+  //    state_[5..]  == message
+  const char* state_;
+
+  enum Code {
+    kOk = 0,
+    kNotFound = 1,
+    kCorruption = 2,
+    kNotSupported = 3,
+    kInvalidArgument = 4,
+    kIOError = 5,
+    kMergeInProgress = 6,
+    kIncomplete = 7
+  };
+
+  Code code() const {
+    return (state_ == nullptr) ? kOk : static_cast<Code>(state_[4]);
+  }
+
+  Status(Code code, const Slice& msg, const Slice& msg2);
+  static const char* CopyState(const char* s);
+};
+
+inline Status::Status(const Status& s)
+    : state_((s.state_ == nullptr) ? nullptr : CopyState(s.state_)) {
+}
+inline void Status::operator=(const Status& s) {
+  // Identical state_ pointers mean either self-assignment or the common
+  // case of two OK statuses (both nullptr); either way nothing changes.
+  if (state_ == s.state_) return;
+  // Release the old state before taking a private copy of the new one.
+  delete[] state_;
+  state_ = s.state_ ? CopyState(s.state_) : nullptr;
+}
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_STATUS_H_
--- a/include/rocksdb/table.h
+++ b/include/rocksdb/table.h
@@ -0,0 +1,180 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <memory>
+#include <stdint.h>
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/options.h"
+
+namespace rocksdb {
+
+struct Options;
+class RandomAccessFile;
+struct ReadOptions;
+class TableCache;
+class WritableFile;
+
+using std::unique_ptr;
+
+// TableBuilder provides the interface used to build a Table
+// (an immutable and sorted map from keys to values).
+//
+// Multiple threads can invoke const methods on a TableBuilder without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same TableBuilder must use
+// external synchronization.
+class TableBuilder {
+ public:
+  // REQUIRES: Either Finish() or Abandon() has been called.
+  virtual ~TableBuilder() {}
+
+  // Add key,value to the table being constructed.
+  // REQUIRES: key is after any previously added key according to comparator.
+  // REQUIRES: neither Finish() nor Abandon() has been called
+  virtual void Add(const Slice& key, const Slice& value) = 0;
+
+  // Return non-ok iff some error has been detected.
+  virtual Status status() const = 0;
+
+  // Finish building the table.
+  // REQUIRES: neither Finish() nor Abandon() has been called
+  virtual Status Finish() = 0;
+
+  // Indicate that the contents of this builder should be abandoned.
+  // If the caller is not going to call Finish(), it must call Abandon()
+  // before destroying this builder.
+  // REQUIRES: neither Finish() nor Abandon() has been called
+  virtual void Abandon() = 0;
+
+  // Number of calls to Add() so far.
+  virtual uint64_t NumEntries() const = 0;
+
+  // Size of the file generated so far.  If invoked after a successful
+  // Finish() call, returns the size of the final generated file.
+  virtual uint64_t FileSize() const = 0;
+};
+
+// A Table is a sorted map from strings to strings.  Tables are
+// immutable and persistent.  A Table may be safely accessed from
+// multiple threads without external synchronization.
+class TableReader {
+ public:
+  virtual ~TableReader() {}
+
+  // Determine whether there is a chance that the current table file
+  // contains a key starting with internal_prefix. The specific
+  // table implementation can use a bloom filter and/or other heuristics
+  // to filter out this table as a whole.
+  virtual bool PrefixMayMatch(const Slice& internal_prefix) = 0;
+
+  // Returns a new iterator over the table contents.
+  // The result of NewIterator() is initially invalid (caller must
+  // call one of the Seek methods on the iterator before using it).
+  virtual Iterator* NewIterator(const ReadOptions&) = 0;
+
+  // Given a key, return an approximate byte offset in the file where
+  // the data for that key begins (or would begin if the key were
+  // present in the file).  The returned value is in terms of file
+  // bytes, and so includes effects like compression of the underlying data.
+  // E.g., the approximate offset of the last key in the table will
+  // be close to the file length.
+  virtual uint64_t ApproximateOffsetOf(const Slice& key) = 0;
+
+  // Returns true if the block for the specified key is in cache.
+  // REQUIRES: key is in this table.
+  virtual bool TEST_KeyInCache(const ReadOptions& options,
+                               const Slice& key) = 0;
+
+  // Set up the table for Compaction. Might change some parameters with
+  // posix_fadvise
+  virtual void SetupForCompaction() = 0;
+
+  virtual TableProperties& GetTableProperties() = 0;
+
+  // Calls (*result_handler)(handle_context, ...) repeatedly, starting with
+  // the entry found after a call to Seek(key), until result_handler returns
+  // false, where k is the actual internal key for a row found and v is the
+  // value of that key. didIO is true if I/O is involved in the operation. May
+  // not make such a call if filter policy says that key is not present.
+  //
+  // mark_key_may_exist_handler needs to be called, with handle_context as its
+  // parameter, when the read is configured to be memory only and the key is
+  // not found in the block cache.
+  //
+  // readOptions is the options for the read
+  // key is the key to search for
+  virtual Status Get(
+      const ReadOptions& readOptions,
+      const Slice& key,
+      void* handle_context,
+      bool (*result_handler)(void* handle_context, const Slice& k,
+                             const Slice& v, bool didIO),
+      void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) = 0;
+};
+
+// A base class for table factories
+class TableFactory {
+ public:
+  virtual ~TableFactory() {}
+
+  // The type of the table.
+  //
+  // The client of this package should switch to a new name whenever
+  // the table format implementation changes.
+  //
+  // Names starting with "rocksdb." are reserved and should not be used
+  // by any clients of this package.
+  virtual const char* Name() const = 0;
+
+  // Returns a table reader that can fetch data from the file specified
+  // in parameter file. It's the caller's responsibility to make sure
+  // file is in the correct format.
+  //
+  // GetTableReader() is called in two places:
+  // (1) TableCache::FindTable() calls the function on a table cache miss
+  //     and caches the table object returned.
+  // (2) SstFileReader (for SST Dump) opens the table and dumps the table
+  //     contents using the iterator of the table.
+  // options and soptions are options. options is the general options.
+  // Multiple configurations can be accessed from there, including but not
+  // limited to block cache and key comparators.
+  // file is a file handle for the file that holds the table
+  // file_size is the physical file size of the file
+  // table_reader is the output table reader
+  virtual Status GetTableReader(
+      const Options& options, const EnvOptions& soptions,
+      unique_ptr<RandomAccessFile> && file, uint64_t file_size,
+      unique_ptr<TableReader>* table_reader) const = 0;
+
+  // Return a table builder to write to a file for this table type.
+  //
+  // It is called in several places:
+  // (1) When flushing memtable to a level-0 output file, it creates a table
+  //     builder (In DBImpl::WriteLevel0Table(), by calling BuildTable())
+  // (2) During compaction, it gets the builder for writing compaction output
+  //     files in DBImpl::OpenCompactionOutputFile().
+  // (3) When recovering from transaction logs, it creates a table builder to
+  //     write to a level-0 output file (In DBImpl::WriteLevel0TableForRecovery,
+  //     by calling BuildTable())
+  // (4) When running Repairer, it creates a table builder to convert logs to
+  //     SST files (In Repairer::ConvertLogToTable() by calling BuildTable())
+  //
+  // options is the general options. Multiple configurations can be accessed
+  // from there, including but not limited to compression options.
+  // file is a handle of a writable file. It is the caller's responsibility to
+  // keep the file open and close the file after closing the table builder.
+  // compression_type is the compression type to use in this table.
+  virtual TableBuilder* GetTableBuilder(
+      const Options& options, WritableFile* file,
+      CompressionType compression_type) const = 0;
+};
+}  // namespace rocksdb
--- a/include/rocksdb/table_properties.h
+++ b/include/rocksdb/table_properties.h
@@ -0,0 +1,90 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+#pragma once
+
+#include <string>
+#include <unordered_map>
+
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+// TableProperties contains a bunch of read-only properties of its associated
+// table.
+struct TableProperties {
+ public:
+  // Other than basic table properties, each table may also have
+  // user-collected properties.
+  // The values of the user-collected properties are encoded as raw bytes --
+  // users have to interpret these values by themselves.
+  typedef
+    std::unordered_map<std::string, std::string>
+    UserCollectedProperties;
+
+  // the total size of all data blocks.
+  uint64_t data_size = 0;
+  // the size of index block.
+  uint64_t index_size = 0;
+  // the size of filter block.
+  uint64_t filter_size = 0;
+  // total raw key size
+  uint64_t raw_key_size = 0;
+  // total raw value size
+  uint64_t raw_value_size = 0;
+  // the number of blocks in this table
+  uint64_t num_data_blocks = 0;
+  // the number of entries in this table
+  uint64_t num_entries = 0;
+
+  // The name of the filter policy used in this table.
+  // If no filter policy is used, `filter_policy_name` will be an empty string.
+  std::string filter_policy_name;
+
+  // user collected properties
+  UserCollectedProperties user_collected_properties;
+
+  // Convert this object to a human-readable form.
+  //   @prop_delim: delimiter for each property.
+  std::string ToString(
+      const std::string& prop_delim = "; ",
+      const std::string& kv_delim = "=") const;
+};
+
+// `TablePropertiesCollector` provides the mechanism for users to collect
+// the properties they are interested in. This class is essentially a
+// collection of callback functions that will be invoked during table building.
+class TablePropertiesCollector {
+ public:
+  virtual ~TablePropertiesCollector() { }
+
+  // Add() will be called when a new key/value pair is inserted into the table.
+  // @params key    the original key that is inserted into the table.
+  // @params value  the original value that is inserted into the table.
+  virtual Status Add(const Slice& key, const Slice& value) = 0;
+
+  // Finish() will be called when a table has already been built and is ready
+  // for writing the properties block.
+  // @params properties  User will add their collected statistics to
+  // `properties`.
+  virtual Status Finish(
+      TableProperties::UserCollectedProperties* properties) = 0;
+
+  // The name of the properties collector; can be used for debugging purposes.
+  virtual const char* Name() const = 0;
+
+  // Return the human-readable properties, where the key is the property name
+  // and the value is the human-readable form of the value.
+  virtual TableProperties::UserCollectedProperties
+    GetReadableProperties() const = 0;
+};
+
+// Extra properties
+// Below is a list of non-basic properties that are collected by the database
+// itself, in particular some properties regarding the internal keys (which
+// are unknown to `table`).
+extern uint64_t GetDeletedKeys(
+    const TableProperties::UserCollectedProperties& props);
+
+}  // namespace rocksdb
--- a/include/rocksdb/transaction_log.h
+++ b/include/rocksdb/transaction_log.h
@@ -0,0 +1,91 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_
+#define STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_
+
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/write_batch.h"
+#include <memory>
+#include <vector>
+
+namespace rocksdb {
+
+class LogFile;  // forward declaration; the class is defined further below
+typedef std::vector<std::unique_ptr<LogFile>> VectorLogPtr;
+
+enum  WalFileType {
+  /* Indicates that the WAL file is in the archive directory. WAL files are
+   * moved from the main db directory to the archive directory once they are
+   * no longer live and stay there until cleaned up. Files are cleaned
+   * depending on archive size (Options::WAL_size_limit_MB) and time since
+   * last cleaning (Options::WAL_ttl_seconds).
+   */
+  kArchivedLogFile = 0,
+
+  /* Indicates that WAL file is live and resides in the main db directory. */
+  kAliveLogFile = 1
+} ;
+
+class LogFile {
+ public:
+  LogFile() {}
+  virtual ~LogFile() {}
+
+  // Returns the log file's pathname relative to the main db dir.
+  // E.g. for a live log file:      /000003.log
+  //      for an archived log file: /archive/000003.log
+  virtual std::string PathName() const = 0;
+
+
+  // Primary identifier for the log file.
+  // This is directly proportional to the creation time of the log file.
+  virtual uint64_t LogNumber() const = 0;
+
+  // A log file can be either alive or archived (see WalFileType).
+  virtual WalFileType Type() const = 0;
+
+  // Starting sequence number of the write batch written in this log file.
+  virtual SequenceNumber StartSequence() const = 0;
+
+  // Size of the log file on disk, in bytes.
+  virtual uint64_t SizeFileBytes() const = 0;
+};
+
+struct BatchResult {
+  SequenceNumber sequence = SequenceNumber();  // value-initialized by default
+  std::unique_ptr<WriteBatch> writeBatchPtr;   // owning pointer to the batch
+};
+
+// A TransactionLogIterator is used to iterate over the transactions in a db.
+// One run of the iterator is continuous, i.e. the iterator will stop at the
+// beginning of any gap in sequences.
+class TransactionLogIterator {
+ public:
+  TransactionLogIterator() {}
+  virtual ~TransactionLogIterator() {}
+
+  // An iterator is either positioned at a WriteBatch or not valid.
+  // This method returns true if the iterator is valid.
+  // Data can be read from a valid iterator.
+  virtual bool Valid() = 0;
+
+  // Moves the iterator to the next WriteBatch.
+  // REQUIRES: Valid() to be true.
+  virtual void Next() = 0;
+
+  // Returns ok if the iterator is valid.
+  // Returns the error when something has gone wrong.
+  virtual Status status() = 0;
+
+  // If valid, returns the current write batch and the sequence number of the
+  // earliest transaction contained in the batch.
+  // ONLY use if Valid() is true and status() is OK.
+  virtual BatchResult GetBatch() = 0;
+};
+} //  namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_
--- a/include/rocksdb/types.h
+++ b/include/rocksdb/types.h
@@ -0,0 +1,20 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_TYPES_H_
+#define STORAGE_ROCKSDB_INCLUDE_TYPES_H_
+
+#include <stdint.h>
+
+namespace rocksdb {
+
+// Define all public custom types here.
+
+// Represents a sequence number in a WAL file (an unsigned 64-bit integer).
+typedef uint64_t SequenceNumber;
+
+}  //  namespace rocksdb
+
+#endif //  STORAGE_ROCKSDB_INCLUDE_TYPES_H_
--- a/include/rocksdb/universal_compaction.h
+++ b/include/rocksdb/universal_compaction.h
@@ -0,0 +1,89 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H
+#define STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H
+
+#include <stddef.h>
+#include <string>
+#include <memory>
+#include <vector>
+#include <stdint.h>
+#include <climits>
+#include "rocksdb/slice.h"
+#include "rocksdb/statistics.h"
+
+namespace rocksdb {
+
+//
+// Algorithm used to decide when a compaction request stops picking new files
+// into a single compaction run
+//
+enum CompactionStopStyle {
+  kCompactionStopStyleSimilarSize, // pick files of similar size
+  kCompactionStopStyleTotalSize    // total size of picked files > next file
+};
+
+class CompactionOptionsUniversal {
+ public:
+
+  // Percentage flexibility while comparing file size. If the candidate file(s)
+  // size is 1% smaller than the next file's size, then include next file into
+  // this candidate set. Default: 1
+  unsigned int size_ratio;
+
+  // The minimum number of files in a single compaction run. Default: 2
+  unsigned int min_merge_width;
+
+  // The maximum number of files in a single compaction run. Default: UINT_MAX
+  unsigned int max_merge_width;
+
+  // The size amplification is defined as the amount (in percentage) of
+  // additional storage needed to store a single byte of data in the database.
+  // For example, a size amplification of 2% means that a database that
+  // contains 100 bytes of user-data may occupy up to 102 bytes of
+  // physical storage. By this definition, a fully compacted database has
+  // a size amplification of 0%. Rocksdb uses the following heuristic
+  // to calculate size amplification: it assumes that all files excluding
+  // the earliest file contribute to the size amplification.
+  // Default: 200, which means that a 100 byte database could require up to
+  // 300 bytes of storage.
+  unsigned int max_size_amplification_percent;
+
+  // If this option is set to be -1 (the default value), all the output files
+  // will follow the compression type specified.
+  //
+  // If this option is not negative, we will try to make sure compressed
+  // size is just above this value. In normal cases, at least this percentage
+  // of data will be compressed.
+  // When we are compacting to a new file, the criterion for whether it needs
+  // to be compressed is as follows. Assume this is the list of files sorted
+  // by generation time:
+  //    A1...An B1...Bm C1...Ct
+  // where A1 is the newest and Ct is the oldest, and we are going to compact
+  // B1...Bm. We calculate the total size of all the files as total_size, as
+  // well as the total size of C1...Ct as total_C. The compaction output file
+  // will be compressed iff
+  //   total_C / total_size < this percentage
+  int compression_size_percent;
+
+  // The algorithm used to stop picking files into a single compaction run
+  // Default: kCompactionStopStyleTotalSize
+  CompactionStopStyle stop_style;
+
+  // Default set of parameters
+  CompactionOptionsUniversal() :
+    size_ratio(1),
+    min_merge_width(2),
+    max_merge_width(UINT_MAX),
+    max_size_amplification_percent(200),
+    compression_size_percent(-1),
+    stop_style(kCompactionStopStyleTotalSize) {
+  }
+};
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H
--- a/include/rocksdb/write_batch.h
+++ b/include/rocksdb/write_batch.h
@@ -0,0 +1,109 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// WriteBatch holds a collection of updates to apply atomically to a DB.
+//
+// The updates are applied in the order in which they are added
+// to the WriteBatch.  For example, the value of "key" will be "v3"
+// after the following batch is written:
+//
+//    batch.Put("key", "v1");
+//    batch.Delete("key");
+//    batch.Put("key", "v2");
+//    batch.Put("key", "v3");
+//
+// Multiple threads can invoke const methods on a WriteBatch without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same WriteBatch must use
+// external synchronization.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_
+#define STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_
+
+#include <string>
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+class Slice;
+struct SliceParts;
+
+class WriteBatch {
+ public:
+  WriteBatch();
+  ~WriteBatch();
+
+  // Store the mapping "key->value" in the database.
+  void Put(const Slice& key, const Slice& value);
+
+  // Variant of Put() that gathers output like writev(2).  The key and value
+  // that will be written to the database are concatenations of arrays of
+  // slices.
+  void Put(const SliceParts& key, const SliceParts& value);
+
+  // Merge "value" with the existing value of "key" in the database.
+  // "key->merge(existing, value)"
+  void Merge(const Slice& key, const Slice& value);
+
+  // If the database contains a mapping for "key", erase it.  Else do nothing.
+  void Delete(const Slice& key);
+
+  // Append a blob of arbitrary size to the records in this batch. The blob will
+  // be stored in the transaction log but not in any other file. In particular,
+  // it will not be persisted to the SST files. When iterating over this
+  // WriteBatch, WriteBatch::Handler::LogData will be called with the contents
+  // of the blob as it is encountered. Blobs, puts, deletes, and merges will be
+  // encountered in the same order in which they were inserted. The blob will
+  // NOT consume sequence number(s) and will NOT increase the count of the batch.
+  //
+  // Example application: add timestamps to the transaction log for use in
+  // replication.
+  void PutLogData(const Slice& blob);
+
+  // Clear all updates buffered in this batch.
+  void Clear();
+
+  // Support for iterating over the contents of a batch.
+  class Handler {
+   public:
+    virtual ~Handler();
+    virtual void Put(const Slice& key, const Slice& value) = 0;
+    // Merge and LogData are not pure virtual. Otherwise, we would break
+    // existing clients of Handler on a source code level. The default
+    // implementation of Merge simply throws a runtime exception.
+    virtual void Merge(const Slice& key, const Slice& value);
+    // The default implementation of LogData does nothing.
+    virtual void LogData(const Slice& blob);
+    virtual void Delete(const Slice& key) = 0;
+    // Continue is called by WriteBatch::Iterate. If it returns false,
+    // iteration is halted. Otherwise, it continues iterating. The default
+    // implementation always returns true.
+    virtual bool Continue();
+  };
+  Status Iterate(Handler* handler) const;
+
+  // Retrieve a copy of the serialized batch; callable on a const WriteBatch.
+  std::string Data() const { return rep_; }
+
+  // Returns the number of updates in the batch
+  int Count() const;
+
+  // Constructor with a serialized string object
+  explicit WriteBatch(std::string rep): rep_(rep) {}
+
+ private:
+  friend class WriteBatchInternal;
+
+  std::string rep_;  // See comment in write_batch.cc for the format of rep_
+
+  // Intentionally copyable
+};
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_
--- a/include/utilities/stackable_db.h
+++ b/include/utilities/stackable_db.h
@@ -0,0 +1,161 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "rocksdb/db.h"
+
+namespace rocksdb {
+
+// This class contains APIs to stack rocksdb wrappers, e.g. TTL over a base db
+class StackableDB : public DB {
+ public:
+  explicit StackableDB(StackableDB* sdb) : sdb_(sdb) {}
+
+  // Returns the DB object that is the lowermost component in the stack of DBs
+  virtual DB* GetRawDB() {
+    return sdb_->GetRawDB();
+  }
+
+  // Convert a DB to a StackableDB.
+  // TODO: This function does not work yet. NewStackableDB passes nullptr as
+  //       sdb_ to StackableDB, so every forwarding method segfaults when used.
+  static StackableDB* DBToStackableDB(DB* db) {
+    class NewStackableDB : public StackableDB {
+     public:
+      NewStackableDB(DB* db)
+        : StackableDB(nullptr),
+          db_(db) {}
+
+      DB* GetRawDB() {
+        return db_;
+      }
+
+     private:
+      DB* db_;
+    };
+    return new NewStackableDB(db);
+  }
+
+  virtual Status Put(const WriteOptions& options,
+                     const Slice& key,
+                     const Slice& val) override {
+    return sdb_->Put(options, key, val);
+  }
+
+  virtual Status Get(const ReadOptions& options,
+                     const Slice& key,
+                     std::string* value) override {
+    return sdb_->Get(options, key, value);
+  }
+
+  virtual std::vector<Status> MultiGet(const ReadOptions& options,
+                                       const std::vector<Slice>& keys,
+                                       std::vector<std::string>* values)
+    override {
+      return sdb_->MultiGet(options, keys, values);
+  }
+
+  virtual bool KeyMayExist(const ReadOptions& options,
+                           const Slice& key,
+                           std::string* value,
+                           bool* value_found = nullptr) override {
+    return sdb_->KeyMayExist(options, key, value, value_found);
+  }
+
+  virtual Status Delete(const WriteOptions& wopts, const Slice& key) override {
+    return sdb_->Delete(wopts, key);
+  }
+
+  virtual Status Merge(const WriteOptions& options,
+                       const Slice& key,
+                       const Slice& value) override {
+    return sdb_->Merge(options, key, value);
+  }
+
+
+  virtual Status Write(const WriteOptions& opts, WriteBatch* updates)
+    override {
+      return sdb_->Write(opts, updates);
+  }
+
+  virtual Iterator* NewIterator(const ReadOptions& opts) override {
+    return sdb_->NewIterator(opts);
+  }
+
+  virtual const Snapshot* GetSnapshot() override {
+    return sdb_->GetSnapshot();
+  }
+
+  virtual void ReleaseSnapshot(const Snapshot* snapshot) override {
+    return sdb_->ReleaseSnapshot(snapshot);
+  }
+
+  virtual bool GetProperty(const Slice& property, std::string* value)
+    override {
+      return sdb_->GetProperty(property, value);
+  }
+
+  virtual void GetApproximateSizes(const Range* r, int n, uint64_t* sizes)
+    override {
+      return sdb_->GetApproximateSizes(r, n, sizes);
+  }
+
+  virtual void CompactRange(const Slice* begin, const Slice* end,
+                            bool reduce_level = false,
+                            int target_level = -1) override {
+    return sdb_->CompactRange(begin, end, reduce_level, target_level);
+  }
+
+  virtual int NumberLevels() override {
+    return sdb_->NumberLevels();
+  }
+
+  virtual int MaxMemCompactionLevel() override {
+    return sdb_->MaxMemCompactionLevel();
+  }
+
+  virtual int Level0StopWriteTrigger() override {
+    return sdb_->Level0StopWriteTrigger();
+  }
+
+  virtual Status Flush(const FlushOptions& fopts) override {
+    return sdb_->Flush(fopts);
+  }
+
+  virtual Status DisableFileDeletions() override {
+    return sdb_->DisableFileDeletions();
+  }
+
+  virtual Status EnableFileDeletions() override {
+    return sdb_->EnableFileDeletions();
+  }
+
+  virtual Status GetLiveFiles(std::vector<std::string>& vec, uint64_t* mfs,
+                              bool flush_memtable = true) override {
+      return sdb_->GetLiveFiles(vec, mfs, flush_memtable);
+  }
+
+  virtual SequenceNumber GetLatestSequenceNumber() const override {
+    return sdb_->GetLatestSequenceNumber();
+  }
+
+  virtual Status GetSortedWalFiles(VectorLogPtr& files) override {
+    return sdb_->GetSortedWalFiles(files);
+  }
+
+  virtual Status DeleteFile(std::string name) override {
+    return sdb_->DeleteFile(name);
+  }
+
+  virtual Status GetUpdatesSince(SequenceNumber seq_number,
+                                 unique_ptr<TransactionLogIterator>* iter)
+    override {
+      return sdb_->GetUpdatesSince(seq_number, iter);
+  }
+
+ protected:
+  StackableDB* sdb_;  // next DB in the stack; calls above are forwarded to it
+};
+
+} //  namespace rocksdb
--- a/include/utilities/utility_db.h
+++ b/include/utilities/utility_db.h
@@ -0,0 +1,50 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "stackable_db.h"
+
+namespace rocksdb {
+
+// This class contains APIs to open rocksdb with specific support, e.g. TTL
+class UtilityDB {
+
+  public:
+    // Open the database with TTL support.
+    //
+    // USE-CASES:
+    // This API should be used to open the db when key-values inserted are
+    //  meant to be removed from the db in a non-strict 'ttl' amount of time.
+    //  Therefore, this guarantees that key-values inserted will remain in the
+    //  db for >= ttl amount of time and the db will make efforts to remove the
+    //  key-values as soon as possible after ttl seconds of their insertion.
+    //
+    // BEHAVIOUR:
+    // TTL is accepted in seconds.
+    // (int32_t)Timestamp(creation) is suffixed to values in Put internally.
+    // Expired TTL values deleted in compaction only: (Timestamp+ttl<time_now)
+    // Get/Iterator may return expired entries (compaction not run on them yet)
+    // Different TTL may be used during different Opens.
+    // Example: Open1 at t=0 with ttl=4 and insert k1,k2, close at t=2
+    //          Open2 at t=3 with ttl=5. Now k1,k2 should be deleted at t>=5
+    // read_only=true opens in the usual read-only mode. Compactions will not be
+    //  triggered (neither manual nor automatic), so no expired entries removed
+    //
+    // CONSTRAINTS:
+    // Not specifying/passing or non-positive TTL behaves like TTL = infinity
+    //
+    // !!!WARNING!!!:
+    // Calling DB::Open directly to re-open a db created by this API will get
+    //  corrupt values (timestamp suffixed) and no ttl effect will be there
+    //  during the second Open, so use this API consistently to open the db.
+    // Be careful when passing ttl with a small positive value because the
+    //  whole database may be deleted in a small amount of time.
+    static Status OpenTtlDB(const Options& options,
+                            const std::string& name,
+                            StackableDB** dbptr,
+                            int32_t ttl = 0,
+                            bool read_only = false);
+};
+
+} //  namespace rocksdb