Squashed 'src/rocksdb/' content from commit 457bae6

git-subtree-dir: src/rocksdb git-subtree-split: 457bae6911
2026-04-29 15:37:57 +00:00 · 2014-06-04 16:10:38 -07:00
commit 8514b88974
379 changed files with 108214 additions and 0 deletions
--- a/include/rocksdb/c.h
+++ b/include/rocksdb/c.h
@@ -0,0 +1,575 @@
+/*  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+  This source code is licensed under the BSD-style license found in the
+  LICENSE file in the root directory of this source tree. An additional grant
+  of patent rights can be found in the PATENTS file in the same directory.
+ Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+  Use of this source code is governed by a BSD-style license that can be
+  found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+  C bindings for leveldb.  May be useful as a stable ABI that can be
+  used by programs that keep leveldb in a shared library, or for
+  a JNI api.
+
+  Does not support:
+  . getters for the option types
+  . custom comparators that implement key shortening
+  . capturing post-write-snapshot
+  . custom iter, db, env, cache implementations using just the C bindings
+
+  Some conventions:
+
+  (1) We expose just opaque struct pointers and functions to clients.
+  This allows us to change internal representations without having to
+  recompile clients.
+
+  (2) For simplicity, there is no equivalent to the Slice type.  Instead,
+  the caller has to pass the pointer and length as separate
+  arguments.
+
+  (3) Errors are represented by a null-terminated c string.  NULL
+  means no error.  All operations that can raise an error are passed
+  a "char** errptr" as the last argument.  One of the following must
+  be true on entry:
+     *errptr == NULL
+     *errptr points to a malloc()ed null-terminated error message
+  On success, a leveldb routine leaves *errptr unchanged.
+  On failure, leveldb frees the old value of *errptr and
+  set *errptr to a malloc()ed error message.
+
+  (4) Bools have the type unsigned char (0 == false; rest == true)
+
+  (5) All of the pointer arguments must be non-NULL.
+*/
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_C_H_
+#define STORAGE_ROCKSDB_INCLUDE_C_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdint.h>
+
+/* Exported types */
+
+typedef struct rocksdb_t                 rocksdb_t;
+typedef struct rocksdb_cache_t           rocksdb_cache_t;
+typedef struct rocksdb_comparator_t      rocksdb_comparator_t;
+typedef struct rocksdb_env_t             rocksdb_env_t;
+typedef struct rocksdb_filelock_t        rocksdb_filelock_t;
+typedef struct rocksdb_filterpolicy_t    rocksdb_filterpolicy_t;
+typedef struct rocksdb_flushoptions_t    rocksdb_flushoptions_t;
+typedef struct rocksdb_iterator_t        rocksdb_iterator_t;
+typedef struct rocksdb_logger_t          rocksdb_logger_t;
+typedef struct rocksdb_mergeoperator_t   rocksdb_mergeoperator_t;
+typedef struct rocksdb_options_t         rocksdb_options_t;
+typedef struct rocksdb_randomfile_t      rocksdb_randomfile_t;
+typedef struct rocksdb_readoptions_t     rocksdb_readoptions_t;
+typedef struct rocksdb_seqfile_t         rocksdb_seqfile_t;
+typedef struct rocksdb_slicetransform_t  rocksdb_slicetransform_t;
+typedef struct rocksdb_snapshot_t        rocksdb_snapshot_t;
+typedef struct rocksdb_writablefile_t    rocksdb_writablefile_t;
+typedef struct rocksdb_writebatch_t      rocksdb_writebatch_t;
+typedef struct rocksdb_writeoptions_t    rocksdb_writeoptions_t;
+typedef struct rocksdb_universal_compaction_options_t rocksdb_universal_compaction_options_t;
+typedef struct rocksdb_livefiles_t     rocksdb_livefiles_t;
+
+/* DB operations */
+
+extern rocksdb_t* rocksdb_open(
+    const rocksdb_options_t* options,
+    const char* name,
+    char** errptr);
+
+extern rocksdb_t* rocksdb_open_for_read_only(
+    const rocksdb_options_t* options,
+    const char* name,
+    unsigned char error_if_log_file_exist,
+    char** errptr);
+
+extern void rocksdb_close(rocksdb_t* db);
+
+extern void rocksdb_put(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    const char* key, size_t keylen,
+    const char* val, size_t vallen,
+    char** errptr);
+
+extern void rocksdb_delete(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    const char* key, size_t keylen,
+    char** errptr);
+
+extern void rocksdb_merge(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    const char* key, size_t keylen,
+    const char* val, size_t vallen,
+    char** errptr);
+
+extern void rocksdb_write(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    rocksdb_writebatch_t* batch,
+    char** errptr);
+
+/* Returns NULL if not found.  A malloc()ed array otherwise.
+   Stores the length of the array in *vallen. */
+extern char* rocksdb_get(
+    rocksdb_t* db,
+    const rocksdb_readoptions_t* options,
+    const char* key, size_t keylen,
+    size_t* vallen,
+    char** errptr);
+
+extern rocksdb_iterator_t* rocksdb_create_iterator(
+    rocksdb_t* db,
+    const rocksdb_readoptions_t* options);
+
+extern const rocksdb_snapshot_t* rocksdb_create_snapshot(
+    rocksdb_t* db);
+
+extern void rocksdb_release_snapshot(
+    rocksdb_t* db,
+    const rocksdb_snapshot_t* snapshot);
+
+/* Returns NULL if property name is unknown.
+   Else returns a pointer to a malloc()-ed null-terminated value. */
+extern char* rocksdb_property_value(
+    rocksdb_t* db,
+    const char* propname);
+
+extern void rocksdb_approximate_sizes(
+    rocksdb_t* db,
+    int num_ranges,
+    const char* const* range_start_key, const size_t* range_start_key_len,
+    const char* const* range_limit_key, const size_t* range_limit_key_len,
+    uint64_t* sizes);
+
+extern void rocksdb_compact_range(
+    rocksdb_t* db,
+    const char* start_key, size_t start_key_len,
+    const char* limit_key, size_t limit_key_len);
+
+extern void rocksdb_delete_file(
+    rocksdb_t* db,
+    const char* name);
+
+extern const rocksdb_livefiles_t* rocksdb_livefiles(
+    rocksdb_t* db);
+
+extern void rocksdb_flush(
+    rocksdb_t* db,
+    const rocksdb_flushoptions_t* options,
+    char** errptr);
+
+extern void rocksdb_disable_file_deletions(
+    rocksdb_t* db,
+    char** errptr);
+
+extern void rocksdb_enable_file_deletions(
+    rocksdb_t* db,
+    unsigned char force,
+    char** errptr);
+
+/* Management operations */
+
+extern void rocksdb_destroy_db(
+    const rocksdb_options_t* options,
+    const char* name,
+    char** errptr);
+
+extern void rocksdb_repair_db(
+    const rocksdb_options_t* options,
+    const char* name,
+    char** errptr);
+
+/* Iterator */
+
+extern void rocksdb_iter_destroy(rocksdb_iterator_t*);
+extern unsigned char rocksdb_iter_valid(const rocksdb_iterator_t*);
+extern void rocksdb_iter_seek_to_first(rocksdb_iterator_t*);
+extern void rocksdb_iter_seek_to_last(rocksdb_iterator_t*);
+extern void rocksdb_iter_seek(rocksdb_iterator_t*, const char* k, size_t klen);
+extern void rocksdb_iter_next(rocksdb_iterator_t*);
+extern void rocksdb_iter_prev(rocksdb_iterator_t*);
+extern const char* rocksdb_iter_key(const rocksdb_iterator_t*, size_t* klen);
+extern const char* rocksdb_iter_value(const rocksdb_iterator_t*, size_t* vlen);
+extern void rocksdb_iter_get_error(const rocksdb_iterator_t*, char** errptr);
+
+/* Write batch */
+
+extern rocksdb_writebatch_t* rocksdb_writebatch_create();
+extern void rocksdb_writebatch_destroy(rocksdb_writebatch_t*);
+extern void rocksdb_writebatch_clear(rocksdb_writebatch_t*);
+extern int rocksdb_writebatch_count(rocksdb_writebatch_t*);
+extern void rocksdb_writebatch_put(
+    rocksdb_writebatch_t*,
+    const char* key, size_t klen,
+    const char* val, size_t vlen);
+extern void rocksdb_writebatch_merge(
+    rocksdb_writebatch_t*,
+    const char* key, size_t klen,
+    const char* val, size_t vlen);
+extern void rocksdb_writebatch_delete(
+    rocksdb_writebatch_t*,
+    const char* key, size_t klen);
+extern void rocksdb_writebatch_iterate(
+    rocksdb_writebatch_t*,
+    void* state,
+    void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
+    void (*deleted)(void*, const char* k, size_t klen));
+extern const char* rocksdb_writebatch_data(rocksdb_writebatch_t*, size_t *size);
+
+/* Options */
+
+extern rocksdb_options_t* rocksdb_options_create();
+extern void rocksdb_options_destroy(rocksdb_options_t*);
+extern void rocksdb_options_set_comparator(
+    rocksdb_options_t*,
+    rocksdb_comparator_t*);
+extern void rocksdb_options_set_merge_operator(rocksdb_options_t*,
+                                               rocksdb_mergeoperator_t*);
+extern void rocksdb_options_set_compression_per_level(
+  rocksdb_options_t* opt,
+  int* level_values,
+  size_t num_levels);
+extern void rocksdb_options_set_filter_policy(
+    rocksdb_options_t*,
+    rocksdb_filterpolicy_t*);
+extern void rocksdb_options_set_create_if_missing(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_error_if_exists(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_paranoid_checks(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_env(rocksdb_options_t*, rocksdb_env_t*);
+extern void rocksdb_options_set_info_log(rocksdb_options_t*, rocksdb_logger_t*);
+extern void rocksdb_options_set_info_log_level(rocksdb_options_t*, int);
+extern void rocksdb_options_set_write_buffer_size(rocksdb_options_t*, size_t);
+extern void rocksdb_options_set_max_open_files(rocksdb_options_t*, int);
+extern void rocksdb_options_set_cache(rocksdb_options_t*, rocksdb_cache_t*);
+extern void rocksdb_options_set_cache_compressed(rocksdb_options_t*, rocksdb_cache_t*);
+extern void rocksdb_options_set_block_size(rocksdb_options_t*, size_t);
+extern void rocksdb_options_set_block_restart_interval(rocksdb_options_t*, int);
+extern void rocksdb_options_set_compression_options(
+    rocksdb_options_t*, int, int, int);
+extern void rocksdb_options_set_whole_key_filtering(rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_prefix_extractor(
+    rocksdb_options_t*, rocksdb_slicetransform_t*);
+extern void rocksdb_options_set_num_levels(rocksdb_options_t*, int);
+extern void rocksdb_options_set_level0_file_num_compaction_trigger(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_level0_slowdown_writes_trigger(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_level0_stop_writes_trigger(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_max_mem_compaction_level(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_target_file_size_base(
+    rocksdb_options_t*, uint64_t);
+extern void rocksdb_options_set_target_file_size_multiplier(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_max_bytes_for_level_base(
+    rocksdb_options_t*, uint64_t);
+extern void rocksdb_options_set_max_bytes_for_level_multiplier(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_expanded_compaction_factor(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_max_grandparent_overlap_factor(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_max_bytes_for_level_multiplier_additional(
+    rocksdb_options_t*, int* level_values, size_t num_levels);
+extern void rocksdb_options_enable_statistics(rocksdb_options_t*);
+
+extern void rocksdb_options_set_max_write_buffer_number(rocksdb_options_t*, int);
+extern void rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t*, int);
+extern void rocksdb_options_set_max_background_compactions(rocksdb_options_t*, int);
+extern void rocksdb_options_set_max_background_flushes(rocksdb_options_t*, int);
+extern void rocksdb_options_set_max_log_file_size(rocksdb_options_t*, size_t);
+extern void rocksdb_options_set_log_file_time_to_roll(rocksdb_options_t*, size_t);
+extern void rocksdb_options_set_keep_log_file_num(rocksdb_options_t*, size_t);
+extern void rocksdb_options_set_soft_rate_limit(rocksdb_options_t*, double);
+extern void rocksdb_options_set_hard_rate_limit(rocksdb_options_t*, double);
+extern void rocksdb_options_set_rate_limit_delay_max_milliseconds(
+    rocksdb_options_t*, unsigned int);
+extern void rocksdb_options_set_max_manifest_file_size(
+    rocksdb_options_t*, size_t);
+extern void rocksdb_options_set_no_block_cache(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_table_cache_numshardbits(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_table_cache_remove_scan_count_limit(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_arena_block_size(
+    rocksdb_options_t*, size_t);
+extern void rocksdb_options_set_use_fsync(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_db_stats_log_interval(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_db_log_dir(
+    rocksdb_options_t*, const char*);
+extern void rocksdb_options_set_wal_dir(
+    rocksdb_options_t*, const char*);
+extern void rocksdb_options_set_WAL_ttl_seconds(
+    rocksdb_options_t*, uint64_t);
+extern void rocksdb_options_set_WAL_size_limit_MB(
+    rocksdb_options_t*, uint64_t);
+extern void rocksdb_options_set_manifest_preallocation_size(
+    rocksdb_options_t*, size_t);
+extern void rocksdb_options_set_purge_redundant_kvs_while_flush(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_allow_os_buffer(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_allow_mmap_reads(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_allow_mmap_writes(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_is_fd_close_on_exec(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_skip_log_error_on_recovery(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_stats_dump_period_sec(
+    rocksdb_options_t*, unsigned int);
+extern void rocksdb_options_set_block_size_deviation(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_advise_random_on_open(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_access_hint_on_compaction_start(
+    rocksdb_options_t*, int);
+extern void rocksdb_options_set_use_adaptive_mutex(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_bytes_per_sync(
+    rocksdb_options_t*, uint64_t);
+extern void rocksdb_options_set_verify_checksums_in_compaction(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_filter_deletes(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_max_sequential_skip_in_iterations(
+    rocksdb_options_t*, uint64_t);
+extern void rocksdb_options_set_disable_data_sync(rocksdb_options_t*, int);
+extern void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t*, int);
+extern void rocksdb_options_set_disable_seek_compaction(rocksdb_options_t*, int);
+extern void rocksdb_options_set_delete_obsolete_files_period_micros(
+    rocksdb_options_t*, uint64_t);
+extern void rocksdb_options_set_source_compaction_factor(rocksdb_options_t*, int);
+extern void rocksdb_options_prepare_for_bulk_load(rocksdb_options_t*);
+extern void rocksdb_options_set_memtable_vector_rep(rocksdb_options_t*);
+extern void rocksdb_options_set_hash_skip_list_rep(rocksdb_options_t*, size_t, int32_t, int32_t);
+extern void rocksdb_options_set_hash_link_list_rep(rocksdb_options_t*, size_t);
+extern void rocksdb_options_set_plain_table_factory(rocksdb_options_t*, uint32_t, int, double, size_t);
+
+extern void rocksdb_options_set_max_bytes_for_level_base(rocksdb_options_t* opt, uint64_t n);
+extern void rocksdb_options_set_stats_dump_period_sec(rocksdb_options_t* opt, unsigned int sec);
+
+extern void rocksdb_options_set_min_level_to_compress(rocksdb_options_t* opt, int level);
+
+extern void rocksdb_options_set_memtable_prefix_bloom_bits(
+    rocksdb_options_t*, uint32_t);
+extern void rocksdb_options_set_memtable_prefix_bloom_probes(
+    rocksdb_options_t*, uint32_t);
+extern void rocksdb_options_set_max_successive_merges(
+    rocksdb_options_t*, size_t);
+extern void rocksdb_options_set_min_partial_merge_operands(
+    rocksdb_options_t*, uint32_t);
+extern void rocksdb_options_set_bloom_locality(
+    rocksdb_options_t*, uint32_t);
+extern void rocksdb_options_set_allow_thread_local(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_inplace_update_support(
+    rocksdb_options_t*, unsigned char);
+extern void rocksdb_options_set_inplace_update_num_locks(
+    rocksdb_options_t*, size_t);
+
+enum {
+  rocksdb_no_compression = 0,
+  rocksdb_snappy_compression = 1,
+  rocksdb_zlib_compression = 2,
+  rocksdb_bz2_compression = 3,
+  rocksdb_lz4_compression = 4,
+  rocksdb_lz4hc_compression = 5
+};
+extern void rocksdb_options_set_compression(rocksdb_options_t*, int);
+
+enum {
+  rocksdb_level_compaction = 0,
+  rocksdb_universal_compaction = 1
+};
+extern void rocksdb_options_set_compaction_style(rocksdb_options_t*, int);
+extern void rocksdb_options_set_universal_compaction_options(rocksdb_options_t*, rocksdb_universal_compaction_options_t*);
+/* Comparator */
+
+extern rocksdb_comparator_t* rocksdb_comparator_create(
+    void* state,
+    void (*destructor)(void*),
+    int (*compare)(
+        void*,
+        const char* a, size_t alen,
+        const char* b, size_t blen),
+    const char* (*name)(void*));
+extern void rocksdb_comparator_destroy(rocksdb_comparator_t*);
+
+/* Filter policy */
+
+extern rocksdb_filterpolicy_t* rocksdb_filterpolicy_create(
+    void* state,
+    void (*destructor)(void*),
+    char* (*create_filter)(
+        void*,
+        const char* const* key_array, const size_t* key_length_array,
+        int num_keys,
+        size_t* filter_length),
+    unsigned char (*key_may_match)(
+        void*,
+        const char* key, size_t length,
+        const char* filter, size_t filter_length),
+    void (*delete_filter)(
+        void*,
+        const char* filter, size_t filter_length),
+    const char* (*name)(void*));
+extern void rocksdb_filterpolicy_destroy(rocksdb_filterpolicy_t*);
+
+extern rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom(
+    int bits_per_key);
+
+/* Merge Operator */
+
+extern rocksdb_mergeoperator_t* rocksdb_mergeoperator_create(
+    void* state,
+    void (*destructor)(void*),
+    char* (*full_merge)(
+        void*,
+        const char* key, size_t key_length,
+        const char* existing_value, size_t existing_value_length,
+        const char* const* operands_list, const size_t* operands_list_length,
+        int num_operands,
+        unsigned char* success, size_t* new_value_length),
+    char* (*partial_merge)(
+        void*,
+        const char* key, size_t key_length,
+        const char* const* operands_list, const size_t* operands_list_length,
+        int num_operands,
+        unsigned char* success, size_t* new_value_length),
+    void (*delete_value)(
+        void*,
+        const char* value, size_t value_length),
+    const char* (*name)(void*));
+extern void rocksdb_mergeoperator_destroy(rocksdb_mergeoperator_t*);
+
+/* Read options */
+
+extern rocksdb_readoptions_t* rocksdb_readoptions_create();
+extern void rocksdb_readoptions_destroy(rocksdb_readoptions_t*);
+extern void rocksdb_readoptions_set_verify_checksums(
+    rocksdb_readoptions_t*,
+    unsigned char);
+extern void rocksdb_readoptions_set_fill_cache(
+    rocksdb_readoptions_t*, unsigned char);
+extern void rocksdb_readoptions_set_snapshot(
+    rocksdb_readoptions_t*,
+    const rocksdb_snapshot_t*);
+extern void rocksdb_readoptions_set_read_tier(
+    rocksdb_readoptions_t*, int);
+extern void rocksdb_readoptions_set_tailing(
+    rocksdb_readoptions_t*, unsigned char);
+
+/* Write options */
+
+extern rocksdb_writeoptions_t* rocksdb_writeoptions_create();
+extern void rocksdb_writeoptions_destroy(rocksdb_writeoptions_t*);
+extern void rocksdb_writeoptions_set_sync(
+    rocksdb_writeoptions_t*, unsigned char);
+extern void rocksdb_writeoptions_disable_WAL(rocksdb_writeoptions_t* opt, int disable);
+
+/* Flush options */
+
+extern rocksdb_flushoptions_t* rocksdb_flushoptions_create();
+extern void rocksdb_flushoptions_destroy(rocksdb_flushoptions_t*);
+extern void rocksdb_flushoptions_set_wait(
+    rocksdb_flushoptions_t*, unsigned char);
+
+/* Cache */
+
+extern rocksdb_cache_t* rocksdb_cache_create_lru(size_t capacity);
+extern void rocksdb_cache_destroy(rocksdb_cache_t* cache);
+
+/* Env */
+
+extern rocksdb_env_t* rocksdb_create_default_env();
+extern void rocksdb_env_set_background_threads(rocksdb_env_t* env, int n);
+extern void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n);
+extern void rocksdb_env_destroy(rocksdb_env_t*);
+
+/* SliceTransform */
+
+extern rocksdb_slicetransform_t* rocksdb_slicetransform_create(
+    void* state,
+    void (*destructor)(void*),
+    char* (*transform)(
+        void*,
+        const char* key, size_t length,
+        size_t* dst_length),
+    unsigned char (*in_domain)(
+        void*,
+        const char* key, size_t length),
+    unsigned char (*in_range)(
+        void*,
+        const char* key, size_t length),
+    const char* (*name)(void*));
+extern rocksdb_slicetransform_t* rocksdb_slicetransform_create_fixed_prefix(size_t);
+extern void rocksdb_slicetransform_destroy(rocksdb_slicetransform_t*);
+
+/* Universal Compaction options */
+
+enum {
+  rocksdb_similar_size_compaction_stop_style = 0,
+  rocksdb_total_size_compaction_stop_style = 1
+};
+
+extern rocksdb_universal_compaction_options_t* rocksdb_universal_compaction_options_create() ;
+extern void rocksdb_universal_compaction_options_set_size_ratio(
+  rocksdb_universal_compaction_options_t*, int);
+extern void rocksdb_universal_compaction_options_set_min_merge_width(
+  rocksdb_universal_compaction_options_t*, int);
+extern void rocksdb_universal_compaction_options_set_max_merge_width(
+  rocksdb_universal_compaction_options_t*, int);
+extern void rocksdb_universal_compaction_options_set_max_size_amplification_percent(
+  rocksdb_universal_compaction_options_t*, int);
+extern void rocksdb_universal_compaction_options_set_compression_size_percent(
+  rocksdb_universal_compaction_options_t*, int);
+extern void rocksdb_universal_compaction_options_set_stop_style(
+  rocksdb_universal_compaction_options_t*, int);
+extern void rocksdb_universal_compaction_options_destroy(
+  rocksdb_universal_compaction_options_t*);
+
+extern int rocksdb_livefiles_count(
+  const rocksdb_livefiles_t*);
+extern const char* rocksdb_livefiles_name(
+  const rocksdb_livefiles_t*,
+  int index);
+extern int rocksdb_livefiles_level(
+  const rocksdb_livefiles_t*,
+  int index);
+extern size_t rocksdb_livefiles_size(
+  const rocksdb_livefiles_t*,
+  int index);
+extern const char* rocksdb_livefiles_smallestkey(
+  const rocksdb_livefiles_t*,
+  int index,
+  size_t* size);
+extern const char* rocksdb_livefiles_largestkey(
+  const rocksdb_livefiles_t*,
+  int index,
+  size_t* size);
+extern void rocksdb_livefiles_destroy(
+  const rocksdb_livefiles_t*);
+
+#ifdef __cplusplus
+}  /* end extern "C" */
+#endif
+
+#endif  /* STORAGE_ROCKSDB_INCLUDE_C_H_ */
--- a/include/rocksdb/cache.h
+++ b/include/rocksdb/cache.h
@@ -0,0 +1,140 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A Cache is an interface that maps keys to values.  It has internal
+// synchronization and may be safely accessed concurrently from
+// multiple threads.  It may automatically evict entries to make room
+// for new entries.  Values have a specified charge against the cache
+// capacity.  For example, a cache where the values are variable
+// length strings, may use the length of the string as the charge for
+// the string.
+//
+// A builtin cache implementation with a least-recently-used eviction
+// policy is provided.  Clients may use their own implementations if
+// they want something more sophisticated (like scan-resistance, a
+// custom eviction policy, variable cache sizing, etc.)
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_CACHE_H_
+#define STORAGE_ROCKSDB_INCLUDE_CACHE_H_
+
+#include <memory>
+#include <stdint.h>
+#include "rocksdb/slice.h"
+
+namespace rocksdb {
+
+using std::shared_ptr;
+
+class Cache;
+
+// Create a new cache with a fixed size capacity. The cache is sharded
+// to 2^numShardBits shards, by hash of the key. The total capacity
+// is divided and evenly assigned to each shard. Inside each shard,
+// the eviction is done in two passes: first try to free spaces by
+// evicting entries that are among the most least used removeScanCountLimit
+// entries and do not have reference other than by the cache itself, in
+// the least-used order. If not enough space is freed, further free the
+// entries in least used order.
+//
+// The functions without parameter numShardBits and/or removeScanCountLimit
+// use default values. removeScanCountLimit's default value is 0, which
+// means a strict LRU order inside each shard.
+extern shared_ptr<Cache> NewLRUCache(size_t capacity);
+extern shared_ptr<Cache> NewLRUCache(size_t capacity, int numShardBits);
+extern shared_ptr<Cache> NewLRUCache(size_t capacity, int numShardBits,
+                                     int removeScanCountLimit);
+
+class Cache {
+ public:
+  Cache() { }
+
+  // Destroys all existing entries by calling the "deleter"
+  // function that was passed to the constructor.
+  virtual ~Cache();
+
+  // Opaque handle to an entry stored in the cache.
+  struct Handle { };
+
+  // Insert a mapping from key->value into the cache and assign it
+  // the specified charge against the total cache capacity.
+  //
+  // Returns a handle that corresponds to the mapping.  The caller
+  // must call this->Release(handle) when the returned mapping is no
+  // longer needed.
+  //
+  // When the inserted entry is no longer needed, the key and
+  // value will be passed to "deleter".
+  virtual Handle* Insert(const Slice& key, void* value, size_t charge,
+                         void (*deleter)(const Slice& key, void* value)) = 0;
+
+  // If the cache has no mapping for "key", returns nullptr.
+  //
+  // Else return a handle that corresponds to the mapping.  The caller
+  // must call this->Release(handle) when the returned mapping is no
+  // longer needed.
+  virtual Handle* Lookup(const Slice& key) = 0;
+
+  // Release a mapping returned by a previous Lookup().
+  // REQUIRES: handle must not have been released yet.
+  // REQUIRES: handle must have been returned by a method on *this.
+  virtual void Release(Handle* handle) = 0;
+
+  // Return the value encapsulated in a handle returned by a
+  // successful Lookup().
+  // REQUIRES: handle must not have been released yet.
+  // REQUIRES: handle must have been returned by a method on *this.
+  virtual void* Value(Handle* handle) = 0;
+
+  // If the cache contains entry for key, erase it.  Note that the
+  // underlying entry will be kept around until all existing handles
+  // to it have been released.
+  virtual void Erase(const Slice& key) = 0;
+
+  // Return a new numeric id.  May be used by multiple clients who are
+  // sharing the same cache to partition the key space.  Typically the
+  // client will allocate a new id at startup and prepend the id to
+  // its cache keys.
+  virtual uint64_t NewId() = 0;
+
+  // returns the maximum configured capacity of the cache
+  virtual size_t GetCapacity() const = 0;
+
+  // returns the memory size for the entries residing in the cache.
+  virtual size_t GetUsage() const = 0;
+
+  // Call this on shutdown if you want to speed it up. Cache will disown
+  // any underlying data and will not free it on delete. This call will leak
+  // memory - call this only if you're shutting down the process.
+  // Any attempts of using cache after this call will fail terribly.
+  // Always delete the DB object before calling this method!
+  virtual void DisownData() {
+    // default implementation is noop
+  };
+
+  // Apply callback to all entries in the cache
+  // If thread_safe is true, it will also lock the accesses. Otherwise, it will
+  // access the cache without the lock held
+  virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t),
+                                      bool thread_safe) = 0;
+
+ private:
+  void LRU_Remove(Handle* e);
+  void LRU_Append(Handle* e);
+  void Unref(Handle* e);
+
+  struct Rep;
+  Rep* rep_;
+
+  // No copying allowed
+  Cache(const Cache&);
+  void operator=(const Cache&);
+};
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_UTIL_CACHE_H_
--- a/include/rocksdb/compaction_filter.h
+++ b/include/rocksdb/compaction_filter.h
@@ -0,0 +1,198 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
+#define STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
+
+#include <string>
+#include <vector>
+
+namespace rocksdb {
+
+class Slice;
+class SliceTransform;
+
+// Context information of a compaction run
+struct CompactionFilterContext {
+  // Does this compaction run include all data files
+  bool is_full_compaction;
+  // Is this compaction requested by the client (true),
+  // or is it occurring as an automatic compaction process
+  bool is_manual_compaction;
+};
+
+// CompactionFilter allows an application to modify/delete a key-value at
+// the time of compaction.
+
+class CompactionFilter {
+ public:
+  // Context information of a compaction run
+  struct Context {
+    // Does this compaction run include all data files
+    bool is_full_compaction;
+    // Is this compaction requested by the client (true),
+    // or is it occurring as an automatic compaction process
+    bool is_manual_compaction;
+  };
+
+  virtual ~CompactionFilter() {}
+
+  // The compaction process invokes this
+  // method for kv that is being compacted. A return value
+  // of false indicates that the kv should be preserved in the
+  // output of this compaction run and a return value of true
+  // indicates that this key-value should be removed from the
+  // output of the compaction.  The application can inspect
+  // the existing value of the key and make decision based on it.
+  //
+  // When the value is to be preserved, the application has the option
+  // to modify the existing_value and pass it back through new_value.
+  // value_changed needs to be set to true in this case.
+  //
+  // If multithreaded compaction is being used *and* a single CompactionFilter
+  // instance was supplied via Options::compaction_filter, this method may be
+  // called from different threads concurrently.  The application must ensure
+  // that the call is thread-safe.
+  //
+  // If the CompactionFilter was created by a factory, then it will only ever
+  // be used by a single thread that is doing the compaction run, and this
+  // call does not need to be thread-safe.  However, multiple filters may be
+  // in existence and operating concurrently.
+  virtual bool Filter(int level,
+                      const Slice& key,
+                      const Slice& existing_value,
+                      std::string* new_value,
+                      bool* value_changed) const = 0;
+
+  // Returns a name that identifies this compaction filter.
+  // The name will be printed to LOG file on start up for diagnosis.
+  virtual const char* Name() const = 0;
+};
+
+// CompactionFilterV2 that buffers kv pairs sharing the same prefix and let
+// application layer to make individual decisions for all the kv pairs in the
+// buffer.
+class CompactionFilterV2 {
+ public:
+  virtual ~CompactionFilterV2() {}
+
+  // The compaction process invokes this method for all the kv pairs
+  // sharing the same prefix. It is a "roll-up" version of CompactionFilter.
+  //
+  // Each entry in the return vector indicates if the corresponding kv should
+  // be preserved in the output of this compaction run. The application can
+  // inspect the exisitng values of the keys and make decision based on it.
+  //
+  // When a value is to be preserved, the application has the option
+  // to modify the entry in existing_values and pass it back through an entry
+  // in new_values. A corresponding values_changed entry needs to be set to
+  // true in this case. Note that the new_values vector contains only changed
+  // values, i.e. new_values.size() <= values_changed.size().
+  //
+  typedef std::vector<Slice> SliceVector;
+  virtual std::vector<bool> Filter(int level,
+                                   const SliceVector& keys,
+                                   const SliceVector& existing_values,
+                                   std::vector<std::string>* new_values,
+                                   std::vector<bool>* values_changed)
+    const = 0;
+
+  // Returns a name that identifies this compaction filter.
+  // The name will be printed to LOG file on start up for diagnosis.
+  virtual const char* Name() const = 0;
+};
+
+// Each compaction will create a new CompactionFilter allowing the
+// application to know about different campactions
+class CompactionFilterFactory {
+ public:
+  virtual ~CompactionFilterFactory() { }
+
+  virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) = 0;
+
+  // Returns a name that identifies this compaction filter factory.
+  virtual const char* Name() const = 0;
+};
+
+// Default implementaion of CompactionFilterFactory which does not
+// return any filter
+class DefaultCompactionFilterFactory : public CompactionFilterFactory {
+ public:
+  virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override {
+    return std::unique_ptr<CompactionFilter>(nullptr);
+  }
+
+  virtual const char* Name() const override {
+    return "DefaultCompactionFilterFactory";
+  }
+};
+
+// Each compaction will create a new CompactionFilterV2
+//
+// CompactionFilterFactoryV2 enables application to specify a prefix and use
+// CompactionFilterV2 to filter kv-pairs in batches. Each batch contains all
+// the kv-pairs sharing the same prefix.
+//
+// This is useful for applications that require grouping kv-pairs in
+// compaction filter to make a purge/no-purge decision. For example, if the
+// key prefix is user id and the rest of key represents the type of value.
+// This batching filter will come in handy if the application's compaction
+// filter requires knowledge of all types of values for any user id.
+//
+class CompactionFilterFactoryV2 {
+ public:
+  // NOTE: CompactionFilterFactoryV2 will not delete prefix_extractor
+  explicit CompactionFilterFactoryV2(const SliceTransform* prefix_extractor)
+    : prefix_extractor_(prefix_extractor) { }
+
+  virtual ~CompactionFilterFactoryV2() { }
+
+  virtual std::unique_ptr<CompactionFilterV2> CreateCompactionFilterV2(
+    const CompactionFilterContext& context) = 0;
+
+  // Returns a name that identifies this compaction filter factory.
+  virtual const char* Name() const = 0;
+
+  const SliceTransform* GetPrefixExtractor() const {
+    return prefix_extractor_;
+  }
+
+  void SetPrefixExtractor(const SliceTransform* prefix_extractor) {
+    prefix_extractor_ = prefix_extractor;
+  }
+
+ private:
+  // Prefix extractor for compaction filter v2
+  // Keys sharing the same prefix will be buffered internally.
+  // Client can implement a Filter callback function to operate on the buffer
+  const SliceTransform* prefix_extractor_;
+};
+
+// Default implementaion of CompactionFilterFactoryV2 which does not
+// return any filter
+class DefaultCompactionFilterFactoryV2 : public CompactionFilterFactoryV2 {
+ public:
+  explicit DefaultCompactionFilterFactoryV2()
+      : CompactionFilterFactoryV2(nullptr) { }
+
+  virtual std::unique_ptr<CompactionFilterV2>
+  CreateCompactionFilterV2(
+      const CompactionFilterContext& context) override {
+    return std::unique_ptr<CompactionFilterV2>(nullptr);
+  }
+
+  virtual const char* Name() const override {
+    return "DefaultCompactionFilterFactoryV2";
+  }
+};
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
--- a/include/rocksdb/comparator.h
+++ b/include/rocksdb/comparator.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_
+#define STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_
+
+#include <string>
+
+namespace rocksdb {
+
+class Slice;
+
+// A Comparator object provides a total order across slices that are
+// used as keys in an sstable or a database.  A Comparator implementation
+// must be thread-safe since rocksdb may invoke its methods concurrently
+// from multiple threads.
+class Comparator {
+ public:
+  virtual ~Comparator();
+
+  // Three-way comparison.  Returns value:
+  //   < 0 iff "a" < "b",
+  //   == 0 iff "a" == "b",
+  //   > 0 iff "a" > "b"
+  virtual int Compare(const Slice& a, const Slice& b) const = 0;
+
+  // The name of the comparator.  Used to check for comparator
+  // mismatches (i.e., a DB created with one comparator is
+  // accessed using a different comparator.
+  //
+  // The client of this package should switch to a new name whenever
+  // the comparator implementation changes in a way that will cause
+  // the relative ordering of any two keys to change.
+  //
+  // Names starting with "rocksdb." are reserved and should not be used
+  // by any clients of this package.
+  virtual const char* Name() const = 0;
+
+  // Advanced functions: these are used to reduce the space requirements
+  // for internal data structures like index blocks.
+
+  // If *start < limit, changes *start to a short string in [start,limit).
+  // Simple comparator implementations may return with *start unchanged,
+  // i.e., an implementation of this method that does nothing is correct.
+  virtual void FindShortestSeparator(
+      std::string* start,
+      const Slice& limit) const = 0;
+
+  // Changes *key to a short string >= *key.
+  // Simple comparator implementations may return with *key unchanged,
+  // i.e., an implementation of this method that does nothing is correct.
+  virtual void FindShortSuccessor(std::string* key) const = 0;
+};
+
+// Return a builtin comparator that uses lexicographic byte-wise
+// ordering.  The result remains the property of this module and
+// must not be deleted.
+extern const Comparator* BytewiseComparator();
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_
--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@@ -0,0 +1,495 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_DB_H_
+#define STORAGE_ROCKSDB_INCLUDE_DB_H_
+
+#include <stdint.h>
+#include <stdio.h>
+#include <memory>
+#include <vector>
+#include <string>
+#include <unordered_map>
+#include "rocksdb/version.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/types.h"
+#include "rocksdb/transaction_log.h"
+
+namespace rocksdb {
+
+using std::unique_ptr;
+
+class ColumnFamilyHandle {
+ public:
+  virtual ~ColumnFamilyHandle() {}
+};
+extern const std::string kDefaultColumnFamilyName;
+
+struct ColumnFamilyDescriptor {
+  std::string name;
+  ColumnFamilyOptions options;
+  ColumnFamilyDescriptor()
+      : name(kDefaultColumnFamilyName), options(ColumnFamilyOptions()) {}
+  ColumnFamilyDescriptor(const std::string& _name,
+                         const ColumnFamilyOptions& _options)
+      : name(_name), options(_options) {}
+};
+
+static const int kMajorVersion = __ROCKSDB_MAJOR__;
+static const int kMinorVersion = __ROCKSDB_MINOR__;
+
+struct Options;
+struct ReadOptions;
+struct WriteOptions;
+struct FlushOptions;
+struct TableProperties;
+class WriteBatch;
+class Env;
+
+// Metadata associated with each SST file.
+struct LiveFileMetaData {
+  std::string column_family_name;  // Name of the column family
+  std::string name;                // Name of the file
+  int level;               // Level at which this file resides.
+  size_t size;             // File size in bytes.
+  std::string smallestkey; // Smallest user defined key in the file.
+  std::string largestkey;  // Largest user defined key in the file.
+  SequenceNumber smallest_seqno; // smallest seqno in file
+  SequenceNumber largest_seqno;  // largest seqno in file
+};
+
+// Abstract handle to particular state of a DB.
+// A Snapshot is an immutable object and can therefore be safely
+// accessed from multiple threads without any external synchronization.
+class Snapshot {
+ protected:
+  virtual ~Snapshot();
+};
+
+// A range of keys
+struct Range {
+  Slice start;          // Included in the range
+  Slice limit;          // Not included in the range
+
+  Range() { }
+  Range(const Slice& s, const Slice& l) : start(s), limit(l) { }
+};
+
+// A collections of table properties objects, where
+//  key: is the table's file name.
+//  value: the table properties object of the given table.
+typedef std::unordered_map<std::string, std::shared_ptr<const TableProperties>>
+    TablePropertiesCollection;
+
+// A DB is a persistent ordered map from keys to values.
+// A DB is safe for concurrent access from multiple threads without
+// any external synchronization.
+class DB {
+ public:
+  // Open the database with the specified "name".
+  // Stores a pointer to a heap-allocated database in *dbptr and returns
+  // OK on success.
+  // Stores nullptr in *dbptr and returns a non-OK status on error.
+  // Caller should delete *dbptr when it is no longer needed.
+  static Status Open(const Options& options,
+                     const std::string& name,
+                     DB** dbptr);
+
+  // Open the database for read only. All DB interfaces
+  // that modify data, like put/delete, will return error.
+  // If the db is opened in read only mode, then no compactions
+  // will happen.
+  static Status OpenForReadOnly(const Options& options,
+      const std::string& name, DB** dbptr,
+      bool error_if_log_file_exist = false);
+
+  // Open the database for read only with column families. When opening DB with
+  // read only, you can specify only a subset of column families in the
+  // database that should be opened. However, you always need to specify default
+  // column family. The default column family name is 'default' and it's stored
+  // in rocksdb::kDefaultColumnFamilyName
+  static Status OpenForReadOnly(
+      const DBOptions& db_options, const std::string& name,
+      const std::vector<ColumnFamilyDescriptor>& column_families,
+      std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+      bool error_if_log_file_exist = false);
+
+  // Open DB with column families.
+  // db_options specify database specific options
+  // column_families is the vector of all column families in the databse,
+  // containing column family name and options. You need to open ALL column
+  // families in the database. To get the list of column families, you can use
+  // ListColumnFamilies(). Also, you can open only a subset of column families
+  // for read-only access.
+  // The default column family name is 'default' and it's stored
+  // in rocksdb::kDefaultColumnFamilyName.
+  // If everything is OK, handles will on return be the same size
+  // as column_families --- handles[i] will be a handle that you
+  // will use to operate on column family column_family[i]
+  static Status Open(const DBOptions& db_options, const std::string& name,
+                     const std::vector<ColumnFamilyDescriptor>& column_families,
+                     std::vector<ColumnFamilyHandle*>* handles, DB** dbptr);
+
+  // ListColumnFamilies will open the DB specified by argument name
+  // and return the list of all column families in that DB
+  // through column_families argument. The ordering of
+  // column families in column_families is unspecified.
+  static Status ListColumnFamilies(const DBOptions& db_options,
+                                   const std::string& name,
+                                   std::vector<std::string>* column_families);
+
+  DB() { }
+  virtual ~DB();
+
+  // Create a column_family and return the handle of column family
+  // through the argument handle.
+  virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
+                                    const std::string& column_family_name,
+                                    ColumnFamilyHandle** handle);
+
+  // Drop a column family specified by column_family handle. This call
+  // only records a drop record in the manifest and prevents the column
+  // family from flushing and compacting.
+  virtual Status DropColumnFamily(ColumnFamilyHandle* column_family);
+
+  // Set the database entry for "key" to "value".
+  // Returns OK on success, and a non-OK status on error.
+  // Note: consider setting options.sync = true.
+  virtual Status Put(const WriteOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& value) = 0;
+  virtual Status Put(const WriteOptions& options, const Slice& key,
+                     const Slice& value) {
+    return Put(options, DefaultColumnFamily(), key, value);
+  }
+
+  // Remove the database entry (if any) for "key".  Returns OK on
+  // success, and a non-OK status on error.  It is not an error if "key"
+  // did not exist in the database.
+  // Note: consider setting options.sync = true.
+  virtual Status Delete(const WriteOptions& options,
+                        ColumnFamilyHandle* column_family,
+                        const Slice& key) = 0;
+  virtual Status Delete(const WriteOptions& options, const Slice& key) {
+    return Delete(options, DefaultColumnFamily(), key);
+  }
+
+  // Merge the database entry for "key" with "value".  Returns OK on success,
+  // and a non-OK status on error. The semantics of this operation is
+  // determined by the user provided merge_operator when opening DB.
+  // Note: consider setting options.sync = true.
+  virtual Status Merge(const WriteOptions& options,
+                       ColumnFamilyHandle* column_family, const Slice& key,
+                       const Slice& value) = 0;
+  virtual Status Merge(const WriteOptions& options, const Slice& key,
+                       const Slice& value) {
+    return Merge(options, DefaultColumnFamily(), key, value);
+  }
+
+  // Apply the specified updates to the database.
+  // Returns OK on success, non-OK on failure.
+  // Note: consider setting options.sync = true.
+  virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0;
+
+  // If the database contains an entry for "key" store the
+  // corresponding value in *value and return OK.
+  //
+  // If there is no entry for "key" leave *value unchanged and return
+  // a status for which Status::IsNotFound() returns true.
+  //
+  // May return some other Status on an error.
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     std::string* value) = 0;
+  virtual Status Get(const ReadOptions& options, const Slice& key, std::string* value) {
+    return Get(options, DefaultColumnFamily(), key, value);
+  }
+
+  // If keys[i] does not exist in the database, then the i'th returned
+  // status will be one for which Status::IsNotFound() is true, and
+  // (*values)[i] will be set to some arbitrary value (often ""). Otherwise,
+  // the i'th returned status will have Status::ok() true, and (*values)[i]
+  // will store the value associated with keys[i].
+  //
+  // (*values) will always be resized to be the same size as (keys).
+  // Similarly, the number of returned statuses will be the number of keys.
+  // Note: keys will not be "de-duplicated". Duplicate keys will return
+  // duplicate values in order.
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys, std::vector<std::string>* values) = 0;
+  virtual std::vector<Status> MultiGet(const ReadOptions& options,
+                                       const std::vector<Slice>& keys,
+                                       std::vector<std::string>* values) {
+    return MultiGet(options, std::vector<ColumnFamilyHandle*>(
+                                 keys.size(), DefaultColumnFamily()),
+                    keys, values);
+  }
+
+  // If the key definitely does not exist in the database, then this method
+  // returns false, else true. If the caller wants to obtain value when the key
+  // is found in memory, a bool for 'value_found' must be passed. 'value_found'
+  // will be true on return if value has been set properly.
+  // This check is potentially lighter-weight than invoking DB::Get(). One way
+  // to make this lighter weight is to avoid doing any IOs.
+  // Default implementation here returns true and sets 'value_found' to false
+  virtual bool KeyMayExist(const ReadOptions& options,
+                           ColumnFamilyHandle* column_family, const Slice& key,
+                           std::string* value, bool* value_found = nullptr) {
+    if (value_found != nullptr) {
+      *value_found = false;
+    }
+    return true;
+  }
+  virtual bool KeyMayExist(const ReadOptions& options, const Slice& key,
+                           std::string* value, bool* value_found = nullptr) {
+    return KeyMayExist(options, DefaultColumnFamily(), key, value, value_found);
+  }
+
+  // Return a heap-allocated iterator over the contents of the database.
+  // The result of NewIterator() is initially invalid (caller must
+  // call one of the Seek methods on the iterator before using it).
+  //
+  // Caller should delete the iterator when it is no longer needed.
+  // The returned iterator should be deleted before this db is deleted.
+  virtual Iterator* NewIterator(const ReadOptions& options,
+                                ColumnFamilyHandle* column_family) = 0;
+  virtual Iterator* NewIterator(const ReadOptions& options) {
+    return NewIterator(options, DefaultColumnFamily());
+  }
+  // Returns iterators from a consistent database state across multiple
+  // column families. Iterators are heap allocated and need to be deleted
+  // before the db is deleted
+  virtual Status NewIterators(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_families,
+      std::vector<Iterator*>* iterators) = 0;
+
+  // Return a handle to the current DB state.  Iterators created with
+  // this handle will all observe a stable snapshot of the current DB
+  // state.  The caller must call ReleaseSnapshot(result) when the
+  // snapshot is no longer needed.
+  //
+  // nullptr will be returned if the DB fails to take a snapshot or does
+  // not support snapshot.
+  virtual const Snapshot* GetSnapshot() = 0;
+
+  // Release a previously acquired snapshot.  The caller must not
+  // use "snapshot" after this call.
+  virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0;
+
+  // DB implementations can export properties about their state
+  // via this method.  If "property" is a valid property understood by this
+  // DB implementation, fills "*value" with its current value and returns
+  // true.  Otherwise returns false.
+  //
+  //
+  // Valid property names include:
+  //
+  //  "rocksdb.num-files-at-level<N>" - return the number of files at level <N>,
+  //     where <N> is an ASCII representation of a level number (e.g. "0").
+  //  "rocksdb.stats" - returns a multi-line string that describes statistics
+  //     about the internal operation of the DB.
+  //  "rocksdb.sstables" - returns a multi-line string that describes all
+  //     of the sstables that make up the db contents.
+  virtual bool GetProperty(ColumnFamilyHandle* column_family,
+                           const Slice& property, std::string* value) = 0;
+  virtual bool GetProperty(const Slice& property, std::string* value) {
+    return GetProperty(DefaultColumnFamily(), property, value);
+  }
+
+  // For each i in [0,n-1], store in "sizes[i]", the approximate
+  // file system space used by keys in "[range[i].start .. range[i].limit)".
+  //
+  // Note that the returned sizes measure file system space usage, so
+  // if the user data compresses by a factor of ten, the returned
+  // sizes will be one-tenth the size of the corresponding user data size.
+  //
+  // The results may not include the sizes of recently written data.
+  virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
+                                   const Range* range, int n,
+                                   uint64_t* sizes) = 0;
+  virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes) {
+    GetApproximateSizes(DefaultColumnFamily(), range, n, sizes);
+  }
+
+  // Compact the underlying storage for the key range [*begin,*end].
+  // The actual compaction interval might be superset of [*begin, *end].
+  // In particular, deleted and overwritten versions are discarded,
+  // and the data is rearranged to reduce the cost of operations
+  // needed to access the data.  This operation should typically only
+  // be invoked by users who understand the underlying implementation.
+  //
+  // begin==nullptr is treated as a key before all keys in the database.
+  // end==nullptr is treated as a key after all keys in the database.
+  // Therefore the following call will compact the entire database:
+  //    db->CompactRange(nullptr, nullptr);
+  // Note that after the entire database is compacted, all data are pushed
+  // down to the last level containing any data. If the total data size
+  // after compaction is reduced, that level might not be appropriate for
+  // hosting all the files. In this case, client could set reduce_level
+  // to true, to move the files back to the minimum level capable of holding
+  // the data set or a given level (specified by non-negative target_level).
+  virtual Status CompactRange(ColumnFamilyHandle* column_family,
+                              const Slice* begin, const Slice* end,
+                              bool reduce_level = false,
+                              int target_level = -1) = 0;
+  virtual Status CompactRange(const Slice* begin, const Slice* end,
+                              bool reduce_level = false,
+                              int target_level = -1) {
+    return CompactRange(DefaultColumnFamily(), begin, end, reduce_level,
+                        target_level);
+  }
+
+  // Number of levels used for this DB.
+  virtual int NumberLevels(ColumnFamilyHandle* column_family) = 0;
+  virtual int NumberLevels() { return NumberLevels(DefaultColumnFamily()); }
+
+  // Maximum level to which a new compacted memtable is pushed if it
+  // does not create overlap.
+  virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) = 0;
+  virtual int MaxMemCompactionLevel() {
+    return MaxMemCompactionLevel(DefaultColumnFamily());
+  }
+
+  // Number of files in level-0 that would stop writes.
+  virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) = 0;
+  virtual int Level0StopWriteTrigger() {
+    return Level0StopWriteTrigger(DefaultColumnFamily());
+  }
+
+  // Get DB name -- the exact same name that was provided as an argument to
+  // DB::Open()
+  virtual const std::string& GetName() const = 0;
+
+  // Get Env object from the DB
+  virtual Env* GetEnv() const = 0;
+
+  // Get DB Options that we use
+  virtual const Options& GetOptions(ColumnFamilyHandle* column_family)
+      const = 0;
+  virtual const Options& GetOptions() const {
+    return GetOptions(DefaultColumnFamily());
+  }
+
+  // Flush all mem-table data.
+  virtual Status Flush(const FlushOptions& options,
+                       ColumnFamilyHandle* column_family) = 0;
+  virtual Status Flush(const FlushOptions& options) {
+    return Flush(options, DefaultColumnFamily());
+  }
+
+  // The sequence number of the most recent transaction.
+  virtual SequenceNumber GetLatestSequenceNumber() const = 0;
+
+#ifndef ROCKSDB_LITE
+
+  // Prevent file deletions. Compactions will continue to occur,
+  // but no obsolete files will be deleted. Calling this multiple
+  // times have the same effect as calling it once.
+  virtual Status DisableFileDeletions() = 0;
+
+  // Allow compactions to delete obsolete files.
+  // If force == true, the call to EnableFileDeletions() will guarantee that
+  // file deletions are enabled after the call, even if DisableFileDeletions()
+  // was called multiple times before.
+  // If force == false, EnableFileDeletions will only enable file deletion
+  // after it's been called at least as many times as DisableFileDeletions(),
+  // enabling the two methods to be called by two threads concurrently without
+  // synchronization -- i.e., file deletions will be enabled only after both
+  // threads call EnableFileDeletions()
+  virtual Status EnableFileDeletions(bool force = true) = 0;
+
+  // GetLiveFiles followed by GetSortedWalFiles can generate a lossless backup
+
+  // THIS METHOD IS DEPRECATED. Use the GetLiveFilesMetaData to get more
+  // detailed information on the live files.
+  // Retrieve the list of all files in the database. The files are
+  // relative to the dbname and are not absolute paths. The valid size of the
+  // manifest file is returned in manifest_file_size. The manifest file is an
+  // ever growing file, but only the portion specified by manifest_file_size is
+  // valid for this snapshot.
+  // Setting flush_memtable to true does Flush before recording the live files.
+  // Setting flush_memtable to false is useful when we don't want to wait for
+  // flush which may have to wait for compaction to complete taking an
+  // indeterminate time.
+  //
+  // In case you have multiple column families, even if flush_memtable is true,
+  // you still need to call GetSortedWalFiles after GetLiveFiles to compensate
+  // for new data that arrived to already-flushed column families while other
+  // column families were flushing
+  virtual Status GetLiveFiles(std::vector<std::string>&,
+                              uint64_t* manifest_file_size,
+                              bool flush_memtable = true) = 0;
+
+  // Retrieve the sorted list of all wal files with earliest file first
+  virtual Status GetSortedWalFiles(VectorLogPtr& files) = 0;
+
+  // Sets iter to an iterator that is positioned at a write-batch containing
+  // seq_number. If the sequence number is non existent, it returns an iterator
+  // at the first available seq_no after the requested seq_no
+  // Returns Status::OK if iterator is valid
+  // Must set WAL_ttl_seconds or WAL_size_limit_MB to large values to
+  // use this api, else the WAL files will get
+  // cleared aggressively and the iterator might keep getting invalid before
+  // an update is read.
+  virtual Status GetUpdatesSince(
+      SequenceNumber seq_number, unique_ptr<TransactionLogIterator>* iter,
+      const TransactionLogIterator::ReadOptions&
+          read_options = TransactionLogIterator::ReadOptions()) = 0;
+
+  // Delete the file name from the db directory and update the internal state to
+  // reflect that. Supports deletion of sst and log files only. 'name' must be
+  // path relative to the db directory. eg. 000001.sst, /archive/000003.log
+  virtual Status DeleteFile(std::string name) = 0;
+
+  // Returns a list of all table files with their level, start key
+  // and end key
+  virtual void GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {}
+
+#endif  // ROCKSDB_LITE
+
+  // Sets the globally unique ID created at database creation time by invoking
+  // Env::GenerateUniqueId(), in identity. Returns Status::OK if identity could
+  // be set properly
+  virtual Status GetDbIdentity(std::string& identity) = 0;
+
+  // Returns default column family handle
+  virtual ColumnFamilyHandle* DefaultColumnFamily() const = 0;
+
+#ifndef ROCKSDB_LITE
+  virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
+                                          TablePropertiesCollection* props) = 0;
+  virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) {
+    return GetPropertiesOfAllTables(DefaultColumnFamily(), props);
+  }
+#endif  // ROCKSDB_LITE
+
+ private:
+  // No copying allowed
+  DB(const DB&);
+  void operator=(const DB&);
+};
+
+// Destroy the contents of the specified database.
+// Be very careful using this method.
+Status DestroyDB(const std::string& name, const Options& options);
+
+#ifndef ROCKSDB_LITE
+// If a DB cannot be opened, you may attempt to call this method to
+// resurrect as much of the contents of the database as possible.
+// Some data may be lost, so be careful when calling this function
+// on a database that contains important information.
+Status RepairDB(const std::string& dbname, const Options& options);
+#endif
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_DB_H_
--- a/include/rocksdb/env.h
+++ b/include/rocksdb/env.h
@@ -0,0 +1,772 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// An Env is an interface used by the rocksdb implementation to access
+// operating system functionality like the filesystem etc.  Callers
+// may wish to provide a custom Env object when opening a database to
+// get fine gain control; e.g., to rate limit file system operations.
+//
+// All Env implementations are safe for concurrent access from
+// multiple threads without any external synchronization.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_ENV_H_
+#define STORAGE_ROCKSDB_INCLUDE_ENV_H_
+
+#include <cstdarg>
+#include <string>
+#include <memory>
+#include <vector>
+#include <stdint.h>
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+class FileLock;
+class Logger;
+class RandomAccessFile;
+class SequentialFile;
+class Slice;
+class WritableFile;
+class RandomRWFile;
+class Directory;
+struct DBOptions;
+
+using std::unique_ptr;
+using std::shared_ptr;
+
+
+// Options while opening a file to read/write
+struct EnvOptions {
+
+  // construct with default Options
+  EnvOptions();
+
+  // construct from Options
+  explicit EnvOptions(const DBOptions& options);
+
+  // If true, then allow caching of data in environment buffers
+  bool use_os_buffer = true;
+
+   // If true, then use mmap to read data
+  bool use_mmap_reads = false;
+
+   // If true, then use mmap to write data
+  bool use_mmap_writes = true;
+
+  // If true, set the FD_CLOEXEC on open fd.
+  bool set_fd_cloexec = true;
+
+  // Allows OS to incrementally sync files to disk while they are being
+  // written, in the background. Issue one request for every bytes_per_sync
+  // written. 0 turns it off.
+  // Default: 0
+  uint64_t bytes_per_sync = 0;
+
+  // If true, we will preallocate the file with FALLOC_FL_KEEP_SIZE flag, which
+  // means that file size won't change as part of preallocation.
+  // If false, preallocation will also change the file size. This option will
+  // improve the performance in workloads where you sync the data on every
+  // write. By default, we set it to true for MANIFEST writes and false for
+  // WAL writes
+  bool fallocate_with_keep_size = true;
+};
+
+class Env {
+ public:
+  Env() { }
+  virtual ~Env();
+
+  // Return a default environment suitable for the current operating
+  // system.  Sophisticated users may wish to provide their own Env
+  // implementation instead of relying on this default environment.
+  //
+  // The result of Default() belongs to rocksdb and must never be deleted.
+  static Env* Default();
+
+  // Create a brand new sequentially-readable file with the specified name.
+  // On success, stores a pointer to the new file in *result and returns OK.
+  // On failure stores nullptr in *result and returns non-OK.  If the file does
+  // not exist, returns a non-OK status.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  virtual Status NewSequentialFile(const std::string& fname,
+                                   unique_ptr<SequentialFile>* result,
+                                   const EnvOptions& options)
+                                   = 0;
+
+  // Create a brand new random access read-only file with the
+  // specified name.  On success, stores a pointer to the new file in
+  // *result and returns OK.  On failure stores nullptr in *result and
+  // returns non-OK.  If the file does not exist, returns a non-OK
+  // status.
+  //
+  // The returned file may be concurrently accessed by multiple threads.
+  virtual Status NewRandomAccessFile(const std::string& fname,
+                                     unique_ptr<RandomAccessFile>* result,
+                                     const EnvOptions& options)
+                                     = 0;
+
+  // Create an object that writes to a new file with the specified
+  // name.  Deletes any existing file with the same name and creates a
+  // new file.  On success, stores a pointer to the new file in
+  // *result and returns OK.  On failure stores nullptr in *result and
+  // returns non-OK.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  virtual Status NewWritableFile(const std::string& fname,
+                                 unique_ptr<WritableFile>* result,
+                                 const EnvOptions& options) = 0;
+
+  // Create an object that both reads and writes to a file on
+  // specified offsets (random access). If file already exists,
+  // does not overwrite it. On success, stores a pointer to the
+  // new file in *result and returns OK. On failure stores nullptr
+  // in *result and returns non-OK.
+  virtual Status NewRandomRWFile(const std::string& fname,
+                                 unique_ptr<RandomRWFile>* result,
+                                 const EnvOptions& options) = 0;
+
+  // Create an object that represents a directory. Will fail if directory
+  // doesn't exist. If the directory exists, it will open the directory
+  // and create a new Directory object.
+  //
+  // On success, stores a pointer to the new Directory in
+  // *result and returns OK. On failure stores nullptr in *result and
+  // returns non-OK.
+  virtual Status NewDirectory(const std::string& name,
+                              unique_ptr<Directory>* result) = 0;
+
+  // Returns true iff the named file exists.
+  virtual bool FileExists(const std::string& fname) = 0;
+
+  // Store in *result the names of the children of the specified directory.
+  // The names are relative to "dir".
+  // Original contents of *results are dropped.
+  virtual Status GetChildren(const std::string& dir,
+                             std::vector<std::string>* result) = 0;
+
+  // Delete the named file.
+  virtual Status DeleteFile(const std::string& fname) = 0;
+
+  // Create the specified directory. Returns error if directory exists.
+  virtual Status CreateDir(const std::string& dirname) = 0;
+
+  // Creates directory if missing. Return Ok if it exists, or successful in
+  // Creating.
+  virtual Status CreateDirIfMissing(const std::string& dirname) = 0;
+
+  // Delete the specified directory.
+  virtual Status DeleteDir(const std::string& dirname) = 0;
+
+  // Store the size of fname in *file_size.
+  virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) = 0;
+
+  // Store the last modification time of fname in *file_mtime.
+  virtual Status GetFileModificationTime(const std::string& fname,
+                                         uint64_t* file_mtime) = 0;
+  // Rename file src to target.
+  virtual Status RenameFile(const std::string& src,
+                            const std::string& target) = 0;
+
+  // Lock the specified file.  Used to prevent concurrent access to
+  // the same db by multiple processes.  On failure, stores nullptr in
+  // *lock and returns non-OK.
+  //
+  // On success, stores a pointer to the object that represents the
+  // acquired lock in *lock and returns OK.  The caller should call
+  // UnlockFile(*lock) to release the lock.  If the process exits,
+  // the lock will be automatically released.
+  //
+  // If somebody else already holds the lock, finishes immediately
+  // with a failure.  I.e., this call does not wait for existing locks
+  // to go away.
+  //
+  // May create the named file if it does not already exist.
+  virtual Status LockFile(const std::string& fname, FileLock** lock) = 0;
+
+  // Release the lock acquired by a previous successful call to LockFile.
+  // REQUIRES: lock was returned by a successful LockFile() call
+  // REQUIRES: lock has not already been unlocked.
+  virtual Status UnlockFile(FileLock* lock) = 0;
+
+  enum Priority { LOW, HIGH, TOTAL };
+
+  // Arrange to run "(*function)(arg)" once in a background thread, in
+  // the thread pool specified by pri. By default, jobs go to the 'LOW'
+  // priority thread pool.
+
+  // "function" may run in an unspecified thread.  Multiple functions
+  // added to the same Env may run concurrently in different threads.
+  // I.e., the caller may not assume that background work items are
+  // serialized.
+  virtual void Schedule(
+      void (*function)(void* arg),
+      void* arg,
+      Priority pri = LOW) = 0;
+
+  // Start a new thread, invoking "function(arg)" within the new thread.
+  // When "function(arg)" returns, the thread will be destroyed.
+  virtual void StartThread(void (*function)(void* arg), void* arg) = 0;
+
+  // Wait for all threads started by StartThread to terminate.
+  virtual void WaitForJoin() {}
+
+  // Get thread pool queue length for specific thrad pool.
+  virtual unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const {
+    return 0;
+  }
+
+  // *path is set to a temporary directory that can be used for testing. It may
+  // or many not have just been created. The directory may or may not differ
+  // between runs of the same process, but subsequent calls will return the
+  // same directory.
+  virtual Status GetTestDirectory(std::string* path) = 0;
+
+  // Create and return a log file for storing informational messages.
+  virtual Status NewLogger(const std::string& fname,
+                           shared_ptr<Logger>* result) = 0;
+
+  // Returns the number of micro-seconds since some fixed point in time. Only
+  // useful for computing deltas of time.
+  virtual uint64_t NowMicros() = 0;
+
+  // Returns the number of nano-seconds since some fixed point in time. Only
+  // useful for computing deltas of time in one run.
+  // Default implementation simply relies on NowMicros
+  virtual uint64_t NowNanos() {
+    return NowMicros() * 1000;
+  }
+
+  // Sleep/delay the thread for the perscribed number of micro-seconds.
+  virtual void SleepForMicroseconds(int micros) = 0;
+
+  // Get the current host name.
+  virtual Status GetHostName(char* name, uint64_t len) = 0;
+
+  // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC).
+  virtual Status GetCurrentTime(int64_t* unix_time) = 0;
+
+  // Get full directory name for this db.
+  virtual Status GetAbsolutePath(const std::string& db_path,
+      std::string* output_path) = 0;
+
+  // The number of background worker threads of a specific thread pool
+  // for this environment. 'LOW' is the default pool.
+  // default number: 1
+  virtual void SetBackgroundThreads(int number, Priority pri = LOW) = 0;
+
+  // Converts seconds-since-Jan-01-1970 to a printable string
+  virtual std::string TimeToString(uint64_t time) = 0;
+
+  // Generates a unique id that can be used to identify a db
+  virtual std::string GenerateUniqueId();
+
+  // OptimizeForLogWrite will create a new EnvOptions object that is a copy of
+  // the EnvOptions in the parameters, but is optimized for writing log files.
+  // Default implementation returns the copy of the same object.
+  virtual EnvOptions OptimizeForLogWrite(const EnvOptions& env_options) const;
+  // OptimizeForManifestWrite will create a new EnvOptions object that is a copy
+  // of the EnvOptions in the parameters, but is optimized for writing manifest
+  // files. Default implementation returns the copy of the same object.
+  virtual EnvOptions OptimizeForManifestWrite(const EnvOptions& env_options)
+      const;
+
+ private:
+  // No copying allowed
+  Env(const Env&);
+  void operator=(const Env&);
+};
+
+// A file abstraction for reading sequentially through a file
+class SequentialFile {
+ public:
+  SequentialFile() { }
+  virtual ~SequentialFile();
+
+  // Read up to "n" bytes from the file.  "scratch[0..n-1]" may be
+  // written by this routine.  Sets "*result" to the data that was
+  // read (including if fewer than "n" bytes were successfully read).
+  // May set "*result" to point at data in "scratch[0..n-1]", so
+  // "scratch[0..n-1]" must be live when "*result" is used.
+  // If an error was encountered, returns a non-OK status.
+  //
+  // REQUIRES: External synchronization
+  virtual Status Read(size_t n, Slice* result, char* scratch) = 0;
+
+  // Skip "n" bytes from the file. This is guaranteed to be no
+  // slower that reading the same data, but may be faster.
+  //
+  // If end of file is reached, skipping will stop at the end of the
+  // file, and Skip will return OK.
+  //
+  // REQUIRES: External synchronization
+  virtual Status Skip(uint64_t n) = 0;
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  virtual Status InvalidateCache(size_t offset, size_t length) {
+    return Status::NotSupported("InvalidateCache not supported.");
+  }
+};
+
+// A file abstraction for randomly reading the contents of a file.
+class RandomAccessFile {
+ public:
+  RandomAccessFile() { }
+  virtual ~RandomAccessFile();
+
+  // Read up to "n" bytes from the file starting at "offset".
+  // "scratch[0..n-1]" may be written by this routine.  Sets "*result"
+  // to the data that was read (including if fewer than "n" bytes were
+  // successfully read).  May set "*result" to point at data in
+  // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
+  // "*result" is used.  If an error was encountered, returns a non-OK
+  // status.
+  //
+  // Safe for concurrent use by multiple threads.
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const = 0;
+
+  // Tries to get an unique ID for this file that will be the same each time
+  // the file is opened (and will stay the same while the file is open).
+  // Furthermore, it tries to make this ID at most "max_size" bytes. If such an
+  // ID can be created this function returns the length of the ID and places it
+  // in "id"; otherwise, this function returns 0, in which case "id"
+  // may not have been modified.
+  //
+  // This function guarantees, for IDs from a given environment, two unique ids
+  // cannot be made equal to eachother by adding arbitrary bytes to one of
+  // them. That is, no unique ID is the prefix of another.
+  //
+  // This function guarantees that the returned ID will not be interpretable as
+  // a single varint.
+  //
+  // Note: these IDs are only valid for the duration of the process.
+  virtual size_t GetUniqueId(char* id, size_t max_size) const {
+    return 0; // Default implementation to prevent issues with backwards
+              // compatibility.
+  };
+
+
+  enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED };
+
+  virtual void Hint(AccessPattern pattern) {}
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  virtual Status InvalidateCache(size_t offset, size_t length) {
+    return Status::NotSupported("InvalidateCache not supported.");
+  }
+};
+
+// A file abstraction for sequential writing.  The implementation
+// must provide buffering since callers may append small fragments
+// at a time to the file.
+class WritableFile {
+ public:
+  WritableFile() : last_preallocated_block_(0), preallocation_block_size_ (0) {
+  }
+  virtual ~WritableFile();
+
+  virtual Status Append(const Slice& data) = 0;
+  virtual Status Close() = 0;
+  virtual Status Flush() = 0;
+  virtual Status Sync() = 0; // sync data
+
+  /*
+   * Sync data and/or metadata as well.
+   * By default, sync only data.
+   * Override this method for environments where we need to sync
+   * metadata as well.
+   */
+  virtual Status Fsync() {
+    return Sync();
+  }
+
+  /*
+   * Get the size of valid data in the file.
+   */
+  virtual uint64_t GetFileSize() {
+    return 0;
+  }
+
+  /*
+   * Get and set the default pre-allocation block size for writes to
+   * this file.  If non-zero, then Allocate will be used to extend the
+   * underlying storage of a file (generally via fallocate) if the Env
+   * instance supports it.
+   */
+  void SetPreallocationBlockSize(size_t size) {
+    preallocation_block_size_ = size;
+  }
+
+  virtual void GetPreallocationStatus(size_t* block_size,
+                                      size_t* last_allocated_block) {
+    *last_allocated_block = last_preallocated_block_;
+    *block_size = preallocation_block_size_;
+  }
+
+  // For documentation, refer to RandomAccessFile::GetUniqueId()
+  virtual size_t GetUniqueId(char* id, size_t max_size) const {
+    return 0; // Default implementation to prevent issues with backwards
+  }
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  // This call has no effect on dirty pages in the cache.
+  virtual Status InvalidateCache(size_t offset, size_t length) {
+    return Status::NotSupported("InvalidateCache not supported.");
+  }
+
+ protected:
+  // PrepareWrite performs any necessary preparation for a write
+  // before the write actually occurs.  This allows for pre-allocation
+  // of space on devices where it can result in less file
+  // fragmentation and/or less waste from over-zealous filesystem
+  // pre-allocation.
+  void PrepareWrite(size_t offset, size_t len) {
+    if (preallocation_block_size_ == 0) {
+      return;
+    }
+    // If this write would cross one or more preallocation blocks,
+    // determine what the last preallocation block necesessary to
+    // cover this write would be and Allocate to that point.
+    const auto block_size = preallocation_block_size_;
+    size_t new_last_preallocated_block =
+      (offset + len + block_size - 1) / block_size;
+    if (new_last_preallocated_block > last_preallocated_block_) {
+      size_t num_spanned_blocks =
+        new_last_preallocated_block - last_preallocated_block_;
+      Allocate(block_size * last_preallocated_block_,
+               block_size * num_spanned_blocks);
+      last_preallocated_block_ = new_last_preallocated_block;
+    }
+  }
+
+  /*
+   * Pre-allocate space for a file.
+   */
+  virtual Status Allocate(off_t offset, off_t len) {
+    return Status::OK();
+  }
+
+  // Sync a file range with disk.
+  // offset is the starting byte of the file range to be synchronized.
+  // nbytes specifies the length of the range to be synchronized.
+  // This asks the OS to initiate flushing the cached data to disk,
+  // without waiting for completion.
+  // Default implementation does nothing.
+  virtual Status RangeSync(off_t offset, off_t nbytes) {
+    return Status::OK();
+  }
+
+ private:
+  size_t last_preallocated_block_;
+  size_t preallocation_block_size_;
+  // No copying allowed
+  WritableFile(const WritableFile&);
+  void operator=(const WritableFile&);
+};
+
+// A file abstraction for random reading and writing.
+class RandomRWFile {
+ public:
+  RandomRWFile() {}
+  virtual ~RandomRWFile() {}
+
+  // Write data from Slice data to file starting from offset
+  // Returns IOError on failure, but does not guarantee
+  // atomicity of a write.  Returns OK status on success.
+  //
+  // Safe for concurrent use.
+  virtual Status Write(uint64_t offset, const Slice& data) = 0;
+  // Read up to "n" bytes from the file starting at "offset".
+  // "scratch[0..n-1]" may be written by this routine.  Sets "*result"
+  // to the data that was read (including if fewer than "n" bytes were
+  // successfully read).  May set "*result" to point at data in
+  // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
+  // "*result" is used.  If an error was encountered, returns a non-OK
+  // status.
+  //
+  // Safe for concurrent use by multiple threads.
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const = 0;
+  virtual Status Close() = 0; // closes the file
+  virtual Status Sync() = 0; // sync data
+
+  /*
+   * Sync data and/or metadata as well.
+   * By default, sync only data.
+   * Override this method for environments where we need to sync
+   * metadata as well.
+   */
+  virtual Status Fsync() {
+    return Sync();
+  }
+
+  /*
+   * Pre-allocate space for a file.
+   */
+  virtual Status Allocate(off_t offset, off_t len) {
+    return Status::OK();
+  }
+
+ private:
+  // No copying allowed
+  RandomRWFile(const RandomRWFile&);
+  void operator=(const RandomRWFile&);
+};
+
+// Directory object represents collection of files and implements
+// filesystem operations that can be executed on directories.
+class Directory {
+ public:
+  virtual ~Directory() {}
+  // Fsync directory
+  virtual Status Fsync() = 0;
+};
+
+enum InfoLogLevel : unsigned char {
+  DEBUG_LEVEL = 0,
+  INFO_LEVEL,
+  WARN_LEVEL,
+  ERROR_LEVEL,
+  FATAL_LEVEL,
+  NUM_INFO_LOG_LEVELS,
+};
+
+// An interface for writing log messages.
+class Logger {
+ public:
+  enum { DO_NOT_SUPPORT_GET_LOG_FILE_SIZE = -1 };
+  explicit Logger(const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL)
+      : log_level_(log_level) {}
+  virtual ~Logger();
+
+  // Write an entry to the log file with the specified format.
+  virtual void Logv(const char* format, va_list ap) = 0;
+
+  // Write an entry to the log file with the specified log level
+  // and format.  Any log with level under the internal log level
+  // of *this (see @SetInfoLogLevel and @GetInfoLogLevel) will not be
+  // printed.
+  void Logv(const InfoLogLevel log_level, const char* format, va_list ap) {
+    static const char* kInfoLogLevelNames[5] = {"DEBUG", "INFO", "WARN",
+                                                "ERROR", "FATAL"};
+    if (log_level < log_level_) {
+      return;
+    }
+
+    if (log_level == InfoLogLevel::INFO_LEVEL) {
+      // Doesn't print log level if it is INFO level.
+      // This is to avoid unexpected performance regression after we add
+      // the feature of log level. All the logs before we add the feature
+      // are INFO level. We don't want to add extra costs to those existing
+      // logging.
+      Logv(format, ap);
+    } else {
+      char new_format[500];
+      snprintf(new_format, sizeof(new_format) - 1, "[%s] %s",
+               kInfoLogLevelNames[log_level], format);
+      Logv(new_format, ap);
+    }
+  }
+  virtual size_t GetLogFileSize() const {
+    return DO_NOT_SUPPORT_GET_LOG_FILE_SIZE;
+  }
+  // Flush to the OS buffers
+  virtual void Flush() {}
+  virtual InfoLogLevel GetInfoLogLevel() const { return log_level_; }
+  virtual void SetInfoLogLevel(const InfoLogLevel log_level) {
+    log_level_ = log_level;
+  }
+
+ private:
+  // No copying allowed
+  Logger(const Logger&);
+  void operator=(const Logger&);
+  InfoLogLevel log_level_;
+};
+
+
+// Identifies a locked file.
+class FileLock {
+ public:
+  FileLock() { }
+  virtual ~FileLock();
+ private:
+  // No copying allowed
+  FileLock(const FileLock&);
+  void operator=(const FileLock&);
+};
+
+extern void LogFlush(const shared_ptr<Logger>& info_log);
+
+extern void Log(const InfoLogLevel log_level,
+                const shared_ptr<Logger>& info_log, const char* format, ...);
+
+// a set of log functions with different log levels.
+extern void Debug(const shared_ptr<Logger>& info_log, const char* format, ...);
+extern void Info(const shared_ptr<Logger>& info_log, const char* format, ...);
+extern void Warn(const shared_ptr<Logger>& info_log, const char* format, ...);
+extern void Error(const shared_ptr<Logger>& info_log, const char* format, ...);
+extern void Fatal(const shared_ptr<Logger>& info_log, const char* format, ...);
+
+// Log the specified data to *info_log if info_log is non-nullptr.
+// The default info log level is InfoLogLevel::ERROR.
+extern void Log(const shared_ptr<Logger>& info_log, const char* format, ...)
+#   if defined(__GNUC__) || defined(__clang__)
+    __attribute__((__format__ (__printf__, 2, 3)))
+#   endif
+    ;
+
+extern void LogFlush(Logger *info_log);
+
+extern void Log(const InfoLogLevel log_level, Logger* info_log,
+                const char* format, ...);
+
+// The default info log level is InfoLogLevel::ERROR.
+extern void Log(Logger* info_log, const char* format, ...)
+#   if defined(__GNUC__) || defined(__clang__)
+    __attribute__((__format__ (__printf__, 2, 3)))
+#   endif
+    ;
+
+// a set of log functions with different log levels.
+extern void Debug(Logger* info_log, const char* format, ...);
+extern void Info(Logger* info_log, const char* format, ...);
+extern void Warn(Logger* info_log, const char* format, ...);
+extern void Error(Logger* info_log, const char* format, ...);
+extern void Fatal(Logger* info_log, const char* format, ...);
+
+// A utility routine: write "data" to the named file.
+extern Status WriteStringToFile(Env* env, const Slice& data,
+                                const std::string& fname,
+                                bool should_sync = false);
+
+// A utility routine: read contents of named file into *data
+extern Status ReadFileToString(Env* env, const std::string& fname,
+                               std::string* data);
+
+// An implementation of Env that forwards all calls to another Env.
+// May be useful to clients who wish to override just part of the
+// functionality of another Env.
+class EnvWrapper : public Env {
+ public:
+  // Initialize an EnvWrapper that delegates all calls to *t
+  explicit EnvWrapper(Env* t) : target_(t) { }
+  virtual ~EnvWrapper();
+
+  // Return the target to which this Env forwards all calls
+  Env* target() const { return target_; }
+
+  // The following text is boilerplate that forwards all methods to target()
+  Status NewSequentialFile(const std::string& f,
+                           unique_ptr<SequentialFile>* r,
+                           const EnvOptions& options) {
+    return target_->NewSequentialFile(f, r, options);
+  }
+  Status NewRandomAccessFile(const std::string& f,
+                             unique_ptr<RandomAccessFile>* r,
+                             const EnvOptions& options) {
+    return target_->NewRandomAccessFile(f, r, options);
+  }
+  Status NewWritableFile(const std::string& f, unique_ptr<WritableFile>* r,
+                         const EnvOptions& options) {
+    return target_->NewWritableFile(f, r, options);
+  }
+  Status NewRandomRWFile(const std::string& f, unique_ptr<RandomRWFile>* r,
+                         const EnvOptions& options) {
+    return target_->NewRandomRWFile(f, r, options);
+  }
+  virtual Status NewDirectory(const std::string& name,
+                              unique_ptr<Directory>* result) {
+    return target_->NewDirectory(name, result);
+  }
+  bool FileExists(const std::string& f) { return target_->FileExists(f); }
+  Status GetChildren(const std::string& dir, std::vector<std::string>* r) {
+    return target_->GetChildren(dir, r);
+  }
+  Status DeleteFile(const std::string& f) { return target_->DeleteFile(f); }
+  Status CreateDir(const std::string& d) { return target_->CreateDir(d); }
+  Status CreateDirIfMissing(const std::string& d) {
+    return target_->CreateDirIfMissing(d);
+  }
+  Status DeleteDir(const std::string& d) { return target_->DeleteDir(d); }
+  Status GetFileSize(const std::string& f, uint64_t* s) {
+    return target_->GetFileSize(f, s);
+  }
+
+  Status GetFileModificationTime(const std::string& fname,
+                                 uint64_t* file_mtime) {
+    return target_->GetFileModificationTime(fname, file_mtime);
+  }
+
+  Status RenameFile(const std::string& s, const std::string& t) {
+    return target_->RenameFile(s, t);
+  }
+  Status LockFile(const std::string& f, FileLock** l) {
+    return target_->LockFile(f, l);
+  }
+  Status UnlockFile(FileLock* l) { return target_->UnlockFile(l); }
+  void Schedule(void (*f)(void*), void* a, Priority pri) {
+    return target_->Schedule(f, a, pri);
+  }
+  void StartThread(void (*f)(void*), void* a) {
+    return target_->StartThread(f, a);
+  }
+  void WaitForJoin() { return target_->WaitForJoin(); }
+  virtual unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const {
+    return target_->GetThreadPoolQueueLen(pri);
+  }
+  virtual Status GetTestDirectory(std::string* path) {
+    return target_->GetTestDirectory(path);
+  }
+  virtual Status NewLogger(const std::string& fname,
+                           shared_ptr<Logger>* result) {
+    return target_->NewLogger(fname, result);
+  }
+  uint64_t NowMicros() {
+    return target_->NowMicros();
+  }
+  void SleepForMicroseconds(int micros) {
+    target_->SleepForMicroseconds(micros);
+  }
+  Status GetHostName(char* name, uint64_t len) {
+    return target_->GetHostName(name, len);
+  }
+  Status GetCurrentTime(int64_t* unix_time) {
+    return target_->GetCurrentTime(unix_time);
+  }
+  Status GetAbsolutePath(const std::string& db_path,
+      std::string* output_path) {
+    return target_->GetAbsolutePath(db_path, output_path);
+  }
+  void SetBackgroundThreads(int num, Priority pri) {
+    return target_->SetBackgroundThreads(num, pri);
+  }
+  std::string TimeToString(uint64_t time) {
+    return target_->TimeToString(time);
+  }
+
+ private:
+  Env* target_;
+};
+
+// Returns a new environment that stores its data in memory and delegates
+// all non-file-storage tasks to base_env. The caller must delete the result
+// when it is no longer needed.
+// *base_env must remain live while the result is in use.
+Env* NewMemEnv(Env* base_env);
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_ENV_H_
--- a/include/rocksdb/filter_policy.h
+++ b/include/rocksdb/filter_policy.h
@@ -0,0 +1,74 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A database can be configured with a custom FilterPolicy object.
+// This object is responsible for creating a small filter from a set
+// of keys.  These filters are stored in rocksdb and are consulted
+// automatically by rocksdb to decide whether or not to read some
+// information from disk. In many cases, a filter can cut down the
+// number of disk seeks form a handful to a single disk seek per
+// DB::Get() call.
+//
+// Most people will want to use the builtin bloom filter support (see
+// NewBloomFilterPolicy() below).
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_
+#define STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_
+
+#include <string>
+
+namespace rocksdb {
+
+class Slice;
+
+class FilterPolicy {
+ public:
+  virtual ~FilterPolicy();
+
+  // Return the name of this policy.  Note that if the filter encoding
+  // changes in an incompatible way, the name returned by this method
+  // must be changed.  Otherwise, old incompatible filters may be
+  // passed to methods of this type.
+  virtual const char* Name() const = 0;
+
+  // keys[0,n-1] contains a list of keys (potentially with duplicates)
+  // that are ordered according to the user supplied comparator.
+  // Append a filter that summarizes keys[0,n-1] to *dst.
+  //
+  // Warning: do not change the initial contents of *dst.  Instead,
+  // append the newly constructed filter to *dst.
+  virtual void CreateFilter(const Slice* keys, int n, std::string* dst)
+      const = 0;
+
+  // "filter" contains the data appended by a preceding call to
+  // CreateFilter() on this class.  This method must return true if
+  // the key was in the list of keys passed to CreateFilter().
+  // This method may return true or false if the key was not on the
+  // list, but it should aim to return false with a high probability.
+  virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0;
+};
+
+// Return a new filter policy that uses a bloom filter with approximately
+// the specified number of bits per key.  A good value for bits_per_key
+// is 10, which yields a filter with ~ 1% false positive rate.
+//
+// Callers must delete the result after any database that is using the
+// result has been closed.
+//
+// Note: if you are using a custom comparator that ignores some parts
+// of the keys being compared, you must not use NewBloomFilterPolicy()
+// and must provide your own FilterPolicy that also ignores the
+// corresponding parts of the keys.  For example, if the comparator
+// ignores trailing spaces, it would be incorrect to use a
+// FilterPolicy (like NewBloomFilterPolicy) that does not ignore
+// trailing spaces in keys.
+extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key);
+
+}
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_
--- a/include/rocksdb/flush_block_policy.h
+++ b/include/rocksdb/flush_block_policy.h
@@ -0,0 +1,58 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#include <string>
+
+namespace rocksdb {
+
+class Slice;
+class BlockBuilder;
+struct Options;
+
+// FlushBlockPolicy provides a configurable way to determine when to flush a
+// block in the block based tables,
+class FlushBlockPolicy {
+ public:
+  // Keep track of the key/value sequences and return the boolean value to
+  // determine if table builder should flush current data block.
+  virtual bool Update(const Slice& key,
+                      const Slice& value) = 0;
+
+  virtual ~FlushBlockPolicy() { }
+};
+
+class FlushBlockPolicyFactory {
+ public:
+  // Return the name of the flush block policy.
+  virtual const char* Name() const = 0;
+
+  // Return a new block flush policy that flushes data blocks by data size.
+  // FlushBlockPolicy may need to access the metadata of the data block
+  // builder to determine when to flush the blocks.
+  //
+  // Callers must delete the result after any database that is using the
+  // result has been closed.
+  virtual FlushBlockPolicy* NewFlushBlockPolicy(
+      const Options& options, const BlockBuilder& data_block_builder) const = 0;
+
+  virtual ~FlushBlockPolicyFactory() { }
+};
+
+class FlushBlockBySizePolicyFactory : public FlushBlockPolicyFactory {
+ public:
+  FlushBlockBySizePolicyFactory() {}
+
+  virtual const char* Name() const override {
+    return "FlushBlockBySizePolicyFactory";
+  }
+
+  virtual FlushBlockPolicy* NewFlushBlockPolicy(
+      const Options& options,
+      const BlockBuilder& data_block_builder) const override;
+};
+
+}  // rocksdb
--- a/include/rocksdb/iterator.h
+++ b/include/rocksdb/iterator.h
@@ -0,0 +1,106 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// An iterator yields a sequence of key/value pairs from a source.
+// The following class defines the interface.  Multiple implementations
+// are provided by this library.  In particular, iterators are provided
+// to access the contents of a Table or a DB.
+//
+// Multiple threads can invoke const methods on an Iterator without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same Iterator must use
+// external synchronization.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_
+#define STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_
+
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+class Iterator {
+ public:
+  Iterator();
+  virtual ~Iterator();
+
+  // An iterator is either positioned at a key/value pair, or
+  // not valid.  This method returns true iff the iterator is valid.
+  virtual bool Valid() const = 0;
+
+  // Position at the first key in the source.  The iterator is Valid()
+  // after this call iff the source is not empty.
+  virtual void SeekToFirst() = 0;
+
+  // Position at the last key in the source.  The iterator is
+  // Valid() after this call iff the source is not empty.
+  virtual void SeekToLast() = 0;
+
+  // Position at the first key in the source that at or past target
+  // The iterator is Valid() after this call iff the source contains
+  // an entry that comes at or past target.
+  virtual void Seek(const Slice& target) = 0;
+
+  // Moves to the next entry in the source.  After this call, Valid() is
+  // true iff the iterator was not positioned at the last entry in the source.
+  // REQUIRES: Valid()
+  virtual void Next() = 0;
+
+  // Moves to the previous entry in the source.  After this call, Valid() is
+  // true iff the iterator was not positioned at the first entry in source.
+  // REQUIRES: Valid()
+  virtual void Prev() = 0;
+
+  // Return the key for the current entry.  The underlying storage for
+  // the returned slice is valid only until the next modification of
+  // the iterator.
+  // REQUIRES: Valid()
+  virtual Slice key() const = 0;
+
+  // Return the value for the current entry.  The underlying storage for
+  // the returned slice is valid only until the next modification of
+  // the iterator.
+  // REQUIRES: !AtEnd() && !AtStart()
+  virtual Slice value() const = 0;
+
+  // If an error has occurred, return it.  Else return an ok status.
+  // If non-blocking IO is requested and this operation cannot be
+  // satisfied without doing some IO, then this returns Status::Incomplete().
+  virtual Status status() const = 0;
+
+  // Clients are allowed to register function/arg1/arg2 triples that
+  // will be invoked when this iterator is destroyed.
+  //
+  // Note that unlike all of the preceding methods, this method is
+  // not abstract and therefore clients should not override it.
+  typedef void (*CleanupFunction)(void* arg1, void* arg2);
+  void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2);
+
+ private:
+  struct Cleanup {
+    CleanupFunction function;
+    void* arg1;
+    void* arg2;
+    Cleanup* next;
+  };
+  Cleanup cleanup_;
+
+  // No copying allowed
+  Iterator(const Iterator&);
+  void operator=(const Iterator&);
+};
+
+// Return an empty iterator (yields nothing).
+extern Iterator* NewEmptyIterator();
+
+// Return an empty iterator with the specified status.
+extern Iterator* NewErrorIterator(const Status& status);
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_
--- a/include/rocksdb/ldb_tool.h
+++ b/include/rocksdb/ldb_tool.h
@@ -0,0 +1,18 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+#ifndef ROCKSDB_LITE
+#pragma once
+#include "rocksdb/options.h"
+
+namespace rocksdb {
+
+class LDBTool {
+ public:
+  void Run(int argc, char** argv, Options = Options());
+};
+
+} // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
--- a/include/rocksdb/memtablerep.h
+++ b/include/rocksdb/memtablerep.h
@@ -0,0 +1,284 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file contains the interface that must be implemented by any collection
+// to be used as the backing store for a MemTable. Such a collection must
+// satisfy the following properties:
+//  (1) It does not store duplicate items.
+//  (2) It uses MemTableRep::KeyComparator to compare items for iteration and
+//     equality.
+//  (3) It can be accessed concurrently by multiple readers and can support
+//     during reads. However, it needn't support multiple concurrent writes.
+//  (4) Items are never deleted.
+// The liberal use of assertions is encouraged to enforce (1).
+//
+// The factory will be passed an Arena object when a new MemTableRep is
+// requested. The API for this object is in rocksdb/arena.h.
+//
+// Users can implement their own memtable representations. We include three
+// types built in:
+//  - SkipListRep: This is the default; it is backed by a skip list.
+//  - HashSkipListRep: The memtable rep that is best used for keys that are
+//  structured like "prefix:suffix" where iteration within a prefix is
+//  common and iteration across different prefixes is rare. It is backed by
+//  a hash map where each bucket is a skip list.
+//  - VectorRep: This is backed by an unordered std::vector. On iteration, the
+// vector is sorted. It is intelligent about sorting; once the MarkReadOnly()
+// has been called, the vector will only be sorted once. It is optimized for
+// random-write-heavy workloads.
+//
+// The last four implementations are designed for situations in which
+// iteration over the entire collection is rare since doing so requires all the
+// keys to be copied into a sorted data structure.
+
+#pragma once
+
+#include <memory>
+#include <stdint.h>
+
+namespace rocksdb {
+
+class Arena;
+class LookupKey;
+class Slice;
+class SliceTransform;
+class Logger;
+
+typedef void* KeyHandle;
+
+class MemTableRep {
+ public:
+  // KeyComparator provides a means to compare keys, which are internal keys
+  // concatenated with values.
+  class KeyComparator {
+   public:
+    // Compare a and b. Return a negative value if a is less than b, 0 if they
+    // are equal, and a positive value if a is greater than b
+    virtual int operator()(const char* prefix_len_key1,
+                           const char* prefix_len_key2) const = 0;
+
+    virtual int operator()(const char* prefix_len_key,
+                           const Slice& key) const = 0;
+
+    virtual ~KeyComparator() { }
+  };
+
+  explicit MemTableRep(Arena* arena) : arena_(arena) {}
+
+  // Allocate a buf of len size for storing key. The idea is that a specific
+  // memtable representation knows its underlying data structure better. By
+  // allowing it to allocate memory, it can possibly put correlated stuff
+  // in consecutive memory area to make processor prefetching more efficient.
+  virtual KeyHandle Allocate(const size_t len, char** buf);
+
+  // Insert key into the collection. (The caller will pack key and value into a
+  // single buffer and pass that in as the parameter to Insert).
+  // REQUIRES: nothing that compares equal to key is currently in the
+  // collection.
+  virtual void Insert(KeyHandle handle) = 0;
+
+  // Returns true iff an entry that compares equal to key is in the collection.
+  virtual bool Contains(const char* key) const = 0;
+
+  // Notify this table rep that it will no longer be added to. By default, does
+  // nothing.
+  virtual void MarkReadOnly() { }
+
+  // Look up key from the mem table, since the first key in the mem table whose
+  // user_key matches the one given k, call the function callback_func(), with
+  // callback_args directly forwarded as the first parameter, and the mem table
+  // key as the second parameter. If the return value is false, then terminates.
+  // Otherwise, go through the next key.
+  // It's safe for Get() to terminate after having finished all the potential
+  // key for the k.user_key(), or not.
+  //
+  // Default:
+  // Get() function with a default value of dynamically construct an iterator,
+  // seek and call the call back function.
+  virtual void Get(const LookupKey& k, void* callback_args,
+                   bool (*callback_func)(void* arg, const char* entry));
+
+  // Report an approximation of how much memory has been used other than memory
+  // that was allocated through the arena.
+  virtual size_t ApproximateMemoryUsage() = 0;
+
+  virtual ~MemTableRep() { }
+
+  // Iteration over the contents of a skip collection
+  class Iterator {
+   public:
+    // Initialize an iterator over the specified collection.
+    // The returned iterator is not valid.
+    // explicit Iterator(const MemTableRep* collection);
+    virtual ~Iterator() {}
+
+    // Returns true iff the iterator is positioned at a valid node.
+    virtual bool Valid() const = 0;
+
+    // Returns the key at the current position.
+    // REQUIRES: Valid()
+    virtual const char* key() const = 0;
+
+    // Advances to the next position.
+    // REQUIRES: Valid()
+    virtual void Next() = 0;
+
+    // Advances to the previous position.
+    // REQUIRES: Valid()
+    virtual void Prev() = 0;
+
+    // Advance to the first entry with a key >= target
+    virtual void Seek(const Slice& internal_key, const char* memtable_key) = 0;
+
+    // Position at the first entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    virtual void SeekToFirst() = 0;
+
+    // Position at the last entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    virtual void SeekToLast() = 0;
+  };
+
+  // Return an iterator over the keys in this representation.
+  // arena: If not null, the arena needs to be used to allocate the Iterator.
+  //        When destroying the iterator, the caller will not call "delete"
+  //        but Iterator::~Iterator() directly. The destructor needs to destroy
+  //        all the states but those allocated in arena.
+  virtual Iterator* GetIterator(Arena* arena = nullptr) = 0;
+
+  // Return an iterator over at least the keys with the specified user key. The
+  // iterator may also allow access to other keys, but doesn't have to. Default:
+  // GetIterator().
+  virtual Iterator* GetIterator(const Slice& user_key) {
+    return GetIterator(nullptr);
+  }
+
+  // Return an iterator that has a special Seek semantics. The result of
+  // a Seek might only include keys with the same prefix as the target key.
+  // arena: If not null, the arena needs to be used to allocate the Iterator.
+  //        When destroying the iterator, the caller will not call "delete"
+  //        but Iterator::~Iterator() directly. The destructor needs to destroy
+  //        all the states but those allocated in arena.
+  virtual Iterator* GetDynamicPrefixIterator(Arena* arena = nullptr) {
+    return GetIterator(arena);
+  }
+
+  // Return true if the current MemTableRep supports merge operator.
+  // Default: true
+  virtual bool IsMergeOperatorSupported() const { return true; }
+
+  // Return true if the current MemTableRep supports snapshot
+  // Default: true
+  virtual bool IsSnapshotSupported() const { return true; }
+
+ protected:
+  // When *key is an internal key concatenated with the value, returns the
+  // user key.
+  virtual Slice UserKey(const char* key) const;
+
+  Arena* arena_;
+};
+
+// This is the base class for all factories that are used by RocksDB to create
+// new MemTableRep objects
+class MemTableRepFactory {
+ public:
+  virtual ~MemTableRepFactory() {}
+  virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
+                                         Arena*, const SliceTransform*,
+                                         Logger* logger) = 0;
+  virtual const char* Name() const = 0;
+};
+
+// This uses a skip list to store keys. It is the default.
+class SkipListFactory : public MemTableRepFactory {
+ public:
+  virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
+                                         Arena*, const SliceTransform*,
+                                         Logger* logger) override;
+  virtual const char* Name() const override { return "SkipListFactory"; }
+};
+
+#ifndef ROCKSDB_LITE
+// This creates MemTableReps that are backed by an std::vector. On iteration,
+// the vector is sorted. This is useful for workloads where iteration is very
+// rare and writes are generally not issued after reads begin.
+//
+// Parameters:
+//   count: Passed to the constructor of the underlying std::vector of each
+//     VectorRep. On initialization, the underlying array will be at least count
+//     bytes reserved for usage.
+class VectorRepFactory : public MemTableRepFactory {
+  const size_t count_;
+
+ public:
+  explicit VectorRepFactory(size_t count = 0) : count_(count) { }
+  virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
+                                         Arena*, const SliceTransform*,
+                                         Logger* logger) override;
+  virtual const char* Name() const override {
+    return "VectorRepFactory";
+  }
+};
+
+// This class contains a fixed array of buckets, each
+// pointing to a skiplist (null if the bucket is empty).
+// bucket_count: number of fixed array buckets
+// skiplist_height: the max height of the skiplist
+// skiplist_branching_factor: probabilistic size ratio between adjacent
+//                            link lists in the skiplist
+extern MemTableRepFactory* NewHashSkipListRepFactory(
+    size_t bucket_count = 1000000, int32_t skiplist_height = 4,
+    int32_t skiplist_branching_factor = 4
+);
+
+// The factory is to create memtables with a hashed linked list:
+// it contains a fixed array of buckets, each pointing to a sorted single
+// linked list (null if the bucket is empty).
+// @bucket_count: number of fixed array buckets
+// @huge_page_tlb_size: if <=0, allocate the hash table bytes from malloc.
+//                      Otherwise from huge page TLB. The user needs to reserve
+//                      huge pages for it to be allocated, like:
+//                          sysctl -w vm.nr_hugepages=20
+//                      See linux doc Documentation/vm/hugetlbpage.txt
+extern MemTableRepFactory* NewHashLinkListRepFactory(
+    size_t bucket_count = 50000, size_t huge_page_tlb_size = 0);
+
+// This factory creates a cuckoo-hashing based mem-table representation.
+// Cuckoo-hash is a closed-hash strategy, in which all key/value pairs
+// are stored in the bucket array itself intead of in some data structures
+// external to the bucket array.  In addition, each key in cuckoo hash
+// has a constant number of possible buckets in the bucket array.  These
+// two properties together makes cuckoo hash more memory efficient and
+// a constant worst-case read time.  Cuckoo hash is best suitable for
+// point-lookup workload.
+//
+// When inserting a key / value, it first checks whether one of its possible
+// buckets is empty.  If so, the key / value will be inserted to that vacant
+// bucket.  Otherwise, one of the keys originally stored in one of these
+// possible buckets will be "kicked out" and move to one of its possible
+// buckets (and possibly kicks out another victim.)  In the current
+// implementation, such "kick-out" path is bounded.  If it cannot find a
+// "kick-out" path for a specific key, this key will be stored in a backup
+// structure, and the current memtable to be forced to immutable.
+//
+// Note that currently this mem-table representation does not support
+// snapshot (i.e., it only queries latest state) and iterators.  In addition,
+// MultiGet operation might also lose its atomicity due to the lack of
+// snapshot support.
+//
+// Parameters:
+//   write_buffer_size: the write buffer size in bytes.
+//   average_data_size: the average size of key + value in bytes.  This value
+//     together with write_buffer_size will be used to compute the number
+//     of buckets.
+//   hash_function_count: the number of hash functions that will be used by
+//     the cuckoo-hash.  The number also equals to the number of possible
+//     buckets each key will have.
+extern MemTableRepFactory* NewHashCuckooRepFactory(
+    size_t write_buffer_size, size_t average_data_size = 64,
+    unsigned int hash_function_count = 4);
+#endif  // ROCKSDB_LITE
+}  // namespace rocksdb
--- a/include/rocksdb/merge_operator.h
+++ b/include/rocksdb/merge_operator.h
@@ -0,0 +1,182 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_
+#define STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_
+
+#include <memory>
+#include <string>
+#include <deque>
+#include "rocksdb/slice.h"
+
+namespace rocksdb {
+
+class Slice;
+class Logger;
+
+// The Merge Operator
+//
+// Essentially, a MergeOperator specifies the SEMANTICS of a merge, which only
+// client knows. It could be numeric addition, list append, string
+// concatenation, edit data structure, ... , anything.
+// The library, on the other hand, is concerned with the exercise of this
+// interface, at the right time (during get, iteration, compaction...)
+//
+// To use merge, the client needs to provide an object implementing one of
+// the following interfaces:
+//  a) AssociativeMergeOperator - for most simple semantics (always take
+//    two values, and merge them into one value, which is then put back
+//    into rocksdb); numeric addition and string concatenation are examples;
+//
+//  b) MergeOperator - the generic class for all the more abstract / complex
+//    operations; one method (FullMerge) to merge a Put/Delete value with a
+//    merge operand; and another method (PartialMerge) that merges multiple
+//    operands together. this is especially useful if your key values have
+//    complex structures but you would still like to support client-specific
+//    incremental updates.
+//
+// AssociativeMergeOperator is simpler to implement. MergeOperator is simply
+// more powerful.
+//
+// Refer to rocksdb-merge wiki for more details and example implementations.
+//
+class MergeOperator {
+ public:
+  virtual ~MergeOperator() {}
+
+  // Gives the client a way to express the read -> modify -> write semantics
+  // key:      (IN)    The key that's associated with this merge operation.
+  //                   Client could multiplex the merge operator based on it
+  //                   if the key space is partitioned and different subspaces
+  //                   refer to different types of data which have different
+  //                   merge operation semantics
+  // existing: (IN)    null indicates that the key does not exist before this op
+  // operand_list:(IN) the sequence of merge operations to apply, front() first.
+  // new_value:(OUT)   Client is responsible for filling the merge result here
+  // logger:   (IN)    Client could use this to log errors during merge.
+  //
+  // Return true on success.
+  // All values passed in will be client-specific values. So if this method
+  // returns false, it is because client specified bad data or there was
+  // internal corruption. This will be treated as an error by the library.
+  //
+  // Also make use of the *logger for error messages.
+  virtual bool FullMerge(const Slice& key,
+                         const Slice* existing_value,
+                         const std::deque<std::string>& operand_list,
+                         std::string* new_value,
+                         Logger* logger) const = 0;
+
+  // This function performs merge(left_op, right_op)
+  // when both the operands are themselves merge operation types
+  // that you would have passed to a DB::Merge() call in the same order
+  // (i.e.: DB::Merge(key,left_op), followed by DB::Merge(key,right_op)).
+  //
+  // PartialMerge should combine them into a single merge operation that is
+  // saved into *new_value, and then it should return true.
+  // *new_value should be constructed such that a call to
+  // DB::Merge(key, *new_value) would yield the same result as a call
+  // to DB::Merge(key, left_op) followed by DB::Merge(key, right_op).
+  //
+  // The default implementation of PartialMergeMulti will use this function
+  // as a helper, for backward compatibility.  Any successor class of
+  // MergeOperator should either implement PartialMerge or PartialMergeMulti,
+  // although implementing PartialMergeMulti is suggested as it is in general
+  // more effective to merge multiple operands at a time instead of two
+  // operands at a time.
+  //
+  // If it is impossible or infeasible to combine the two operations,
+  // leave new_value unchanged and return false. The library will
+  // internally keep track of the operations, and apply them in the
+  // correct order once a base-value (a Put/Delete/End-of-Database) is seen.
+  //
+  // TODO: Presently there is no way to differentiate between error/corruption
+  // and simply "return false". For now, the client should simply return
+  // false in any case it cannot perform partial-merge, regardless of reason.
+  // If there is corruption in the data, handle it in the FullMerge() function,
+  // and return false there.  The default implementation of PartialMerge will
+  // always return false.
+  virtual bool PartialMerge(const Slice& key, const Slice& left_operand,
+                            const Slice& right_operand, std::string* new_value,
+                            Logger* logger) const {
+    return false;
+  }
+
+  // This function performs merge when all the operands are themselves merge
+  // operation types that you would have passed to a DB::Merge() call in the
+  // same order (front() first)
+  // (i.e. DB::Merge(key, operand_list[0]), followed by
+  //  DB::Merge(key, operand_list[1]), ...)
+  //
+  // PartialMergeMulti should combine them into a single merge operation that is
+  // saved into *new_value, and then it should return true.  *new_value should
+  // be constructed such that a call to DB::Merge(key, *new_value) would yield
+  // the same result as subquential individual calls to DB::Merge(key, operand)
+  // for each operand in operand_list from front() to back().
+  //
+  // The PartialMergeMulti function will be called only when the list of
+  // operands are long enough. The minimum amount of operands that will be
+  // passed to the function are specified by the "min_partial_merge_operands"
+  // option.
+  //
+  // In the default implementation, PartialMergeMulti will invoke PartialMerge
+  // multiple times, where each time it only merges two operands.  Developers
+  // should either implement PartialMergeMulti, or implement PartialMerge which
+  // is served as the helper function of the default PartialMergeMulti.
+  virtual bool PartialMergeMulti(const Slice& key,
+                                 const std::deque<Slice>& operand_list,
+                                 std::string* new_value, Logger* logger) const;
+
+  // The name of the MergeOperator. Used to check for MergeOperator
+  // mismatches (i.e., a DB created with one MergeOperator is
+  // accessed using a different MergeOperator)
+  // TODO: the name is currently not stored persistently and thus
+  //       no checking is enforced. Client is responsible for providing
+  //       consistent MergeOperator between DB opens.
+  virtual const char* Name() const = 0;
+};
+
+// The simpler, associative merge operator.
+class AssociativeMergeOperator : public MergeOperator {
+ public:
+  virtual ~AssociativeMergeOperator() {}
+
+  // Gives the client a way to express the read -> modify -> write semantics
+  // key:           (IN) The key that's associated with this merge operation.
+  // existing_value:(IN) null indicates the key does not exist before this op
+  // value:         (IN) the value to update/merge the existing_value with
+  // new_value:    (OUT) Client is responsible for filling the merge result here
+  // logger:        (IN) Client could use this to log errors during merge.
+  //
+  // Return true on success.
+  // All values passed in will be client-specific values. So if this method
+  // returns false, it is because client specified bad data or there was
+  // internal corruption. The client should assume that this will be treated
+  // as an error by the library.
+  virtual bool Merge(const Slice& key,
+                     const Slice* existing_value,
+                     const Slice& value,
+                     std::string* new_value,
+                     Logger* logger) const = 0;
+
+
+ private:
+  // Default implementations of the MergeOperator functions
+  virtual bool FullMerge(const Slice& key,
+                         const Slice* existing_value,
+                         const std::deque<std::string>& operand_list,
+                         std::string* new_value,
+                         Logger* logger) const override;
+
+  virtual bool PartialMerge(const Slice& key,
+                            const Slice& left_operand,
+                            const Slice& right_operand,
+                            std::string* new_value,
+                            Logger* logger) const override;
+};
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -0,0 +1,975 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_
+#define STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_
+
+#include <stddef.h>
+#include <string>
+#include <memory>
+#include <vector>
+#include <stdint.h>
+
+#include "rocksdb/version.h"
+#include "rocksdb/universal_compaction.h"
+
+namespace rocksdb {
+
+class Cache;
+class CompactionFilter;
+class CompactionFilterFactory;
+class CompactionFilterFactoryV2;
+class Comparator;
+class Env;
+enum InfoLogLevel : unsigned char;
+class FilterPolicy;
+class Logger;
+class MergeOperator;
+class Snapshot;
+class TableFactory;
+class MemTableRepFactory;
+class TablePropertiesCollectorFactory;
+class Slice;
+class SliceTransform;
+class Statistics;
+class InternalKeyComparator;
+
+using std::shared_ptr;
+
+// DB contents are stored in a set of blocks, each of which holds a
+// sequence of key,value pairs.  Each block may be compressed before
+// being stored in a file.  The following enum describes which
+// compression method (if any) is used to compress a block.
+enum CompressionType : char {
+  // NOTE: do not change the values of existing entries, as these are
+  // part of the persistent format on disk.
+  kNoCompression = 0x0, kSnappyCompression = 0x1, kZlibCompression = 0x2,
+  kBZip2Compression = 0x3, kLZ4Compression = 0x4, kLZ4HCCompression = 0x5
+};
+
+enum CompactionStyle : char {
+  kCompactionStyleLevel = 0x0,      // level based compaction style
+  kCompactionStyleUniversal = 0x1,  // Universal compaction style
+  kCompactionStyleFIFO = 0x2,       // FIFO compaction style
+};
+
+struct CompactionOptionsFIFO {
+  // once the total sum of table files reaches this, we will delete the oldest
+  // table file
+  // Default: 1GB
+  uint64_t max_table_files_size;
+
+  CompactionOptionsFIFO() : max_table_files_size(1 * 1024 * 1024 * 1024) {}
+};
+
+// Compression options for different compression algorithms like Zlib
+struct CompressionOptions {
+  int window_bits;
+  int level;
+  int strategy;
+  CompressionOptions() : window_bits(-14), level(-1), strategy(0) {}
+  CompressionOptions(int wbits, int _lev, int _strategy)
+      : window_bits(wbits), level(_lev), strategy(_strategy) {}
+};
+
+enum UpdateStatus {    // Return status For inplace update callback
+  UPDATE_FAILED   = 0, // Nothing to update
+  UPDATED_INPLACE = 1, // Value updated inplace
+  UPDATED         = 2, // No inplace update. Merged value set
+};
+
+struct Options;
+
+struct ColumnFamilyOptions {
+  // Some functions that make it easier to optimize RocksDB
+
+  // Use this if you don't need to keep the data sorted, i.e. you'll never use
+  // an iterator, only Put() and Get() API calls
+  ColumnFamilyOptions* OptimizeForPointLookup();
+
+  // Default values for some parameters in ColumnFamilyOptions are not
+  // optimized for heavy workloads and big datasets, which means you might
+  // observe write stalls under some conditions. As a starting point for tuning
+  // RocksDB options, use the following two functions:
+  // * OptimizeLevelStyleCompaction -- optimizes level style compaction
+  // * OptimizeUniversalStyleCompaction -- optimizes universal style compaction
+  // Universal style compaction is focused on reducing Write Amplification
+  // Factor for big data sets, but increases Space Amplification. You can learn
+  // more about the different styles here:
+  // https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide
+  // Make sure to also call IncreaseParallelism(), which will provide the
+  // biggest performance gains.
+  // Note: we might use more memory than memtable_memory_budget during high
+  // write rate period
+  ColumnFamilyOptions* OptimizeLevelStyleCompaction(
+      uint64_t memtable_memory_budget = 512 * 1024 * 1024);
+  ColumnFamilyOptions* OptimizeUniversalStyleCompaction(
+      uint64_t memtable_memory_budget = 512 * 1024 * 1024);
+
+  // -------------------
+  // Parameters that affect behavior
+
+  // Comparator used to define the order of keys in the table.
+  // Default: a comparator that uses lexicographic byte-wise ordering
+  //
+  // REQUIRES: The client must ensure that the comparator supplied
+  // here has the same name and orders keys *exactly* the same as the
+  // comparator provided to previous open calls on the same DB.
+  const Comparator* comparator;
+
+  // REQUIRES: The client must provide a merge operator if Merge operation
+  // needs to be accessed. Calling Merge on a DB without a merge operator
+  // would result in Status::NotSupported. The client must ensure that the
+  // merge operator supplied here has the same name and *exactly* the same
+  // semantics as the merge operator provided to previous open calls on
+  // the same DB. The only exception is reserved for upgrade, where a DB
+  // previously without a merge operator is introduced to Merge operation
+  // for the first time. It's necessary to specify a merge operator when
+  // openning the DB in this case.
+  // Default: nullptr
+  shared_ptr<MergeOperator> merge_operator;
+
+  // A single CompactionFilter instance to call into during compaction.
+  // Allows an application to modify/delete a key-value during background
+  // compaction.
+  //
+  // If the client requires a new compaction filter to be used for different
+  // compaction runs, it can specify compaction_filter_factory instead of this
+  // option.  The client should specify only one of the two.
+  // compaction_filter takes precedence over compaction_filter_factory if
+  // client specifies both.
+  //
+  // If multithreaded compaction is being used, the supplied CompactionFilter
+  // instance may be used from different threads concurrently and so should be
+  // thread-safe.
+  //
+  // Default: nullptr
+  const CompactionFilter* compaction_filter;
+
+  // This is a factory that provides compaction filter objects which allow
+  // an application to modify/delete a key-value during background compaction.
+  //
+  // A new filter will be created on each compaction run.  If multithreaded
+  // compaction is being used, each created CompactionFilter will only be used
+  // from a single thread and so does not need to be thread-safe.
+  //
+  // Default: a factory that doesn't provide any object
+  std::shared_ptr<CompactionFilterFactory> compaction_filter_factory;
+
+  // Version TWO of the compaction_filter_factory
+  // It supports rolling compaction
+  //
+  // Default: a factory that doesn't provide any object
+  std::shared_ptr<CompactionFilterFactoryV2> compaction_filter_factory_v2;
+
+  // -------------------
+  // Parameters that affect performance
+
+  // Amount of data to build up in memory (backed by an unsorted log
+  // on disk) before converting to a sorted on-disk file.
+  //
+  // Larger values increase performance, especially during bulk loads.
+  // Up to max_write_buffer_number write buffers may be held in memory
+  // at the same time,
+  // so you may wish to adjust this parameter to control memory usage.
+  // Also, a larger write buffer will result in a longer recovery time
+  // the next time the database is opened.
+  //
+  // Default: 4MB
+  size_t write_buffer_size;
+
+  // The maximum number of write buffers that are built up in memory.
+  // The default is 2, so that when 1 write buffer is being flushed to
+  // storage, new writes can continue to the other write buffer.
+  // Default: 2
+  int max_write_buffer_number;
+
+  // The minimum number of write buffers that will be merged together
+  // before writing to storage.  If set to 1, then
+  // all write buffers are fushed to L0 as individual files and this increases
+  // read amplification because a get request has to check in all of these
+  // files. Also, an in-memory merge may result in writing lesser
+  // data to storage if there are duplicate records in each of these
+  // individual write buffers.  Default: 1
+  int min_write_buffer_number_to_merge;
+
+  // Control over blocks (user data is stored in a set of blocks, and
+  // a block is the unit of reading from disk).
+
+  // If non-NULL use the specified cache for blocks.
+  // If NULL, rocksdb will automatically create and use an 8MB internal cache.
+  // Default: nullptr
+  shared_ptr<Cache> block_cache;
+
+  // If non-NULL use the specified cache for compressed blocks.
+  // If NULL, rocksdb will not use a compressed block cache.
+  // Default: nullptr
+  shared_ptr<Cache> block_cache_compressed;
+
+  // Approximate size of user data packed per block.  Note that the
+  // block size specified here corresponds to uncompressed data.  The
+  // actual size of the unit read from disk may be smaller if
+  // compression is enabled.  This parameter can be changed dynamically.
+  //
+  // Default: 4K
+  size_t block_size;
+
+  // Number of keys between restart points for delta encoding of keys.
+  // This parameter can be changed dynamically.  Most clients should
+  // leave this parameter alone.
+  //
+  // Default: 16
+  int block_restart_interval;
+
+  // Compress blocks using the specified compression algorithm.  This
+  // parameter can be changed dynamically.
+  //
+  // Default: kSnappyCompression, which gives lightweight but fast
+  // compression.
+  //
+  // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz:
+  //    ~200-500MB/s compression
+  //    ~400-800MB/s decompression
+  // Note that these speeds are significantly faster than most
+  // persistent storage speeds, and therefore it is typically never
+  // worth switching to kNoCompression.  Even if the input data is
+  // incompressible, the kSnappyCompression implementation will
+  // efficiently detect that and will switch to uncompressed mode.
+  CompressionType compression;
+
+  // Different levels can have different compression policies. There
+  // are cases where most lower levels would like to quick compression
+  // algorithm while the higher levels (which have more data) use
+  // compression algorithms that have better compression but could
+  // be slower. This array, if non nullptr, should have an entry for
+  // each level of the database. This array, if non nullptr, overides the
+  // value specified in the previous field 'compression'. The caller is
+  // reponsible for allocating memory and initializing the values in it
+  // before invoking Open(). The caller is responsible for freeing this
+  // array and it could be freed anytime after the return from Open().
+  // This could have been a std::vector but that makes the equivalent
+  // java/C api hard to construct.
+  std::vector<CompressionType> compression_per_level;
+
+  // different options for compression algorithms
+  CompressionOptions compression_opts;
+
+  // If non-nullptr, use the specified filter policy to reduce disk reads.
+  // Many applications will benefit from passing the result of
+  // NewBloomFilterPolicy() here.
+  //
+  // Default: nullptr
+  const FilterPolicy* filter_policy;
+
+  // If non-nullptr, use the specified function to determine the
+  // prefixes for keys.  These prefixes will be placed in the filter.
+  // Depending on the workload, this can reduce the number of read-IOP
+  // cost for scans when a prefix is passed via ReadOptions to
+  // db.NewIterator().  For prefix filtering to work properly,
+  // "prefix_extractor" and "comparator" must be such that the following
+  // properties hold:
+  //
+  // 1) key.starts_with(prefix(key))
+  // 2) Compare(prefix(key), key) <= 0.
+  // 3) If Compare(k1, k2) <= 0, then Compare(prefix(k1), prefix(k2)) <= 0
+  // 4) prefix(prefix(key)) == prefix(key)
+  //
+  // Default: nullptr
+  std::shared_ptr<const SliceTransform> prefix_extractor;
+
+  // If true, place whole keys in the filter (not just prefixes).
+  // This must generally be true for gets to be efficient.
+  //
+  // Default: true
+  bool whole_key_filtering;
+
+  // Number of levels for this database
+  int num_levels;
+
+  // Number of files to trigger level-0 compaction. A value <0 means that
+  // level-0 compaction will not be triggered by number of files at all.
+  //
+  // Default: 4
+  int level0_file_num_compaction_trigger;
+
+  // Soft limit on number of level-0 files. We start slowing down writes at this
+  // point. A value <0 means that no writing slow down will be triggered by
+  // number of files in level-0.
+  int level0_slowdown_writes_trigger;
+
+  // Maximum number of level-0 files.  We stop writes at this point.
+  int level0_stop_writes_trigger;
+
+  // Maximum level to which a new compacted memtable is pushed if it
+  // does not create overlap.  We try to push to level 2 to avoid the
+  // relatively expensive level 0=>1 compactions and to avoid some
+  // expensive manifest file operations.  We do not push all the way to
+  // the largest level since that can generate a lot of wasted disk
+  // space if the same key space is being repeatedly overwritten.
+  int max_mem_compaction_level;
+
+  // Target file size for compaction.
+  // target_file_size_base is per-file size for level-1.
+  // Target file size for level L can be calculated by
+  // target_file_size_base * (target_file_size_multiplier ^ (L-1))
+  // For example, if target_file_size_base is 2MB and
+  // target_file_size_multiplier is 10, then each file on level-1 will
+  // be 2MB, and each file on level 2 will be 20MB,
+  // and each file on level-3 will be 200MB.
+
+  // by default target_file_size_base is 2MB.
+  int target_file_size_base;
+  // by default target_file_size_multiplier is 1, which means
+  // by default files in different levels will have similar size.
+  int target_file_size_multiplier;
+
+  // Control maximum total data size for a level.
+  // max_bytes_for_level_base is the max total for level-1.
+  // Maximum number of bytes for level L can be calculated as
+  // (max_bytes_for_level_base) * (max_bytes_for_level_multiplier ^ (L-1))
+  // For example, if max_bytes_for_level_base is 20MB, and if
+  // max_bytes_for_level_multiplier is 10, total data size for level-1
+  // will be 20MB, total file size for level-2 will be 200MB,
+  // and total file size for level-3 will be 2GB.
+
+  // by default 'max_bytes_for_level_base' is 10MB.
+  uint64_t max_bytes_for_level_base;
+  // by default 'max_bytes_for_level_base' is 10.
+  int max_bytes_for_level_multiplier;
+
+  // Different max-size multipliers for different levels.
+  // These are multiplied by max_bytes_for_level_multiplier to arrive
+  // at the max-size of each level.
+  // Default: 1
+  std::vector<int> max_bytes_for_level_multiplier_additional;
+
+  // Maximum number of bytes in all compacted files.  We avoid expanding
+  // the lower level file set of a compaction if it would make the
+  // total compaction cover more than
+  // (expanded_compaction_factor * targetFileSizeLevel()) many bytes.
+  int expanded_compaction_factor;
+
+  // Maximum number of bytes in all source files to be compacted in a
+  // single compaction run. We avoid picking too many files in the
+  // source level so that we do not exceed the total source bytes
+  // for compaction to exceed
+  // (source_compaction_factor * targetFileSizeLevel()) many bytes.
+  // Default:1, i.e. pick maxfilesize amount of data as the source of
+  // a compaction.
+  int source_compaction_factor;
+
+  // Control maximum bytes of overlaps in grandparent (i.e., level+2) before we
+  // stop building a single file in a level->level+1 compaction.
+  int max_grandparent_overlap_factor;
+
+  // Disable compaction triggered by seek.
+  // With bloomfilter and fast storage, a miss on one level
+  // is very cheap if the file handle is cached in table cache
+  // (which is true if max_open_files is large).
+  // Default: true
+  bool disable_seek_compaction;
+
+  // Puts are delayed 0-1 ms when any level has a compaction score that exceeds
+  // soft_rate_limit. This is ignored when == 0.0.
+  // CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does not
+  // hold, RocksDB will set soft_rate_limit = hard_rate_limit
+  // Default: 0 (disabled)
+  double soft_rate_limit;
+
+  // Puts are delayed 1ms at a time when any level has a compaction score that
+  // exceeds hard_rate_limit. This is ignored when <= 1.0.
+  // Default: 0 (disabled)
+  double hard_rate_limit;
+
+  // Max time a put will be stalled when hard_rate_limit is enforced. If 0, then
+  // there is no limit.
+  // Default: 1000
+  unsigned int rate_limit_delay_max_milliseconds;
+
+  // Disable block cache. If this is set to true,
+  // then no block cache should be used, and the block_cache should
+  // point to a nullptr object.
+  // Default: false
+  bool no_block_cache;
+
+  // size of one block in arena memory allocation.
+  // If <= 0, a proper value is automatically calculated (usually 1/10 of
+  // writer_buffer_size).
+  //
+  // There are two additonal restriction of the The specified size:
+  // (1) size should be in the range of [4096, 2 << 30] and
+  // (2) be the multiple of the CPU word (which helps with the memory
+  // alignment).
+  //
+  // We'll automatically check and adjust the size number to make sure it
+  // conforms to the restrictions.
+  //
+  // Default: 0
+  size_t arena_block_size;
+
+  // Disable automatic compactions. Manual compactions can still
+  // be issued on this column family
+  bool disable_auto_compactions;
+
+  // Purge duplicate/deleted keys when a memtable is flushed to storage.
+  // Default: true
+  bool purge_redundant_kvs_while_flush;
+
+  // This is used to close a block before it reaches the configured
+  // 'block_size'. If the percentage of free space in the current block is less
+  // than this specified number and adding a new record to the block will
+  // exceed the configured block size, then this block will be closed and the
+  // new record will be written to the next block.
+  // Default is 10.
+  int block_size_deviation;
+
+  // The compaction style. Default: kCompactionStyleLevel
+  CompactionStyle compaction_style;
+
+  // If true, compaction will verify checksum on every read that happens
+  // as part of compaction
+  // Default: true
+  bool verify_checksums_in_compaction;
+
+  // The options needed to support Universal Style compactions
+  CompactionOptionsUniversal compaction_options_universal;
+
+  // The options for FIFO compaction style
+  CompactionOptionsFIFO compaction_options_fifo;
+
+  // Use KeyMayExist API to filter deletes when this is true.
+  // If KeyMayExist returns false, i.e. the key definitely does not exist, then
+  // the delete is a noop. KeyMayExist only incurs in-memory look up.
+  // This optimization avoids writing the delete to storage when appropriate.
+  // Default: false
+  bool filter_deletes;
+
+  // An iteration->Next() sequentially skips over keys with the same
+  // user-key unless this option is set. This number specifies the number
+  // of keys (with the same userkey) that will be sequentially
+  // skipped before a reseek is issued.
+  // Default: 8
+  uint64_t max_sequential_skip_in_iterations;
+
+  // This is a factory that provides MemTableRep objects.
+  // Default: a factory that provides a skip-list-based implementation of
+  // MemTableRep.
+  std::shared_ptr<MemTableRepFactory> memtable_factory;
+
+  // This is a factory that provides TableFactory objects.
+  // Default: a factory that provides a default implementation of
+  // Table and TableBuilder.
+  std::shared_ptr<TableFactory> table_factory;
+
+  // This option allows user to to collect their own interested statistics of
+  // the tables.
+  // Default: empty vector -- no user-defined statistics collection will be
+  // performed.
+  typedef std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
+      TablePropertiesCollectorFactories;
+  TablePropertiesCollectorFactories table_properties_collector_factories;
+
+  // Allows thread-safe inplace updates.
+  // If inplace_callback function is not set,
+  //   Put(key, new_value) will update inplace the existing_value iff
+  //   * key exists in current memtable
+  //   * new sizeof(new_value) <= sizeof(existing_value)
+  //   * existing_value for that key is a put i.e. kTypeValue
+  // If inplace_callback function is set, check doc for inplace_callback.
+  // Default: false.
+  bool inplace_update_support;
+
+  // Number of locks used for inplace update
+  // Default: 10000, if inplace_update_support = true, else 0.
+  size_t inplace_update_num_locks;
+
+  // existing_value - pointer to previous value (from both memtable and sst).
+  //                  nullptr if key doesn't exist
+  // existing_value_size - pointer to size of existing_value).
+  //                       nullptr if key doesn't exist
+  // delta_value - Delta value to be merged with the existing_value.
+  //               Stored in transaction logs.
+  // merged_value - Set when delta is applied on the previous value.
+
+  // Applicable only when inplace_update_support is true,
+  // this callback function is called at the time of updating the memtable
+  // as part of a Put operation, lets say Put(key, delta_value). It allows the
+  // 'delta_value' specified as part of the Put operation to be merged with
+  // an 'existing_value' of the key in the database.
+
+  // If the merged value is smaller in size that the 'existing_value',
+  // then this function can update the 'existing_value' buffer inplace and
+  // the corresponding 'existing_value'_size pointer, if it wishes to.
+  // The callback should return UpdateStatus::UPDATED_INPLACE.
+  // In this case. (In this case, the snapshot-semantics of the rocksdb
+  // Iterator is not atomic anymore).
+
+  // If the merged value is larger in size than the 'existing_value' or the
+  // application does not wish to modify the 'existing_value' buffer inplace,
+  // then the merged value should be returned via *merge_value. It is set by
+  // merging the 'existing_value' and the Put 'delta_value'. The callback should
+  // return UpdateStatus::UPDATED in this case. This merged value will be added
+  // to the memtable.
+
+  // If merging fails or the application does not wish to take any action,
+  // then the callback should return UpdateStatus::UPDATE_FAILED.
+
+  // Please remember that the original call from the application is Put(key,
+  // delta_value). So the transaction log (if enabled) will still contain (key,
+  // delta_value). The 'merged_value' is not stored in the transaction log.
+  // Hence the inplace_callback function should be consistent across db reopens.
+
+  // Default: nullptr
+  UpdateStatus (*inplace_callback)(char* existing_value,
+                                   uint32_t* existing_value_size,
+                                   Slice delta_value,
+                                   std::string* merged_value);
+
+  // if prefix_extractor is set and bloom_bits is not 0, create prefix bloom
+  // for memtable
+  uint32_t memtable_prefix_bloom_bits;
+
+  // number of hash probes per key
+  uint32_t memtable_prefix_bloom_probes;
+
+  // Page size for huge page TLB for bloom in memtable. If <=0, not allocate
+  // from huge page TLB but from malloc.
+  // Need to reserve huge pages for it to be allocated. For example:
+  //      sysctl -w vm.nr_hugepages=20
+  // See linux doc Documentation/vm/hugetlbpage.txt
+
+  size_t memtable_prefix_bloom_huge_page_tlb_size;
+
+  // Control locality of bloom filter probes to improve cache miss rate.
+  // This option only applies to memtable prefix bloom and plaintable
+  // prefix bloom. It essentially limits every bloom checking to one cache line.
+  // This optimization is turned off when set to 0, and positive number to turn
+  // it on.
+  // Default: 0
+  uint32_t bloom_locality;
+
+  // Maximum number of successive merge operations on a key in the memtable.
+  //
+  // When a merge operation is added to the memtable and the maximum number of
+  // successive merges is reached, the value of the key will be calculated and
+  // inserted into the memtable instead of the merge operation. This will
+  // ensure that there are never more than max_successive_merges merge
+  // operations in the memtable.
+  //
+  // Default: 0 (disabled)
+  size_t max_successive_merges;
+
+  // The number of partial merge operands to accumulate before partial
+  // merge will be performed. Partial merge will not be called
+  // if the list of values to merge is less than min_partial_merge_operands.
+  //
+  // If min_partial_merge_operands < 2, then it will be treated as 2.
+  //
+  // Default: 2
+  uint32_t min_partial_merge_operands;
+
+  // Create ColumnFamilyOptions with default values for all fields
+  ColumnFamilyOptions();
+  // Create ColumnFamilyOptions from Options
+  explicit ColumnFamilyOptions(const Options& options);
+
+  void Dump(Logger* log) const;
+};
+
+struct DBOptions {
+  // Some functions that make it easier to optimize RocksDB
+
+  // By default, RocksDB uses only one background thread for flush and
+  // compaction. Calling this function will set it up such that total of
+  // `total_threads` is used. Good value for `total_threads` is the number of
+  // cores. You almost definitely want to call this function if your system is
+  // bottlenecked by RocksDB.
+  DBOptions* IncreaseParallelism(int total_threads = 16);
+
+  // If true, the database will be created if it is missing.
+  // Default: false
+  bool create_if_missing;
+
+  // If true, an error is raised if the database already exists.
+  // Default: false
+  bool error_if_exists;
+
+  // If true, the implementation will do aggressive checking of the
+  // data it is processing and will stop early if it detects any
+  // errors.  This may have unforeseen ramifications: for example, a
+  // corruption of one DB entry may cause a large number of entries to
+  // become unreadable or for the entire DB to become unopenable.
+  // If any of the  writes to the database fails (Put, Delete, Merge, Write),
+  // the database will switch to read-only mode and fail all other
+  // Write operations.
+  // Default: true
+  bool paranoid_checks;
+
+  // Use the specified object to interact with the environment,
+  // e.g. to read/write files, schedule background work, etc.
+  // Default: Env::Default()
+  Env* env;
+
+  // Any internal progress/error information generated by the db will
+  // be written to info_log if it is non-nullptr, or to a file stored
+  // in the same directory as the DB contents if info_log is nullptr.
+  // Default: nullptr
+  shared_ptr<Logger> info_log;
+
+  InfoLogLevel info_log_level;
+
+  // Number of open files that can be used by the DB.  You may need to
+  // increase this if your database has a large working set. Value -1 means
+  // files opened are always kept open. You can estimate number of files based
+  // on target_file_size_base and target_file_size_multiplier for level-based
+  // compaction. For universal-style compaction, you can usually set it to -1.
+  // Default: 5000
+  int max_open_files;
+
+  // Once write-ahead logs exceed this size, we will start forcing the flush of
+  // column families whose memtables are backed by the oldest live WAL file
+  // (i.e. the ones that are causing all the space amplification). If set to 0
+  // (default), we will dynamically choose the WAL size limit to be
+  // [sum of all write_buffer_size * max_write_buffer_number] * 2
+  // Default: 0
+  uint64_t max_total_wal_size;
+
+  // If non-null, then we should collect metrics about database operations
+  // Statistics objects should not be shared between DB instances as
+  // it does not use any locks to prevent concurrent updates.
+  shared_ptr<Statistics> statistics;
+
+  // If true, then the contents of data files are not synced
+  // to stable storage. Their contents remain in the OS buffers till the
+  // OS decides to flush them. This option is good for bulk-loading
+  // of data. Once the bulk-loading is complete, please issue a
+  // sync to the OS to flush all dirty buffesrs to stable storage.
+  // Default: false
+  bool disableDataSync;
+
+  // If true, then every store to stable storage will issue a fsync.
+  // If false, then every store to stable storage will issue a fdatasync.
+  // This parameter should be set to true while storing data to
+  // filesystem like ext3 that can lose files after a reboot.
+  // Default: false
+  bool use_fsync;
+
+  // This number controls how often a new scribe log about
+  // db deploy stats is written out.
+  // -1 indicates no logging at all.
+  // Default value is 1800 (half an hour).
+  int db_stats_log_interval;
+
+  // This specifies the info LOG dir.
+  // If it is empty, the log files will be in the same dir as data.
+  // If it is non empty, the log files will be in the specified dir,
+  // and the db data dir's absolute path will be used as the log file
+  // name's prefix.
+  std::string db_log_dir;
+
+  // This specifies the absolute dir path for write-ahead logs (WAL).
+  // If it is empty, the log files will be in the same dir as data,
+  //   dbname is used as the data dir by default
+  // If it is non empty, the log files will be in kept the specified dir.
+  // When destroying the db,
+  //   all log files in wal_dir and the dir itself is deleted
+  std::string wal_dir;
+
+  // The periodicity when obsolete files get deleted. The default
+  // value is 6 hours. The files that get out of scope by compaction
+  // process will still get automatically delete on every compaction,
+  // regardless of this setting
+  uint64_t delete_obsolete_files_period_micros;
+
+  // Maximum number of concurrent background compaction jobs, submitted to
+  // the default LOW priority thread pool.
+  // If you're increasing this, also consider increasing number of threads in
+  // LOW priority thread pool. For more information, see
+  // Env::SetBackgroundThreads
+  // Default: 1
+  int max_background_compactions;
+
+  // Maximum number of concurrent background memtable flush jobs, submitted to
+  // the HIGH priority thread pool.
+  //
+  // By default, all background jobs (major compaction and memtable flush) go
+  // to the LOW priority pool. If this option is set to a positive number,
+  // memtable flush jobs will be submitted to the HIGH priority pool.
+  // It is important when the same Env is shared by multiple db instances.
+  // Without a separate pool, long running major compaction jobs could
+  // potentially block memtable flush jobs of other db instances, leading to
+  // unnecessary Put stalls.
+  //
+  // If you're increasing this, also consider increasing number of threads in
+  // HIGH priority thread pool. For more information, see
+  // Env::SetBackgroundThreads
+  // Default: 1
+  int max_background_flushes;
+
+  // Specify the maximal size of the info log file. If the log file
+  // is larger than `max_log_file_size`, a new info log file will
+  // be created.
+  // If max_log_file_size == 0, all logs will be written to one
+  // log file.
+  size_t max_log_file_size;
+
+  // Time for the info log file to roll (in seconds).
+  // If specified with non-zero value, log file will be rolled
+  // if it has been active longer than `log_file_time_to_roll`.
+  // Default: 0 (disabled)
+  size_t log_file_time_to_roll;
+
+  // Maximal info log files to be kept.
+  // Default: 1000
+  size_t keep_log_file_num;
+
+  // manifest file is rolled over on reaching this limit.
+  // The older manifest file be deleted.
+  // The default value is MAX_INT so that roll-over does not take place.
+  uint64_t max_manifest_file_size;
+
+  // Number of shards used for table cache.
+  int table_cache_numshardbits;
+
+  // During data eviction of table's LRU cache, it would be inefficient
+  // to strictly follow LRU because this piece of memory will not really
+  // be released unless its refcount falls to zero. Instead, make two
+  // passes: the first pass will release items with refcount = 1,
+  // and if not enough space releases after scanning the number of
+  // elements specified by this parameter, we will remove items in LRU
+  // order.
+  int table_cache_remove_scan_count_limit;
+
+  // The following two fields affect how archived logs will be deleted.
+  // 1. If both set to 0, logs will be deleted asap and will not get into
+  //    the archive.
+  // 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
+  //    WAL files will be checked every 10 min and if total size is greater
+  //    then WAL_size_limit_MB, they will be deleted starting with the
+  //    earliest until size_limit is met. All empty files will be deleted.
+  // 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
+  //    WAL files will be checked every WAL_ttl_secondsi / 2 and those that
+  //    are older than WAL_ttl_seconds will be deleted.
+  // 4. If both are not 0, WAL files will be checked every 10 min and both
+  //    checks will be performed with ttl being first.
+  uint64_t WAL_ttl_seconds;
+  uint64_t WAL_size_limit_MB;
+
+  // Number of bytes to preallocate (via fallocate) the manifest
+  // files.  Default is 4mb, which is reasonable to reduce random IO
+  // as well as prevent overallocation for mounts that preallocate
+  // large amounts of data (such as xfs's allocsize option).
+  size_t manifest_preallocation_size;
+
+  // Data being read from file storage may be buffered in the OS
+  // Default: true
+  bool allow_os_buffer;
+
+  // Allow the OS to mmap file for reading sst tables. Default: false
+  bool allow_mmap_reads;
+
+  // Allow the OS to mmap file for writing. Default: false
+  bool allow_mmap_writes;
+
+  // Disable child process inherit open files. Default: true
+  bool is_fd_close_on_exec;
+
+  // Skip log corruption error on recovery (If client is ok with
+  // losing most recent changes)
+  // Default: false
+  bool skip_log_error_on_recovery;
+
+  // if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
+  // Default: 3600 (1 hour)
+  unsigned int stats_dump_period_sec;
+
+  // If set true, will hint the underlying file system that the file
+  // access pattern is random, when a sst file is opened.
+  // Default: true
+  bool advise_random_on_open;
+
+  // Specify the file access pattern once a compaction is started.
+  // It will be applied to all input files of a compaction.
+  // Default: NORMAL
+  enum {
+    NONE,
+    NORMAL,
+    SEQUENTIAL,
+    WILLNEED
+  } access_hint_on_compaction_start;
+
+  // Use adaptive mutex, which spins in the user space before resorting
+  // to kernel. This could reduce context switch when the mutex is not
+  // heavily contended. However, if the mutex is hot, we could end up
+  // wasting spin time.
+  // Default: false
+  bool use_adaptive_mutex;
+
+  // Allows OS to incrementally sync files to disk while they are being
+  // written, asynchronously, in the background.
+  // Issue one request for every bytes_per_sync written. 0 turns it off.
+  // Default: 0
+  uint64_t bytes_per_sync;
+
+  // Allow RocksDB to use thread local storage to optimize performance.
+  // Default: true
+  bool allow_thread_local;
+
+  // Create DBOptions with default values for all fields
+  DBOptions();
+  // Create DBOptions from Options
+  explicit DBOptions(const Options& options);
+
+  void Dump(Logger* log) const;
+};
+
+// Options to control the behavior of a database (passed to DB::Open)
+struct Options : public DBOptions, public ColumnFamilyOptions {
+  // Create an Options object with default values for all fields.
+  Options() :
+    DBOptions(),
+    ColumnFamilyOptions() {}
+
+  Options(const DBOptions& db_options,
+          const ColumnFamilyOptions& column_family_options)
+      : DBOptions(db_options), ColumnFamilyOptions(column_family_options) {}
+
+  void Dump(Logger* log) const;
+
+  // Set appropriate parameters for bulk loading.
+  // The reason that this is a function that returns "this" instead of a
+  // constructor is to enable chaining of multiple similar calls in the future.
+  //
+
+  // All data will be in level 0 without any automatic compaction.
+  // It's recommended to manually call CompactRange(NULL, NULL) before reading
+  // from the database, because otherwise the read can be very slow.
+  Options* PrepareForBulkLoad();
+};
+
+//
+// An application can issue a read request (via Get/Iterators) and specify
+// if that read should process data that ALREADY resides on a specified cache
+// level. For example, if an application specifies kBlockCacheTier then the
+// Get call will process data that is already processed in the memtable or
+// the block cache. It will not page in data from the OS cache or data that
+// resides in storage.
+enum ReadTier {
+  kReadAllTier = 0x0,    // data in memtable, block cache, OS cache or storage
+  kBlockCacheTier = 0x1  // data in memtable or block cache
+};
+
+// Options that control read operations
+struct ReadOptions {
+  // If true, all data read from underlying storage will be
+  // verified against corresponding checksums.
+  // Default: true
+  bool verify_checksums;
+
+  // Should the "data block"/"index block"/"filter block" read for this
+  // iteration be cached in memory?
+  // Callers may wish to set this field to false for bulk scans.
+  // Default: true
+  bool fill_cache;
+
+  // If this option is set and memtable implementation allows, Seek
+  // might only return keys with the same prefix as the seek-key
+  //
+  // ! DEPRECATED: prefix_seek is on by default when prefix_extractor
+  // is configured
+  // bool prefix_seek;
+
+  // If "snapshot" is non-nullptr, read as of the supplied snapshot
+  // (which must belong to the DB that is being read and which must
+  // not have been released).  If "snapshot" is nullptr, use an impliicit
+  // snapshot of the state at the beginning of this read operation.
+  // Default: nullptr
+  const Snapshot* snapshot;
+
+  // If "prefix" is non-nullptr, and ReadOptions is being passed to
+  // db.NewIterator, only return results when the key begins with this
+  // prefix.  This field is ignored by other calls (e.g., Get).
+  // Options.prefix_extractor must also be set, and
+  // prefix_extractor.InRange(prefix) must be true.  The iterator
+  // returned by NewIterator when this option is set will behave just
+  // as if the underlying store did not contain any non-matching keys,
+  // with two exceptions.  Seek() only accepts keys starting with the
+  // prefix, and SeekToLast() is not supported.  prefix filter with this
+  // option will sometimes reduce the number of read IOPs.
+  // Default: nullptr
+  //
+  // ! DEPRECATED
+  // const Slice* prefix;
+
+  // Specify if this read request should process data that ALREADY
+  // resides on a particular cache. If the required data is not
+  // found at the specified cache, then Status::Incomplete is returned.
+  // Default: kReadAllTier
+  ReadTier read_tier;
+
+  // Specify to create a tailing iterator -- a special iterator that has a
+  // view of the complete database (i.e. it can also be used to read newly
+  // added data) and is optimized for sequential reads. It will return records
+  // that were inserted into the database after the creation of the iterator.
+  // Default: false
+  // Not supported in ROCKSDB_LITE mode!
+  bool tailing;
+
+  ReadOptions()
+      : verify_checksums(true),
+        fill_cache(true),
+        snapshot(nullptr),
+        read_tier(kReadAllTier),
+        tailing(false) {}
+  ReadOptions(bool cksum, bool cache)
+      : verify_checksums(cksum),
+        fill_cache(cache),
+        snapshot(nullptr),
+        read_tier(kReadAllTier),
+        tailing(false) {}
+};
+
+// Options that control write operations
+struct WriteOptions {
+  // If true, the write will be flushed from the operating system
+  // buffer cache (by calling WritableFile::Sync()) before the write
+  // is considered complete.  If this flag is true, writes will be
+  // slower.
+  //
+  // If this flag is false, and the machine crashes, some recent
+  // writes may be lost.  Note that if it is just the process that
+  // crashes (i.e., the machine does not reboot), no writes will be
+  // lost even if sync==false.
+  //
+  // In other words, a DB write with sync==false has similar
+  // crash semantics as the "write()" system call.  A DB write
+  // with sync==true has similar crash semantics to a "write()"
+  // system call followed by "fdatasync()".
+  //
+  // Default: false
+  bool sync;
+
+  // If true, writes will not first go to the write ahead log,
+  // and the write may got lost after a crash.
+  bool disableWAL;
+
+  WriteOptions() : sync(false), disableWAL(false) {}
+};
+
+// Options that control flush operations
+struct FlushOptions {
+  // If true, the flush will wait until the flush is done.
+  // Default: true
+  bool wait;
+
+  FlushOptions() : wait(true) {}
+};
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_
--- a/include/rocksdb/perf_context.h
+++ b/include/rocksdb/perf_context.h
@@ -0,0 +1,75 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_PERF_CONTEXT_H
+#define STORAGE_ROCKSDB_INCLUDE_PERF_CONTEXT_H
+
+#include <stdint.h>
+#include <string>
+
+namespace rocksdb {
+
+enum PerfLevel {
+  kDisable        = 0,  // disable perf stats
+  kEnableCount    = 1,  // enable only count stats
+  kEnableTime     = 2   // enable time stats too
+};
+
+// set the perf stats level
+void SetPerfLevel(PerfLevel level);
+
+// A thread local context for gathering performance counter efficiently
+// and transparently.
+
+struct PerfContext {
+
+  void Reset(); // reset all performance counters to zero
+
+  std::string ToString() const;
+
+  uint64_t user_key_comparison_count; // total number of user key comparisons
+  uint64_t block_cache_hit_count;     // total number of block cache hits
+  uint64_t block_read_count;          // total number of block reads (with IO)
+  uint64_t block_read_byte;           // total number of bytes from block reads
+  uint64_t block_read_time;           // total time spent on block reads
+  uint64_t block_checksum_time;       // total time spent on block checksum
+  uint64_t block_decompress_time;     // total time spent on block decompression
+  // total number of internal keys skipped over during iteration (overwritten or
+  // deleted, to be more specific, hidden by a put or delete of the same key)
+  uint64_t internal_key_skipped_count;
+  // total number of deletes skipped over during iteration
+  uint64_t internal_delete_skipped_count;
+
+  uint64_t get_snapshot_time;          // total time spent on getting snapshot
+  uint64_t get_from_memtable_time;     // total time spent on querying memtables
+  uint64_t get_from_memtable_count;    // number of mem tables queried
+  // total time spent after Get() finds a key
+  uint64_t get_post_process_time;
+  uint64_t get_from_output_files_time; // total time reading from output files
+  // total time spent on seeking child iters
+  uint64_t seek_child_seek_time;
+  // number of seek issued in child iterators
+  uint64_t seek_child_seek_count;
+  uint64_t seek_min_heap_time;         // total time spent on the merge heap
+  // total time spent on seeking the internal entries
+  uint64_t seek_internal_seek_time;
+  // total time spent on iterating internal entries to find the next user entry
+  uint64_t find_next_user_entry_time;
+  // total time spent on pre or post processing when writing a record
+  uint64_t write_pre_and_post_process_time;
+  uint64_t write_wal_time;            // total time spent on writing to WAL
+  // total time spent on writing to mem tables
+  uint64_t write_memtable_time;
+};
+
+#if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE)
+extern PerfContext perf_context;
+#else
+extern __thread PerfContext perf_context;
+#endif
+
+}
+
+#endif
--- a/include/rocksdb/slice.h
+++ b/include/rocksdb/slice.h
@@ -0,0 +1,136 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Slice is a simple structure containing a pointer into some external
+// storage and a size.  The user of a Slice must ensure that the slice
+// is not used after the corresponding external storage has been
+// deallocated.
+//
+// Multiple threads can invoke const methods on a Slice without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same Slice must use
+// external synchronization.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_SLICE_H_
+#define STORAGE_ROCKSDB_INCLUDE_SLICE_H_
+
+#include <assert.h>
+#include <stddef.h>
+#include <string.h>
+#include <string>
+
+namespace rocksdb {
+
+class Slice {
+ public:
+  // Create an empty slice.
+  Slice() : data_(""), size_(0) { }
+
+  // Create a slice that refers to d[0,n-1].
+  Slice(const char* d, size_t n) : data_(d), size_(n) { }
+
+  // Create a slice that refers to the contents of "s"
+  /* implicit */
+  Slice(const std::string& s) : data_(s.data()), size_(s.size()) { }
+
+  // Create a slice that refers to s[0,strlen(s)-1]
+  /* implicit */
+  Slice(const char* s) : data_(s), size_(strlen(s)) { }
+
+  // Return a pointer to the beginning of the referenced data
+  const char* data() const { return data_; }
+
+  // Return the length (in bytes) of the referenced data
+  size_t size() const { return size_; }
+
+  // Return true iff the length of the referenced data is zero
+  bool empty() const { return size_ == 0; }
+
+  // Return the ith byte in the referenced data.
+  // REQUIRES: n < size()
+  char operator[](size_t n) const {
+    assert(n < size());
+    return data_[n];
+  }
+
+  // Change this slice to refer to an empty array
+  void clear() { data_ = ""; size_ = 0; }
+
+  // Drop the first "n" bytes from this slice.
+  void remove_prefix(size_t n) {
+    assert(n <= size());
+    data_ += n;
+    size_ -= n;
+  }
+
+  // Return a string that contains the copy of the referenced data.
+  std::string ToString(bool hex = false) const {
+    if (hex) {
+      std::string result;
+      char buf[10];
+      for (size_t i = 0; i < size_; i++) {
+        snprintf(buf, 10, "%02X", (unsigned char)data_[i]);
+        result += buf;
+      }
+      return result;
+    } else {
+      return std::string(data_, size_);
+    }
+  }
+
+  // Three-way comparison.  Returns value:
+  //   <  0 iff "*this" <  "b",
+  //   == 0 iff "*this" == "b",
+  //   >  0 iff "*this" >  "b"
+  int compare(const Slice& b) const;
+
+  // Return true iff "x" is a prefix of "*this"
+  bool starts_with(const Slice& x) const {
+    return ((size_ >= x.size_) &&
+            (memcmp(data_, x.data_, x.size_) == 0));
+  }
+
+ // private: make these public for rocksdbjni access
+  const char* data_;
+  size_t size_;
+
+  // Intentionally copyable
+};
+
+// A set of Slices that are virtually concatenated together.  'parts' points
+// to an array of Slices.  The number of elements in the array is 'num_parts'.
+struct SliceParts {
+  SliceParts(const Slice* _parts, int _num_parts) :
+      parts(_parts), num_parts(_num_parts) { }
+
+  const Slice* parts;
+  int num_parts;
+};
+
+inline bool operator==(const Slice& x, const Slice& y) {
+  return ((x.size() == y.size()) &&
+          (memcmp(x.data(), y.data(), x.size()) == 0));
+}
+
+inline bool operator!=(const Slice& x, const Slice& y) {
+  return !(x == y);
+}
+
+inline int Slice::compare(const Slice& b) const {
+  const int min_len = (size_ < b.size_) ? size_ : b.size_;
+  int r = memcmp(data_, b.data_, min_len);
+  if (r == 0) {
+    if (size_ < b.size_) r = -1;
+    else if (size_ > b.size_) r = +1;
+  }
+  return r;
+}
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_SLICE_H_
--- a/include/rocksdb/slice_transform.h
+++ b/include/rocksdb/slice_transform.h
@@ -0,0 +1,47 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Class for specifying user-defined functions which perform a
+// transformation on a slice.  It is not required that every slice
+// belong to the domain and/or range of a function.  Subclasses should
+// define InDomain and InRange to determine which slices are in either
+// of these sets respectively.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_
+#define STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_
+
+#include <string>
+
+namespace rocksdb {
+
+class Slice;
+
+class SliceTransform {
+ public:
+  virtual ~SliceTransform() {};
+
+  // Return the name of this transformation.
+  virtual const char* Name() const = 0;
+
+  // transform a src in domain to a dst in the range
+  virtual Slice Transform(const Slice& src) const = 0;
+
+  // determine whether this is a valid src upon the function applies
+  virtual bool InDomain(const Slice& src) const = 0;
+
+  // determine whether dst=Transform(src) for some src
+  virtual bool InRange(const Slice& dst) const = 0;
+};
+
+extern const SliceTransform* NewFixedPrefixTransform(size_t prefix_len);
+
+extern const SliceTransform* NewNoopTransform();
+
+}
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_
--- a/include/rocksdb/statistics.h
+++ b/include/rocksdb/statistics.h
@@ -0,0 +1,268 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_
+#define STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_
+
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <string>
+#include <memory>
+#include <vector>
+
+namespace rocksdb {
+
+/**
+ * Keep adding ticker's here.
+ *  1. Any ticker should be added before TICKER_ENUM_MAX.
+ *  2. Add a readable string in TickersNameMap below for the newly added ticker.
+ */
+enum Tickers {
+  // total block cache misses
+  // REQUIRES: BLOCK_CACHE_MISS == BLOCK_CACHE_INDEX_MISS +
+  //                               BLOCK_CACHE_FILTER_MISS +
+  //                               BLOCK_CACHE_DATA_MISS;
+  BLOCK_CACHE_MISS,
+  // total block cache hit
+  // REQUIRES: BLOCK_CACHE_HIT == BLOCK_CACHE_INDEX_HIT +
+  //                              BLOCK_CACHE_FILTER_HIT +
+  //                              BLOCK_CACHE_DATA_HIT;
+  BLOCK_CACHE_HIT,
+  // # of blocks added to block cache.
+  BLOCK_CACHE_ADD,
+  // # of times cache miss when accessing index block from block cache.
+  BLOCK_CACHE_INDEX_MISS,
+  // # of times cache hit when accessing index block from block cache.
+  BLOCK_CACHE_INDEX_HIT,
+  // # of times cache miss when accessing filter block from block cache.
+  BLOCK_CACHE_FILTER_MISS,
+  // # of times cache hit when accessing filter block from block cache.
+  BLOCK_CACHE_FILTER_HIT,
+  // # of times cache miss when accessing data block from block cache.
+  BLOCK_CACHE_DATA_MISS,
+  // # of times cache hit when accessing data block from block cache.
+  BLOCK_CACHE_DATA_HIT,
+  // # of times bloom filter has avoided file reads.
+  BLOOM_FILTER_USEFUL,
+
+  // # of memtable hits.
+  MEMTABLE_HIT,
+  // # of memtable misses.
+  MEMTABLE_MISS,
+
+  /**
+   * COMPACTION_KEY_DROP_* count the reasons for key drop during compaction
+   * There are 3 reasons currently.
+   */
+  COMPACTION_KEY_DROP_NEWER_ENTRY,  // key was written with a newer value.
+  COMPACTION_KEY_DROP_OBSOLETE,     // The key is obsolete.
+  COMPACTION_KEY_DROP_USER,  // user compaction function has dropped the key.
+
+  // Number of keys written to the database via the Put and Write call's
+  NUMBER_KEYS_WRITTEN,
+  // Number of Keys read,
+  NUMBER_KEYS_READ,
+  // Number keys updated, if inplace update is enabled
+  NUMBER_KEYS_UPDATED,
+  // Bytes written / read
+  BYTES_WRITTEN,
+  BYTES_READ,
+  NO_FILE_CLOSES,
+  NO_FILE_OPENS,
+  NO_FILE_ERRORS,
+  // Time system had to wait to do LO-L1 compactions
+  STALL_L0_SLOWDOWN_MICROS,
+  // Time system had to wait to move memtable to L1.
+  STALL_MEMTABLE_COMPACTION_MICROS,
+  // write throttle because of too many files in L0
+  STALL_L0_NUM_FILES_MICROS,
+  RATE_LIMIT_DELAY_MILLIS,
+  NO_ITERATORS,  // number of iterators currently open
+
+  // Number of MultiGet calls, keys read, and bytes read
+  NUMBER_MULTIGET_CALLS,
+  NUMBER_MULTIGET_KEYS_READ,
+  NUMBER_MULTIGET_BYTES_READ,
+
+  // Number of deletes records that were not required to be
+  // written to storage because key does not exist
+  NUMBER_FILTERED_DELETES,
+  NUMBER_MERGE_FAILURES,
+  SEQUENCE_NUMBER,
+
+  // number of times bloom was checked before creating iterator on a
+  // file, and the number of times the check was useful in avoiding
+  // iterator creation (and thus likely IOPs).
+  BLOOM_FILTER_PREFIX_CHECKED,
+  BLOOM_FILTER_PREFIX_USEFUL,
+
+  // Number of times we had to reseek inside an iteration to skip
+  // over large number of keys with same userkey.
+  NUMBER_OF_RESEEKS_IN_ITERATION,
+
+  // Record the number of calls to GetUpadtesSince. Useful to keep track of
+  // transaction log iterator refreshes
+  GET_UPDATES_SINCE_CALLS,
+  BLOCK_CACHE_COMPRESSED_MISS,  // miss in the compressed block cache
+  BLOCK_CACHE_COMPRESSED_HIT,   // hit in the compressed block cache
+  WAL_FILE_SYNCED,              // Number of times WAL sync is done
+  WAL_FILE_BYTES,               // Number of bytes written to WAL
+
+  // Writes can be processed by requesting thread or by the thread at the
+  // head of the writers queue.
+  WRITE_DONE_BY_SELF,
+  WRITE_DONE_BY_OTHER,
+  WRITE_WITH_WAL,       // Number of Write calls that request WAL
+  COMPACT_READ_BYTES,   // Bytes read during compaction
+  COMPACT_WRITE_BYTES,  // Bytes written during compaction
+
+  // Number of table's properties loaded directly from file, without creating
+  // table reader object.
+  NUMBER_DIRECT_LOAD_TABLE_PROPERTIES,
+  NUMBER_SUPERVERSION_ACQUIRES,
+  NUMBER_SUPERVERSION_RELEASES,
+  NUMBER_SUPERVERSION_CLEANUPS,
+  TICKER_ENUM_MAX
+};
+
+// The order of items listed in  Tickers should be the same as
+// the order listed in TickersNameMap
+const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
+    {BLOCK_CACHE_MISS, "rocksdb.block.cache.miss"},
+    {BLOCK_CACHE_HIT, "rocksdb.block.cache.hit"},
+    {BLOCK_CACHE_ADD, "rocksdb.block.cache.add"},
+    {BLOCK_CACHE_INDEX_MISS, "rocksdb.block.cache.index.miss"},
+    {BLOCK_CACHE_INDEX_HIT, "rocksdb.block.cache.index.hit"},
+    {BLOCK_CACHE_FILTER_MISS, "rocksdb.block.cache.filter.miss"},
+    {BLOCK_CACHE_FILTER_HIT, "rocksdb.block.cache.filter.hit"},
+    {BLOCK_CACHE_DATA_MISS, "rocksdb.block.cache.data.miss"},
+    {BLOCK_CACHE_DATA_HIT, "rocksdb.block.cache.data.hit"},
+    {BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful"},
+    {MEMTABLE_HIT, "rocksdb.memtable.hit"},
+    {MEMTABLE_MISS, "rocksdb.memtable.miss"},
+    {COMPACTION_KEY_DROP_NEWER_ENTRY, "rocksdb.compaction.key.drop.new"},
+    {COMPACTION_KEY_DROP_OBSOLETE, "rocksdb.compaction.key.drop.obsolete"},
+    {COMPACTION_KEY_DROP_USER, "rocksdb.compaction.key.drop.user"},
+    {NUMBER_KEYS_WRITTEN, "rocksdb.number.keys.written"},
+    {NUMBER_KEYS_READ, "rocksdb.number.keys.read"},
+    {NUMBER_KEYS_UPDATED, "rocksdb.number.keys.updated"},
+    {BYTES_WRITTEN, "rocksdb.bytes.written"},
+    {BYTES_READ, "rocksdb.bytes.read"},
+    {NO_FILE_CLOSES, "rocksdb.no.file.closes"},
+    {NO_FILE_OPENS, "rocksdb.no.file.opens"},
+    {NO_FILE_ERRORS, "rocksdb.no.file.errors"},
+    {STALL_L0_SLOWDOWN_MICROS, "rocksdb.l0.slowdown.micros"},
+    {STALL_MEMTABLE_COMPACTION_MICROS, "rocksdb.memtable.compaction.micros"},
+    {STALL_L0_NUM_FILES_MICROS, "rocksdb.l0.num.files.stall.micros"},
+    {RATE_LIMIT_DELAY_MILLIS, "rocksdb.rate.limit.delay.millis"},
+    {NO_ITERATORS, "rocksdb.num.iterators"},
+    {NUMBER_MULTIGET_CALLS, "rocksdb.number.multiget.get"},
+    {NUMBER_MULTIGET_KEYS_READ, "rocksdb.number.multiget.keys.read"},
+    {NUMBER_MULTIGET_BYTES_READ, "rocksdb.number.multiget.bytes.read"},
+    {NUMBER_FILTERED_DELETES, "rocksdb.number.deletes.filtered"},
+    {NUMBER_MERGE_FAILURES, "rocksdb.number.merge.failures"},
+    {SEQUENCE_NUMBER, "rocksdb.sequence.number"},
+    {BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked"},
+    {BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful"},
+    {NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration"},
+    {GET_UPDATES_SINCE_CALLS, "rocksdb.getupdatessince.calls"},
+    {BLOCK_CACHE_COMPRESSED_MISS, "rocksdb.block.cachecompressed.miss"},
+    {BLOCK_CACHE_COMPRESSED_HIT, "rocksdb.block.cachecompressed.hit"},
+    {WAL_FILE_SYNCED, "rocksdb.wal.synced"},
+    {WAL_FILE_BYTES, "rocksdb.wal.bytes"},
+    {WRITE_DONE_BY_SELF, "rocksdb.write.self"},
+    {WRITE_DONE_BY_OTHER, "rocksdb.write.other"},
+    {WRITE_WITH_WAL, "rocksdb.write.wal"},
+    {COMPACT_READ_BYTES, "rocksdb.compact.read.bytes"},
+    {COMPACT_WRITE_BYTES, "rocksdb.compact.write.bytes"},
+    {NUMBER_DIRECT_LOAD_TABLE_PROPERTIES,
+     "rocksdb.number.direct.load.table.properties"},
+    {NUMBER_SUPERVERSION_ACQUIRES, "rocksdb.number.superversion_acquires"},
+    {NUMBER_SUPERVERSION_RELEASES, "rocksdb.number.superversion_releases"},
+    {NUMBER_SUPERVERSION_CLEANUPS, "rocksdb.number.superversion_cleanups"},
+};
+
+/**
+ * Keep adding histogram's here.
+ * Any histogram whould have value less than HISTOGRAM_ENUM_MAX
+ * Add a new Histogram by assigning it the current value of HISTOGRAM_ENUM_MAX
+ * Add a string representation in HistogramsNameMap below
+ * And increment HISTOGRAM_ENUM_MAX
+ */
+enum Histograms {
+  DB_GET,
+  DB_WRITE,
+  COMPACTION_TIME,
+  TABLE_SYNC_MICROS,
+  COMPACTION_OUTFILE_SYNC_MICROS,
+  WAL_FILE_SYNC_MICROS,
+  MANIFEST_FILE_SYNC_MICROS,
+  // TIME SPENT IN IO DURING TABLE OPEN
+  TABLE_OPEN_IO_MICROS,
+  DB_MULTIGET,
+  READ_BLOCK_COMPACTION_MICROS,
+  READ_BLOCK_GET_MICROS,
+  WRITE_RAW_BLOCK_MICROS,
+
+  STALL_L0_SLOWDOWN_COUNT,
+  STALL_MEMTABLE_COMPACTION_COUNT,
+  STALL_L0_NUM_FILES_COUNT,
+  HARD_RATE_LIMIT_DELAY_COUNT,
+  SOFT_RATE_LIMIT_DELAY_COUNT,
+  NUM_FILES_IN_SINGLE_COMPACTION,
+  HISTOGRAM_ENUM_MAX,
+};
+
+const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
+  { DB_GET, "rocksdb.db.get.micros" },
+  { DB_WRITE, "rocksdb.db.write.micros" },
+  { COMPACTION_TIME, "rocksdb.compaction.times.micros" },
+  { TABLE_SYNC_MICROS, "rocksdb.table.sync.micros" },
+  { COMPACTION_OUTFILE_SYNC_MICROS, "rocksdb.compaction.outfile.sync.micros" },
+  { WAL_FILE_SYNC_MICROS, "rocksdb.wal.file.sync.micros" },
+  { MANIFEST_FILE_SYNC_MICROS, "rocksdb.manifest.file.sync.micros" },
+  { TABLE_OPEN_IO_MICROS, "rocksdb.table.open.io.micros" },
+  { DB_MULTIGET, "rocksdb.db.multiget.micros" },
+  { READ_BLOCK_COMPACTION_MICROS, "rocksdb.read.block.compaction.micros" },
+  { READ_BLOCK_GET_MICROS, "rocksdb.read.block.get.micros" },
+  { WRITE_RAW_BLOCK_MICROS, "rocksdb.write.raw.block.micros" },
+  { STALL_L0_SLOWDOWN_COUNT, "rocksdb.l0.slowdown.count"},
+  { STALL_MEMTABLE_COMPACTION_COUNT, "rocksdb.memtable.compaction.count"},
+  { STALL_L0_NUM_FILES_COUNT, "rocksdb.num.files.stall.count"},
+  { HARD_RATE_LIMIT_DELAY_COUNT, "rocksdb.hard.rate.limit.delay.count"},
+  { SOFT_RATE_LIMIT_DELAY_COUNT, "rocksdb.soft.rate.limit.delay.count"},
+  { NUM_FILES_IN_SINGLE_COMPACTION, "rocksdb.numfiles.in.singlecompaction" },
+};
+
+struct HistogramData {
+  double median;
+  double percentile95;
+  double percentile99;
+  double average;
+  double standard_deviation;
+};
+
+// Analyze the performance of a db
+class Statistics {
+ public:
+  virtual ~Statistics() {}
+
+  virtual long getTickerCount(Tickers tickerType) = 0;
+  virtual void recordTick(Tickers tickerType, uint64_t count = 0) = 0;
+  virtual void setTickerCount(Tickers tickerType, uint64_t count) = 0;
+  virtual void measureTime(Histograms histogramType, uint64_t time) = 0;
+
+  virtual void histogramData(Histograms type, HistogramData* const data) = 0;
+  // String representation of the statistic object.
+  std::string ToString();
+};
+
+// Create a concrete DBStatistics object
+std::shared_ptr<Statistics> CreateDBStatistics();
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_
--- a/include/rocksdb/status.h
+++ b/include/rocksdb/status.h
@@ -0,0 +1,145 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A Status encapsulates the result of an operation.  It may indicate success,
+// or it may indicate an error with an associated error message.
+//
+// Multiple threads can invoke const methods on a Status without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same Status must use
+// external synchronization.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_STATUS_H_
+#define STORAGE_ROCKSDB_INCLUDE_STATUS_H_
+
+#include <string>
+#include "rocksdb/slice.h"
+
+namespace rocksdb {
+
+class Status {
+ public:
+  // Create a success status.
+  Status() : code_(kOk), state_(nullptr) { }
+  ~Status() { delete[] state_; }
+
+  // Copy the specified status.
+  Status(const Status& s);
+  void operator=(const Status& s);
+
+  // Return a success status.
+  static Status OK() { return Status(); }
+
+  // Return error status of an appropriate type.
+  static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kNotFound, msg, msg2);
+  }
+  // Fast path for not found without malloc;
+  static Status NotFound() {
+    return Status(kNotFound);
+  }
+  static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kCorruption, msg, msg2);
+  }
+  static Status NotSupported(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kNotSupported, msg, msg2);
+  }
+  static Status InvalidArgument(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kInvalidArgument, msg, msg2);
+  }
+  static Status IOError(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kIOError, msg, msg2);
+  }
+  static Status MergeInProgress(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kMergeInProgress, msg, msg2);
+  }
+  static Status Incomplete(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kIncomplete, msg, msg2);
+  }
+  static Status ShutdownInProgress(const Slice& msg,
+                                   const Slice& msg2 = Slice()) {
+    return Status(kShutdownInProgress, msg, msg2);
+  }
+
+  // Returns true iff the status indicates success.
+  bool ok() const { return code() == kOk; }
+
+  // Returns true iff the status indicates a NotFound error.
+  bool IsNotFound() const { return code() == kNotFound; }
+
+  // Returns true iff the status indicates a Corruption error.
+  bool IsCorruption() const { return code() == kCorruption; }
+
+  // Returns true iff the status indicates a NotSupported error.
+  bool IsNotSupported() const { return code() == kNotSupported; }
+
+  // Returns true iff the status indicates an InvalidArgument error.
+  bool IsInvalidArgument() const { return code() == kInvalidArgument; }
+
+  // Returns true iff the status indicates an IOError.
+  bool IsIOError() const { return code() == kIOError; }
+
+  // Returns true iff the status indicates an MergeInProgress.
+  bool IsMergeInProgress() const { return code() == kMergeInProgress; }
+
+  // Returns true iff the status indicates Incomplete
+  bool IsIncomplete() const { return code() == kIncomplete; }
+
+  // Returns true iff the status indicates Incomplete
+  bool IsShutdownInProgress() const { return code() == kShutdownInProgress; }
+
+  // Return a string representation of this status suitable for printing.
+  // Returns the string "OK" for success.
+  std::string ToString() const;
+
+  enum Code {
+    kOk = 0,
+    kNotFound = 1,
+    kCorruption = 2,
+    kNotSupported = 3,
+    kInvalidArgument = 4,
+    kIOError = 5,
+    kMergeInProgress = 6,
+    kIncomplete = 7,
+    kShutdownInProgress = 8
+  };
+
+  Code code() const {
+    return code_;
+  }
+ private:
+  // A nullptr state_ (which is always the case for OK) means the message
+  // is empty.
+  // of the following form:
+  //    state_[0..3] == length of message
+  //    state_[4..]  == message
+  Code code_;
+  const char* state_;
+
+  explicit Status(Code code) : code_(code), state_(nullptr) { }
+  Status(Code code, const Slice& msg, const Slice& msg2);
+  static const char* CopyState(const char* s);
+};
+
+inline Status::Status(const Status& s) {
+  code_ = s.code_;
+  state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_);
+}
+inline void Status::operator=(const Status& s) {
+  // The following condition catches both aliasing (when this == &s),
+  // and the common case where both s and *this are ok.
+  code_ = s.code_;
+  if (state_ != s.state_) {
+    delete[] state_;
+    state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_);
+  }
+}
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_STATUS_H_
--- a/include/rocksdb/table.h
+++ b/include/rocksdb/table.h
@@ -0,0 +1,206 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Currently we support two types of tables: plain table and block-based table.
+//   1. Block-based table: this is the default table type that we inherited from
+//      LevelDB, which was designed for storing data in hard disk or flash
+//      device.
+//   2. Plain table: it is one of RocksDB's SST file format optimized
+//      for low query latency on pure-memory or really low-latency media.
+//
+// A tutorial of rocksdb table formats is available here:
+//   https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats
+//
+// Example code is also available
+//   https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats#wiki-examples
+
+#pragma once
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+// -- Block-based Table
+class FlushBlockPolicyFactory;
+class RandomAccessFile;
+class TableBuilder;
+class TableReader;
+class WritableFile;
+struct EnvOptions;
+struct Options;
+
+using std::unique_ptr;
+
+enum ChecksumType : char {
+  kNoChecksum = 0x0,  // not yet supported. Will fail
+  kCRC32c = 0x1,
+  kxxHash = 0x2,
+};
+
+// For advanced user only
+struct BlockBasedTableOptions {
+  // @flush_block_policy_factory creates the instances of flush block policy.
+  // which provides a configurable way to determine when to flush a block in
+  // the block based tables.  If not set, table builder will use the default
+  // block flush policy, which cut blocks by block size (please refer to
+  // `FlushBlockBySizePolicy`).
+  std::shared_ptr<FlushBlockPolicyFactory> flush_block_policy_factory;
+
+  // TODO(kailiu) Temporarily disable this feature by making the default value
+  // to be false.
+  //
+  // Indicating if we'd put index/filter blocks to the block cache.
+  // If not specified, each "table reader" object will pre-load index/filter
+  // block during table initialization.
+  bool cache_index_and_filter_blocks = false;
+
+  // The index type that will be used for this table.
+  enum IndexType : char {
+    // A space efficient index block that is optimized for
+    // binary-search-based index.
+    kBinarySearch,
+
+    // The hash index, if enabled, will do the hash lookup when
+    // `Options.prefix_extractor` is provided.
+    kHashSearch,
+  };
+
+  IndexType index_type = kBinarySearch;
+
+  // Use the specified checksum type. Newly created table files will be
+  // protected with this checksum type. Old table files will still be readable,
+  // even though they have different checksum type.
+  ChecksumType checksum = kCRC32c;
+};
+
+// Table Properties that are specific to block-based table properties.
+struct BlockBasedTablePropertyNames {
+  // value of this propertis is a fixed int32 number.
+  static const std::string kIndexType;
+};
+
+// Create default block based table factory.
+extern TableFactory* NewBlockBasedTableFactory(
+    const BlockBasedTableOptions& table_options = BlockBasedTableOptions());
+
+#ifndef ROCKSDB_LITE
+// -- Plain Table with prefix-only seek
+// For this factory, you need to set Options.prefix_extrator properly to make it
+// work. Look-up will starts with prefix hash lookup for key prefix. Inside the
+// hash bucket found, a binary search is executed for hash conflicts. Finally,
+// a linear search is used.
+// @user_key_len: plain table has optimization for fix-sized keys, which can be
+//                specified via user_key_len.  Alternatively, you can pass
+//                `kPlainTableVariableLength` if your keys have variable
+//                lengths.
+// @bloom_bits_per_key: the number of bits used for bloom filer per prefix. You
+//                      may disable it by passing a zero.
+// @hash_table_ratio: the desired utilization of the hash table used for prefix
+//                    hashing. hash_table_ratio = number of prefixes / #buckets
+//                    in the hash table
+// @index_sparseness: inside each prefix, need to build one index record for how
+//                    many keys for binary search inside each hash bucket.
+// @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc.
+//                      Otherwise from huge page TLB. The user needs to reserve
+//                      huge pages for it to be allocated, like:
+//                          sysctl -w vm.nr_hugepages=20
+//                      See linux doc Documentation/vm/hugetlbpage.txt
+
+const uint32_t kPlainTableVariableLength = 0;
+extern TableFactory* NewPlainTableFactory(uint32_t user_key_len =
+                                              kPlainTableVariableLength,
+                                          int bloom_bits_per_prefix = 10,
+                                          double hash_table_ratio = 0.75,
+                                          size_t index_sparseness = 16,
+                                          size_t huge_page_tlb_size = 0);
+
+// -- Plain Table
+// This factory of plain table ignores Options.prefix_extractor and assumes no
+// hashable prefix available to the key structure. Lookup will be based on
+// binary search index only. Total order seek() can be issued.
+// @user_key_len: plain table has optimization for fix-sized keys, which can be
+//                specified via user_key_len.  Alternatively, you can pass
+//                `kPlainTableVariableLength` if your keys have variable
+//                lengths.
+// @bloom_bits_per_key: the number of bits used for bloom filer per key. You may
+//                  disable it by passing a zero.
+// @index_sparseness: need to build one index record for how many keys for
+//                    binary search.
+// @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc.
+//                      Otherwise from huge page TLB. The user needs to reserve
+//                      huge pages for it to be allocated, like:
+//                          sysctl -w vm.nr_hugepages=20
+//                      See linux doc Documentation/vm/hugetlbpage.txt
+extern TableFactory* NewTotalOrderPlainTableFactory(
+    uint32_t user_key_len = kPlainTableVariableLength,
+    int bloom_bits_per_key = 0, size_t index_sparseness = 16,
+    size_t huge_page_tlb_size = 0);
+
+#endif  // ROCKSDB_LITE
+
+// A base class for table factories.
+class TableFactory {
+ public:
+  virtual ~TableFactory() {}
+
+  // The type of the table.
+  //
+  // The client of this package should switch to a new name whenever
+  // the table format implementation changes.
+  //
+  // Names starting with "rocksdb." are reserved and should not be used
+  // by any clients of this package.
+  virtual const char* Name() const = 0;
+
+  // Returns a Table object table that can fetch data from file specified
+  // in parameter file. It's the caller's responsibility to make sure
+  // file is in the correct format.
+  //
+  // NewTableReader() is called in two places:
+  // (1) TableCache::FindTable() calls the function when table cache miss
+  //     and cache the table object returned.
+  // (1) SstFileReader (for SST Dump) opens the table and dump the table
+  //     contents using the interator of the table.
+  // options and soptions are options. options is the general options.
+  // Multiple configured can be accessed from there, including and not
+  // limited to block cache and key comparators.
+  // file is a file handler to handle the file for the table
+  // file_size is the physical file size of the file
+  // table_reader is the output table reader
+  virtual Status NewTableReader(
+      const Options& options, const EnvOptions& soptions,
+      const InternalKeyComparator& internal_comparator,
+      unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
+      unique_ptr<TableReader>* table_reader) const = 0;
+
+  // Return a table builder to write to a file for this table type.
+  //
+  // It is called in several places:
+  // (1) When flushing memtable to a level-0 output file, it creates a table
+  //     builder (In DBImpl::WriteLevel0Table(), by calling BuildTable())
+  // (2) During compaction, it gets the builder for writing compaction output
+  //     files in DBImpl::OpenCompactionOutputFile().
+  // (3) When recovering from transaction logs, it creates a table builder to
+  //     write to a level-0 output file (In DBImpl::WriteLevel0TableForRecovery,
+  //     by calling BuildTable())
+  // (4) When running Repairer, it creates a table builder to convert logs to
+  //     SST files (In Repairer::ConvertLogToTable() by calling BuildTable())
+  //
+  // options is the general options. Multiple configured can be acceseed from
+  // there, including and not limited to compression options.
+  // file is a handle of a writable file. It is the caller's responsibility to
+  // keep the file open and close the file after closing the table builder.
+  // compression_type is the compression type to use in this table.
+  virtual TableBuilder* NewTableBuilder(
+      const Options& options, const InternalKeyComparator& internal_comparator,
+      WritableFile* file, CompressionType compression_type) const = 0;
+};
+
+}  // namespace rocksdb
--- a/include/rocksdb/table_properties.h
+++ b/include/rocksdb/table_properties.h
@@ -0,0 +1,127 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <string>
+#include <map>
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+// -- Table Properties
+// Other than basic table properties, each table may also have the user
+// collected properties.
+// The value of the user-collected properties are encoded as raw bytes --
+// users have to interprete these values by themselves.
+// Note: To do prefix seek/scan in `UserCollectedProperties`, you can do
+// something similar to:
+//
+// UserCollectedProperties props = ...;
+// for (auto pos = props.lower_bound(prefix);
+//      pos != props.end() && pos->first.compare(0, prefix.size(), prefix) == 0;
+//      ++pos) {
+//   ...
+// }
+typedef std::map<const std::string, std::string> UserCollectedProperties;
+
+// TableProperties contains a bunch of read-only properties of its associated
+// table.
+struct TableProperties {
+ public:
+  // the total size of all data blocks.
+  uint64_t data_size = 0;
+  // the size of index block.
+  uint64_t index_size = 0;
+  // the size of filter block.
+  uint64_t filter_size = 0;
+  // total raw key size
+  uint64_t raw_key_size = 0;
+  // total raw value size
+  uint64_t raw_value_size = 0;
+  // the number of blocks in this table
+  uint64_t num_data_blocks = 0;
+  // the number of entries in this table
+  uint64_t num_entries = 0;
+  // format version, reserved for backward compatibility
+  uint64_t format_version = 0;
+  // If 0, key is variable length. Otherwise number of bytes for each key.
+  uint64_t fixed_key_len = 0;
+
+  // The name of the filter policy used in this table.
+  // If no filter policy is used, `filter_policy_name` will be an empty string.
+  std::string filter_policy_name;
+
+  // user collected properties
+  UserCollectedProperties user_collected_properties;
+
+  // convert this object to a human readable form
+  //   @prop_delim: delimiter for each property.
+  std::string ToString(const std::string& prop_delim = "; ",
+                       const std::string& kv_delim = "=") const;
+};
+
+// table properties' human-readable names in the property block.
+struct TablePropertiesNames {
+  static const std::string kDataSize;
+  static const std::string kIndexSize;
+  static const std::string kFilterSize;
+  static const std::string kRawKeySize;
+  static const std::string kRawValueSize;
+  static const std::string kNumDataBlocks;
+  static const std::string kNumEntries;
+  static const std::string kFormatVersion;
+  static const std::string kFixedKeyLen;
+  static const std::string kFilterPolicy;
+};
+
+extern const std::string kPropertiesBlock;
+
+// `TablePropertiesCollector` provides the mechanism for users to collect
+// their own interested properties. This class is essentially a collection
+// of callback functions that will be invoked during table building.
+// It is construced with TablePropertiesCollectorFactory. The methods don't
+// need to be thread-safe, as we will create exactly one
+// TablePropertiesCollector object per table and then call it sequentially
+class TablePropertiesCollector {
+ public:
+  virtual ~TablePropertiesCollector() {}
+
+  // Add() will be called when a new key/value pair is inserted into the table.
+  // @params key    the original key that is inserted into the table.
+  // @params value  the original value that is inserted into the table.
+  virtual Status Add(const Slice& key, const Slice& value) = 0;
+
+  // Finish() will be called when a table has already been built and is ready
+  // for writing the properties block.
+  // @params properties  User will add their collected statistics to
+  // `properties`.
+  virtual Status Finish(UserCollectedProperties* properties) = 0;
+
+  // Return the human-readable properties, where the key is property name and
+  // the value is the human-readable form of value.
+  virtual UserCollectedProperties GetReadableProperties() const = 0;
+
+  // The name of the properties collector can be used for debugging purpose.
+  virtual const char* Name() const = 0;
+};
+
+// Constructs TablePropertiesCollector. Internals create a new
+// TablePropertiesCollector for each new table
+class TablePropertiesCollectorFactory {
+ public:
+  virtual ~TablePropertiesCollectorFactory() {}
+  // has to be thread-safe
+  virtual TablePropertiesCollector* CreateTablePropertiesCollector() = 0;
+
+  // The name of the properties collector can be used for debugging purpose.
+  virtual const char* Name() const = 0;
+};
+
+// Extra properties
+// Below is a list of non-basic properties that are collected by database
+// itself. Especially some properties regarding to the internal keys (which
+// is unknown to `table`).
+extern uint64_t GetDeletedKeys(const UserCollectedProperties& props);
+
+}  // namespace rocksdb
--- a/include/rocksdb/transaction_log.h
+++ b/include/rocksdb/transaction_log.h
@@ -0,0 +1,104 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_
+#define STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_
+
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/write_batch.h"
+#include <memory>
+#include <vector>
+
+namespace rocksdb {
+
+class LogFile;
+typedef std::vector<std::unique_ptr<LogFile>> VectorLogPtr;
+
+enum  WalFileType {
+  /* Indicates that WAL file is in archive directory. WAL files are moved from
+   * the main db directory to archive directory once they are not live and stay
+   * there until cleaned up. Files are cleaned depending on archive size
+   * (Options::WAL_size_limit_MB) and time since last cleaning
+   * (Options::WAL_ttl_seconds).
+   */
+  kArchivedLogFile = 0,
+
+  /* Indicates that WAL file is live and resides in the main db directory */
+  kAliveLogFile = 1
+} ;
+
+class LogFile {
+ public:
+  LogFile() {}
+  virtual ~LogFile() {}
+
+  // Returns log file's pathname relative to the main db dir
+  // Eg. For a live-log-file = /000003.log
+  //     For an archived-log-file = /archive/000003.log
+  virtual std::string PathName() const = 0;
+
+
+  // Primary identifier for log file.
+  // This is directly proportional to creation time of the log file
+  virtual uint64_t LogNumber() const = 0;
+
+  // Log file can be either alive or archived
+  virtual WalFileType Type() const = 0;
+
+  // Starting sequence number of writebatch written in this log file
+  virtual SequenceNumber StartSequence() const = 0;
+
+  // Size of log file on disk in Bytes
+  virtual uint64_t SizeFileBytes() const = 0;
+};
+
+struct BatchResult {
+  SequenceNumber sequence = 0;
+  std::unique_ptr<WriteBatch> writeBatchPtr;
+};
+
+// A TransactionLogIterator is used to iterate over the transactions in a db.
+// One run of the iterator is continuous, i.e. the iterator will stop at the
+// beginning of any gap in sequences
+class TransactionLogIterator {
+ public:
+  TransactionLogIterator() {}
+  virtual ~TransactionLogIterator() {}
+
+  // An iterator is either positioned at a WriteBatch or not valid.
+  // This method returns true if the iterator is valid.
+  // Can read data from a valid iterator.
+  virtual bool Valid() = 0;
+
+  // Moves the iterator to the next WriteBatch.
+  // REQUIRES: Valid() to be true.
+  virtual void Next() = 0;
+
+  // Returns ok if the iterator is valid.
+  // Returns the Error when something has gone wrong.
+  virtual Status status() = 0;
+
+  // If valid return's the current write_batch and the sequence number of the
+  // earliest transaction contained in the batch.
+  // ONLY use if Valid() is true and status() is OK.
+  virtual BatchResult GetBatch() = 0;
+
+  // The read options for TransactionLogIterator.
+  struct ReadOptions {
+    // If true, all data read from underlying storage will be
+    // verified against corresponding checksums.
+    // Default: true
+    bool verify_checksums_;
+
+    ReadOptions() : verify_checksums_(true) {}
+
+    explicit ReadOptions(bool verify_checksums)
+        : verify_checksums_(verify_checksums) {}
+  };
+};
+} //  namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_
--- a/include/rocksdb/types.h
+++ b/include/rocksdb/types.h
@@ -0,0 +1,20 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_TYPES_H_
+#define STORAGE_ROCKSDB_INCLUDE_TYPES_H_
+
+#include <stdint.h>
+
+namespace rocksdb {
+
+// Define all public custom types here.
+
+// Represents a sequence number in a WAL file.
+typedef uint64_t SequenceNumber;
+
+}  //  namespace rocksdb
+
+#endif //  STORAGE_ROCKSDB_INCLUDE_TYPES_H_
--- a/include/rocksdb/universal_compaction.h
+++ b/include/rocksdb/universal_compaction.h
@@ -0,0 +1,83 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H
+#define STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H
+
+#include <stdint.h>
+#include <climits>
+
+namespace rocksdb {
+
+//
+// Algorithm used to make a compaction request stop picking new files
+// into a single compaction run
+//
+enum CompactionStopStyle {
+  kCompactionStopStyleSimilarSize, // pick files of similar size
+  kCompactionStopStyleTotalSize    // total size of picked files > next file
+};
+
+class CompactionOptionsUniversal {
+ public:
+
+  // Percentage flexibilty while comparing file size. If the candidate file(s)
+  // size is 1% smaller than the next file's size, then include next file into
+  // this candidate set. // Default: 1
+  unsigned int size_ratio;
+
+  // The minimum number of files in a single compaction run. Default: 2
+  unsigned int min_merge_width;
+
+  // The maximum number of files in a single compaction run. Default: UINT_MAX
+  unsigned int max_merge_width;
+
+  // The size amplification is defined as the amount (in percentage) of
+  // additional storage needed to store a single byte of data in the database.
+  // For example, a size amplification of 2% means that a database that
+  // contains 100 bytes of user-data may occupy upto 102 bytes of
+  // physical storage. By this definition, a fully compacted database has
+  // a size amplification of 0%. Rocksdb uses the following heuristic
+  // to calculate size amplification: it assumes that all files excluding
+  // the earliest file contribute to the size amplification.
+  // Default: 200, which means that a 100 byte database could require upto
+  // 300 bytes of storage.
+  unsigned int max_size_amplification_percent;
+
+  // If this option is set to be -1 (the default value), all the output files
+  // will follow compression type specified.
+  //
+  // If this option is not negative, we will try to make sure compressed
+  // size is just above this value. In normal cases, at least this percentage
+  // of data will be compressed.
+  // When we are compacting to a new file, here is the criteria whether
+  // it needs to be compressed: assuming here are the list of files sorted
+  // by generation time:
+  //    A1...An B1...Bm C1...Ct
+  // where A1 is the newest and Ct is the oldest, and we are going to compact
+  // B1...Bm, we calculate the total size of all the files as total_size, as
+  // well as  the total size of C1...Ct as total_C, the compaction output file
+  // will be compressed iff
+  //   total_C / total_size < this percentage
+  int compression_size_percent;
+
+  // The algorithm used to stop picking files into a single compaction run
+  // Default: kCompactionStopStyleTotalSize
+  CompactionStopStyle stop_style;
+
+  // Default set of parameters
+  CompactionOptionsUniversal() :
+    size_ratio(1),
+    min_merge_width(2),
+    max_merge_width(UINT_MAX),
+    max_size_amplification_percent(200),
+    compression_size_percent(-1),
+    stop_style(kCompactionStopStyleTotalSize) {
+  }
+};
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H
--- a/include/rocksdb/version.h
+++ b/include/rocksdb/version.h
@@ -0,0 +1,17 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+#pragma once
+
+// Also update Makefile if you change these
+#define ROCKSDB_MAJOR 3
+#define ROCKSDB_MINOR 2
+#define ROCKSDB_PATCH 0
+
+// Do not use these. We made the mistake of declaring macros starting with
+// double underscore. Now we have to live with our choice. We'll deprecate these
+// at some point
+#define __ROCKSDB_MAJOR__ ROCKSDB_MAJOR
+#define __ROCKSDB_MINOR__ ROCKSDB_MINOR
+#define __ROCKSDB_PATCH__ ROCKSDB_PATCH
--- a/include/rocksdb/write_batch.h
+++ b/include/rocksdb/write_batch.h
@@ -0,0 +1,158 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// WriteBatch holds a collection of updates to apply atomically to a DB.
+//
+// The updates are applied in the order in which they are added
+// to the WriteBatch.  For example, the value of "key" will be "v3"
+// after the following batch is written:
+//
+//    batch.Put("key", "v1");
+//    batch.Delete("key");
+//    batch.Put("key", "v2");
+//    batch.Put("key", "v3");
+//
+// Multiple threads can invoke const methods on a WriteBatch without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same WriteBatch must use
+// external synchronization.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_
+#define STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_
+
+#include <string>
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+class Slice;
+class ColumnFamilyHandle;
+struct SliceParts;
+
+class WriteBatch {
+ public:
+  explicit WriteBatch(size_t reserved_bytes = 0);
+  ~WriteBatch();
+
+  // Store the mapping "key->value" in the database.
+  void Put(ColumnFamilyHandle* column_family, const Slice& key,
+           const Slice& value);
+  void Put(const Slice& key, const Slice& value) {
+    Put(nullptr, key, value);
+  }
+
+  // Variant of Put() that gathers output like writev(2).  The key and value
+  // that will be written to the database are concatentations of arrays of
+  // slices.
+  void Put(ColumnFamilyHandle* column_family, const SliceParts& key,
+           const SliceParts& value);
+  void Put(const SliceParts& key, const SliceParts& value) {
+    Put(nullptr, key, value);
+  }
+
+  // Merge "value" with the existing value of "key" in the database.
+  // "key->merge(existing, value)"
+  void Merge(ColumnFamilyHandle* column_family, const Slice& key,
+             const Slice& value);
+  void Merge(const Slice& key, const Slice& value) {
+    Merge(nullptr, key, value);
+  }
+
+  // If the database contains a mapping for "key", erase it.  Else do nothing.
+  void Delete(ColumnFamilyHandle* column_family, const Slice& key);
+  void Delete(const Slice& key) { Delete(nullptr, key); }
+
+  // Append a blob of arbitrary size to the records in this batch. The blob will
+  // be stored in the transaction log but not in any other file. In particular,
+  // it will not be persisted to the SST files. When iterating over this
+  // WriteBatch, WriteBatch::Handler::LogData will be called with the contents
+  // of the blob as it is encountered. Blobs, puts, deletes, and merges will be
+  // encountered in the same order in thich they were inserted. The blob will
+  // NOT consume sequence number(s) and will NOT increase the count of the batch
+  //
+  // Example application: add timestamps to the transaction log for use in
+  // replication.
+  void PutLogData(const Slice& blob);
+
+  // Clear all updates buffered in this batch.
+  void Clear();
+
+  // Support for iterating over the contents of a batch.
+  class Handler {
+   public:
+    virtual ~Handler();
+    // default implementation will just call Put without column family for
+    // backwards compatibility. If the column family is not default,
+    // the function is noop
+    virtual Status PutCF(uint32_t column_family_id, const Slice& key,
+                         const Slice& value) {
+      if (column_family_id == 0) {
+        // Put() historically doesn't return status. We didn't want to be
+        // backwards incompatible so we didn't change the return status
+        // (this is a public API). We do an ordinary get and return Status::OK()
+        Put(key, value);
+        return Status::OK();
+      }
+      return Status::InvalidArgument(
+          "non-default column family and PutCF not implemented");
+    }
+    virtual void Put(const Slice& key, const Slice& value);
+    // Merge and LogData are not pure virtual. Otherwise, we would break
+    // existing clients of Handler on a source code level. The default
+    // implementation of Merge simply throws a runtime exception.
+    virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
+                           const Slice& value) {
+      if (column_family_id == 0) {
+        Merge(key, value);
+        return Status::OK();
+      }
+      return Status::InvalidArgument(
+          "non-default column family and MergeCF not implemented");
+    }
+    virtual void Merge(const Slice& key, const Slice& value);
+    // The default implementation of LogData does nothing.
+    virtual void LogData(const Slice& blob);
+    virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) {
+      if (column_family_id == 0) {
+        Delete(key);
+        return Status::OK();
+      }
+      return Status::InvalidArgument(
+          "non-default column family and DeleteCF not implemented");
+    }
+    virtual void Delete(const Slice& key);
+    // Continue is called by WriteBatch::Iterate. If it returns false,
+    // iteration is halted. Otherwise, it continues iterating. The default
+    // implementation always returns true.
+    virtual bool Continue();
+  };
+  Status Iterate(Handler* handler) const;
+
+  // Retrieve the serialized version of this batch.
+  const std::string& Data() const { return rep_; }
+
+  // Retrieve data size of the batch.
+  size_t GetDataSize() const { return rep_.size(); }
+
+  // Returns the number of updates in the batch
+  int Count() const;
+
+  // Constructor with a serialized string object
+  explicit WriteBatch(std::string rep): rep_(rep) {}
+
+ private:
+  friend class WriteBatchInternal;
+
+  std::string rep_;  // See comment in write_batch.cc for the format of rep_
+
+  // Intentionally copyable
+};
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_
--- a/include/utilities/backupable_db.h
+++ b/include/utilities/backupable_db.h
@@ -0,0 +1,251 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+#include <string>
+#include <map>
+#include <vector>
+
+#include "utilities/stackable_db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+struct BackupableDBOptions {
+  // Where to keep the backup files. Has to be different than dbname_
+  // Best to set this to dbname_ + "/backups"
+  // Required
+  std::string backup_dir;
+
+  // Backup Env object. It will be used for backup file I/O. If it's
+  // nullptr, backups will be written out using DBs Env. If it's
+  // non-nullptr, backup's I/O will be performed using this object.
+  // If you want to have backups on HDFS, use HDFS Env here!
+  // Default: nullptr
+  Env* backup_env;
+
+  // If share_table_files == true, backup will assume that table files with
+  // same name have the same contents. This enables incremental backups and
+  // avoids unnecessary data copies.
+  // If share_table_files == false, each backup will be on its own and will
+  // not share any data with other backups.
+  // default: true
+  bool share_table_files;
+
+  // Backup info and error messages will be written to info_log
+  // if non-nullptr.
+  // Default: nullptr
+  Logger* info_log;
+
+  // If sync == true, we can guarantee you'll get consistent backup even
+  // on a machine crash/reboot. Backup process is slower with sync enabled.
+  // If sync == false, we don't guarantee anything on machine reboot. However,
+  // chances are some of the backups are consistent.
+  // Default: true
+  bool sync;
+
+  // If true, it will delete whatever backups there are already
+  // Default: false
+  bool destroy_old_data;
+
+  // If false, we won't backup log files. This option can be useful for backing
+  // up in-memory databases where log file are persisted, but table files are in
+  // memory.
+  // Default: true
+  bool backup_log_files;
+
+  // Max bytes that can be transferred in a second during backup.
+  // If 0, go as fast as you can
+  // Default: 0
+  uint64_t backup_rate_limit;
+
+  // Max bytes that can be transferred in a second during restore.
+  // If 0, go as fast as you can
+  // Default: 0
+  uint64_t restore_rate_limit;
+
+  // Only used if share_table_files is set to true. If true, will consider that
+  // backups can come from different databases, hence a sst is not uniquely
+  // identifed by its name, but by the triple (file name, crc32, file length)
+  // Default: false
+  // Note: this is an experimental option, and you'll need to set it manually
+  // *turn it on only if you know what you're doing*
+  bool share_files_with_checksum;
+
+  void Dump(Logger* logger) const;
+
+  explicit BackupableDBOptions(const std::string& _backup_dir,
+                               Env* _backup_env = nullptr,
+                               bool _share_table_files = true,
+                               Logger* _info_log = nullptr, bool _sync = true,
+                               bool _destroy_old_data = false,
+                               bool _backup_log_files = true,
+                               uint64_t _backup_rate_limit = 0,
+                               uint64_t _restore_rate_limit = 0)
+      : backup_dir(_backup_dir),
+        backup_env(_backup_env),
+        share_table_files(_share_table_files),
+        info_log(_info_log),
+        sync(_sync),
+        destroy_old_data(_destroy_old_data),
+        backup_log_files(_backup_log_files),
+        backup_rate_limit(_backup_rate_limit),
+        restore_rate_limit(_restore_rate_limit),
+        share_files_with_checksum(false) {
+    assert(share_table_files || !share_files_with_checksum);
+  }
+};
+
+struct RestoreOptions {
+  // If true, restore won't overwrite the existing log files in wal_dir. It will
+  // also move all log files from archive directory to wal_dir. Use this option
+  // in combination with BackupableDBOptions::backup_log_files = false for
+  // persisting in-memory databases.
+  // Default: false
+  bool keep_log_files;
+
+  explicit RestoreOptions(bool _keep_log_files = false)
+      : keep_log_files(_keep_log_files) {}
+};
+
+typedef uint32_t BackupID;
+
+struct BackupInfo {
+  BackupID backup_id;
+  int64_t timestamp;
+  uint64_t size;
+
+  BackupInfo() {}
+  BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size)
+      : backup_id(_backup_id), timestamp(_timestamp), size(_size) {}
+};
+
+class BackupEngineReadOnly {
+ public:
+  virtual ~BackupEngineReadOnly() {}
+
+  static BackupEngineReadOnly* NewReadOnlyBackupEngine(
+      Env* db_env, const BackupableDBOptions& options);
+
+  // You can GetBackupInfo safely, even with other BackupEngine performing
+  // backups on the same directory
+  virtual void GetBackupInfo(std::vector<BackupInfo>* backup_info) = 0;
+
+  // Restoring DB from backup is NOT safe when there is another BackupEngine
+  // running that might call DeleteBackup() or PurgeOldBackups(). It is caller's
+  // responsibility to synchronize the operation, i.e. don't delete the backup
+  // when you're restoring from it
+  virtual Status RestoreDBFromBackup(
+      BackupID backup_id, const std::string& db_dir, const std::string& wal_dir,
+      const RestoreOptions& restore_options = RestoreOptions()) = 0;
+  virtual Status RestoreDBFromLatestBackup(
+      const std::string& db_dir, const std::string& wal_dir,
+      const RestoreOptions& restore_options = RestoreOptions()) = 0;
+};
+
+// Please see the documentation in BackupableDB and RestoreBackupableDB
+class BackupEngine {
+ public:
+  virtual ~BackupEngine() {}
+
+  static BackupEngine* NewBackupEngine(Env* db_env,
+                                       const BackupableDBOptions& options);
+
+  virtual Status CreateNewBackup(DB* db, bool flush_before_backup = false) = 0;
+  virtual Status PurgeOldBackups(uint32_t num_backups_to_keep) = 0;
+  virtual Status DeleteBackup(BackupID backup_id) = 0;
+  virtual void StopBackup() = 0;
+
+  virtual void GetBackupInfo(std::vector<BackupInfo>* backup_info) = 0;
+  virtual Status RestoreDBFromBackup(
+      BackupID backup_id, const std::string& db_dir, const std::string& wal_dir,
+      const RestoreOptions& restore_options = RestoreOptions()) = 0;
+  virtual Status RestoreDBFromLatestBackup(
+      const std::string& db_dir, const std::string& wal_dir,
+      const RestoreOptions& restore_options = RestoreOptions()) = 0;
+};
+
+// Stack your DB with BackupableDB to be able to backup the DB
+class BackupableDB : public StackableDB {
+ public:
+  // BackupableDBOptions have to be the same as the ones used in a previous
+  // incarnation of the DB
+  //
+  // BackupableDB ownes the pointer `DB* db` now. You should not delete it or
+  // use it after the invocation of BackupableDB
+  BackupableDB(DB* db, const BackupableDBOptions& options);
+  virtual ~BackupableDB();
+
+  // Captures the state of the database in the latest backup
+  // NOT a thread safe call
+  Status CreateNewBackup(bool flush_before_backup = false);
+  // Returns info about backups in backup_info
+  void GetBackupInfo(std::vector<BackupInfo>* backup_info);
+  // deletes old backups, keeping latest num_backups_to_keep alive
+  Status PurgeOldBackups(uint32_t num_backups_to_keep);
+  // deletes a specific backup
+  Status DeleteBackup(BackupID backup_id);
+  // Call this from another thread if you want to stop the backup
+  // that is currently happening. It will return immediatelly, will
+  // not wait for the backup to stop.
+  // The backup will stop ASAP and the call to CreateNewBackup will
+  // return Status::Incomplete(). It will not clean up after itself, but
+  // the state will remain consistent. The state will be cleaned up
+  // next time you create BackupableDB or RestoreBackupableDB.
+  void StopBackup();
+
+ private:
+  BackupEngine* backup_engine_;
+};
+
+// Use this class to access information about backups and restore from them
+class RestoreBackupableDB {
+ public:
+  RestoreBackupableDB(Env* db_env, const BackupableDBOptions& options);
+  ~RestoreBackupableDB();
+
+  // Returns info about backups in backup_info
+  void GetBackupInfo(std::vector<BackupInfo>* backup_info);
+
+  // restore from backup with backup_id
+  // IMPORTANT -- if options_.share_table_files == true and you restore DB
+  // from some backup that is not the latest, and you start creating new
+  // backups from the new DB, they will probably fail
+  //
+  // Example: Let's say you have backups 1, 2, 3, 4, 5 and you restore 3.
+  // If you add new data to the DB and try creating a new backup now, the
+  // database will diverge from backups 4 and 5 and the new backup will fail.
+  // If you want to create new backup, you will first have to delete backups 4
+  // and 5.
+  Status RestoreDBFromBackup(BackupID backup_id, const std::string& db_dir,
+                             const std::string& wal_dir,
+                             const RestoreOptions& restore_options =
+                                 RestoreOptions());
+
+  // restore from the latest backup
+  Status RestoreDBFromLatestBackup(const std::string& db_dir,
+                                   const std::string& wal_dir,
+                                   const RestoreOptions& restore_options =
+                                       RestoreOptions());
+  // deletes old backups, keeping latest num_backups_to_keep alive
+  Status PurgeOldBackups(uint32_t num_backups_to_keep);
+  // deletes a specific backup
+  Status DeleteBackup(BackupID backup_id);
+
+ private:
+  BackupEngine* backup_engine_;
+};
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
--- a/include/utilities/db_ttl.h
+++ b/include/utilities/db_ttl.h
@@ -0,0 +1,68 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+
+#include "utilities/stackable_db.h"
+#include "rocksdb/db.h"
+
+namespace rocksdb {
+
+// Database with TTL support.
+//
+// USE-CASES:
+// This API should be used to open the db when key-values inserted are
+//  meant to be removed from the db in a non-strict 'ttl' amount of time
+//  Therefore, this guarantees that key-values inserted will remain in the
+//  db for >= ttl amount of time and the db will make efforts to remove the
+//  key-values as soon as possible after ttl seconds of their insertion.
+//
+// BEHAVIOUR:
+// TTL is accepted in seconds
+// (int32_t)Timestamp(creation) is suffixed to values in Put internally
+// Expired TTL values deleted in compaction only:(Timestamp+ttl<time_now)
+// Get/Iterator may return expired entries(compaction not run on them yet)
+// Different TTL may be used during different Opens
+// Example: Open1 at t=0 with ttl=4 and insert k1,k2, close at t=2
+//          Open2 at t=3 with ttl=5. Now k1,k2 should be deleted at t>=5
+// read_only=true opens in the usual read-only mode. Compactions will not be
+//  triggered(neither manual nor automatic), so no expired entries removed
+//
+// CONSTRAINTS:
+// Not specifying/passing or non-positive TTL behaves like TTL = infinity
+//
+// !!!WARNING!!!:
+// Calling DB::Open directly to re-open a db created by this API will get
+//  corrupt values(timestamp suffixed) and no ttl effect will be there
+//  during the second Open, so use this API consistently to open the db
+// Be careful when passing ttl with a small positive value because the
+//  whole database may be deleted in a small amount of time
+
+class DBWithTTL : public StackableDB {
+ public:
+  virtual Status CreateColumnFamilyWithTtl(
+      const ColumnFamilyOptions& options, const std::string& column_family_name,
+      ColumnFamilyHandle** handle, int ttl) = 0;
+
+  static Status Open(const Options& options, const std::string& dbname,
+                     DBWithTTL** dbptr, int32_t ttl = 0,
+                     bool read_only = false);
+
+  static Status Open(const DBOptions& db_options, const std::string& dbname,
+                     const std::vector<ColumnFamilyDescriptor>& column_families,
+                     std::vector<ColumnFamilyHandle*>* handles,
+                     DBWithTTL** dbptr, std::vector<int32_t> ttls,
+                     bool read_only = false);
+
+ protected:
+  explicit DBWithTTL(DB* db) : StackableDB(db) {}
+};
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
--- a/include/utilities/geo_db.h
+++ b/include/utilities/geo_db.h
@@ -0,0 +1,105 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+
+#ifndef ROCKSDB_LITE
+#pragma once
+#include <string>
+#include <vector>
+
+#include "utilities/stackable_db.h"
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+//
+// Configurable options needed for setting up a Geo database
+//
+struct GeoDBOptions {
+  // Backup info and error messages will be written to info_log
+  // if non-nullptr.
+  // Default: nullptr
+  Logger* info_log;
+
+  explicit GeoDBOptions(Logger* _info_log = nullptr):info_log(_info_log) { }
+};
+
+//
+// A position in the earth's geoid
+//
+class GeoPosition {
+ public:
+  double latitude;
+  double longitude;
+
+  explicit GeoPosition(double la = 0, double lo = 0) :
+    latitude(la), longitude(lo) {
+  }
+};
+
+//
+// Description of an object on the Geoid. It is located by a GPS location,
+// and is identified by the id. The value associated with this object is
+// an opaque string 'value'. Different objects identified by unique id's
+// can have the same gps-location associated with them.
+//
+class GeoObject {
+ public:
+  GeoPosition position;
+  std::string id;
+  std::string value;
+
+  GeoObject() {}
+
+  GeoObject(const GeoPosition& pos, const std::string& i,
+            const std::string& val) :
+    position(pos), id(i), value(val) {
+  }
+};
+
+//
+// Stack your DB with GeoDB to be able to get geo-spatial support
+//
+class GeoDB : public StackableDB {
+ public:
+  // GeoDBOptions have to be the same as the ones used in a previous
+  // incarnation of the DB
+  //
+  // GeoDB owns the pointer `DB* db` now. You should not delete it or
+  // use it after the invocation of GeoDB
+  // GeoDB(DB* db, const GeoDBOptions& options) : StackableDB(db) {}
+  GeoDB(DB* db, const GeoDBOptions& options) : StackableDB(db) {}
+  virtual ~GeoDB() {}
+
+  // Insert a new object into the location database. The object is
+  // uniquely identified by the id. If an object with the same id already
+  // exists in the db, then the old one is overwritten by the new
+  // object being inserted here.
+  virtual Status Insert(const GeoObject& object) = 0;
+
+  // Retrieve the value of the object located at the specified GPS
+  // location and is identified by the 'id'.
+  virtual Status GetByPosition(const GeoPosition& pos,
+                               const Slice& id, std::string* value) = 0;
+
+  // Retrieve the value of the object identified by the 'id'. This method
+  // could be potentially slower than GetByPosition
+  virtual Status GetById(const Slice& id, GeoObject*  object) = 0;
+
+  // Delete the specified object
+  virtual Status Remove(const Slice& id) = 0;
+
+  // Returns a list of all items within a circular radius from the
+  // specified gps location. If 'number_of_values' is specified,
+  // then this call returns at most that many number of objects.
+  // The radius is specified in 'meters'.
+  virtual Status SearchRadial(const GeoPosition& pos,
+                              double radius,
+                              std::vector<GeoObject>* values,
+                              int number_of_values = INT_MAX) = 0;
+};
+
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
--- a/include/utilities/stackable_db.h
+++ b/include/utilities/stackable_db.h
@@ -0,0 +1,215 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "rocksdb/db.h"
+
+namespace rocksdb {
+
+// This class contains APIs to stack rocksdb wrappers.Eg. Stack TTL over base d
+class StackableDB : public DB {
+ public:
+  // StackableDB is the owner of db now!
+  explicit StackableDB(DB* db) : db_(db) {}
+
+  ~StackableDB() {
+    delete db_;
+  }
+
+  virtual DB* GetBaseDB() {
+    return db_;
+  }
+
+  virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
+                                    const std::string& column_family_name,
+                                    ColumnFamilyHandle** handle) {
+    return db_->CreateColumnFamily(options, column_family_name, handle);
+  }
+
+  virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) {
+    return db_->DropColumnFamily(column_family);
+  }
+
+  using DB::Put;
+  virtual Status Put(const WriteOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& val) override {
+    return db_->Put(options, column_family, key, val);
+  }
+
+  using DB::Get;
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     std::string* value) override {
+    return db_->Get(options, column_family, key, value);
+  }
+
+  using DB::MultiGet;
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys,
+      std::vector<std::string>* values) override {
+    return db_->MultiGet(options, column_family, keys, values);
+  }
+
+  using DB::KeyMayExist;
+  virtual bool KeyMayExist(const ReadOptions& options,
+                           ColumnFamilyHandle* column_family, const Slice& key,
+                           std::string* value,
+                           bool* value_found = nullptr) override {
+    return db_->KeyMayExist(options, column_family, key, value, value_found);
+  }
+
+  using DB::Delete;
+  virtual Status Delete(const WriteOptions& wopts,
+                        ColumnFamilyHandle* column_family,
+                        const Slice& key) override {
+    return db_->Delete(wopts, column_family, key);
+  }
+
+  using DB::Merge;
+  virtual Status Merge(const WriteOptions& options,
+                       ColumnFamilyHandle* column_family, const Slice& key,
+                       const Slice& value) override {
+    return db_->Merge(options, column_family, key, value);
+  }
+
+
+  virtual Status Write(const WriteOptions& opts, WriteBatch* updates)
+    override {
+      return db_->Write(opts, updates);
+  }
+
+  using DB::NewIterator;
+  virtual Iterator* NewIterator(const ReadOptions& opts,
+                                ColumnFamilyHandle* column_family) override {
+    return db_->NewIterator(opts, column_family);
+  }
+
+  virtual Status NewIterators(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_families,
+      std::vector<Iterator*>* iterators) {
+    return db_->NewIterators(options, column_families, iterators);
+  }
+
+
+  virtual const Snapshot* GetSnapshot() override {
+    return db_->GetSnapshot();
+  }
+
+  virtual void ReleaseSnapshot(const Snapshot* snapshot) override {
+    return db_->ReleaseSnapshot(snapshot);
+  }
+
+  using DB::GetProperty;
+  virtual bool GetProperty(ColumnFamilyHandle* column_family,
+                           const Slice& property, std::string* value) override {
+      return db_->GetProperty(column_family, property, value);
+  }
+
+  using DB::GetApproximateSizes;
+  virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
+                                   const Range* r, int n,
+                                   uint64_t* sizes) override {
+      return db_->GetApproximateSizes(column_family, r, n, sizes);
+  }
+
+  using DB::CompactRange;
+  virtual Status CompactRange(ColumnFamilyHandle* column_family,
+                              const Slice* begin, const Slice* end,
+                              bool reduce_level = false,
+                              int target_level = -1) override {
+    return db_->CompactRange(column_family, begin, end, reduce_level,
+                             target_level);
+  }
+
+  using DB::NumberLevels;
+  virtual int NumberLevels(ColumnFamilyHandle* column_family) override {
+    return db_->NumberLevels(column_family);
+  }
+
+  using DB::MaxMemCompactionLevel;
+  virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family)
+      override {
+    return db_->MaxMemCompactionLevel(column_family);
+  }
+
+  using DB::Level0StopWriteTrigger;
+  virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family)
+      override {
+    return db_->Level0StopWriteTrigger(column_family);
+  }
+
+  virtual const std::string& GetName() const override {
+    return db_->GetName();
+  }
+
+  virtual Env* GetEnv() const override {
+    return db_->GetEnv();
+  }
+
+  using DB::GetOptions;
+  virtual const Options& GetOptions(ColumnFamilyHandle* column_family) const
+      override {
+    return db_->GetOptions(column_family);
+  }
+
+  using DB::Flush;
+  virtual Status Flush(const FlushOptions& fopts,
+                       ColumnFamilyHandle* column_family) override {
+    return db_->Flush(fopts, column_family);
+  }
+
+  virtual Status DisableFileDeletions() override {
+    return db_->DisableFileDeletions();
+  }
+
+  virtual Status EnableFileDeletions(bool force) override {
+    return db_->EnableFileDeletions(force);
+  }
+
+  virtual Status GetLiveFiles(std::vector<std::string>& vec, uint64_t* mfs,
+                              bool flush_memtable = true) override {
+      return db_->GetLiveFiles(vec, mfs, flush_memtable);
+  }
+
+  virtual SequenceNumber GetLatestSequenceNumber() const override {
+    return db_->GetLatestSequenceNumber();
+  }
+
+  virtual Status GetSortedWalFiles(VectorLogPtr& files) override {
+    return db_->GetSortedWalFiles(files);
+  }
+
+  virtual Status DeleteFile(std::string name) override {
+    return db_->DeleteFile(name);
+  }
+
+  virtual Status GetDbIdentity(std::string& identity) {
+    return db_->GetDbIdentity(identity);
+  }
+
+  using DB::GetPropertiesOfAllTables;
+  virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
+                                          TablePropertiesCollection* props) {
+    return db_->GetPropertiesOfAllTables(column_family, props);
+  }
+
+  virtual Status GetUpdatesSince(
+      SequenceNumber seq_number, unique_ptr<TransactionLogIterator>* iter,
+      const TransactionLogIterator::ReadOptions& read_options) override {
+    return db_->GetUpdatesSince(seq_number, iter, read_options);
+  }
+
+  virtual ColumnFamilyHandle* DefaultColumnFamily() const override {
+    return db_->DefaultColumnFamily();
+  }
+
+ protected:
+  DB* db_;
+};
+
+} //  namespace rocksdb
--- a/include/utilities/utility_db.h
+++ b/include/utilities/utility_db.h
@@ -0,0 +1,30 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+#include <vector>
+#include <string>
+
+#include "utilities/stackable_db.h"
+#include "utilities/db_ttl.h"
+#include "rocksdb/db.h"
+
+namespace rocksdb {
+
+// Please don't use this class. It's deprecated
+class UtilityDB {
+ public:
+  // This function is here only for backwards compatibility. Please use the
+  // functions defined in DBWithTTl (utilities/db_ttl.h)
+  // (deprecated)
+  __attribute__((deprecated)) static Status OpenTtlDB(const Options& options,
+                                                      const std::string& name,
+                                                      StackableDB** dbptr,
+                                                      int32_t ttl = 0,
+                                                      bool read_only = false);
+};
+
+} //  namespace rocksdb
+#endif  // ROCKSDB_LITE