Squashed 'src/rocksdb/' content from commit 224932d

git-subtree-dir: src/rocksdb
git-subtree-split: 224932d4d0b561712107d747c662df181c39644d
This commit is contained in:
Vinnie Falco
2014-08-08 11:57:41 -07:00
commit f86d9fd626
435 changed files with 123706 additions and 0 deletions

788
include/rocksdb/c.h Normal file

@@ -0,0 +1,788 @@
/* Copyright (c) 2013, Facebook, Inc. All rights reserved.
This source code is licensed under the BSD-style license found in the
LICENSE file in the root directory of this source tree. An additional grant
of patent rights can be found in the PATENTS file in the same directory.
Copyright (c) 2011 The LevelDB Authors. All rights reserved.
Use of this source code is governed by a BSD-style license that can be
found in the LICENSE file. See the AUTHORS file for names of contributors.
C bindings for rocksdb. May be useful as a stable ABI that can be
used by programs that keep rocksdb in a shared library, or for
a JNI api.
Does not support:
. getters for the option types
. custom comparators that implement key shortening
. capturing post-write-snapshot
. custom iter, db, env, cache implementations using just the C bindings
Some conventions:
(1) We expose just opaque struct pointers and functions to clients.
This allows us to change internal representations without having to
recompile clients.
(2) For simplicity, there is no equivalent to the Slice type. Instead,
the caller has to pass the pointer and length as separate
arguments.
(3) Errors are represented by a null-terminated c string. NULL
means no error. All operations that can raise an error are passed
a "char** errptr" as the last argument. One of the following must
be true on entry:
*errptr == NULL
*errptr points to a malloc()ed null-terminated error message
On success, a rocksdb routine leaves *errptr unchanged.
On failure, rocksdb frees the old value of *errptr and
sets *errptr to a malloc()ed error message.
(4) Bools have the type unsigned char (0 == false; rest == true)
(5) All of the pointer arguments must be non-NULL.
*/
#ifndef STORAGE_ROCKSDB_INCLUDE_C_H_
#define STORAGE_ROCKSDB_INCLUDE_C_H_
#ifdef __cplusplus
extern "C" {
#endif
#include <stdarg.h>
#include <stddef.h>
#include <stdint.h>
/* Exported types */
typedef struct rocksdb_t rocksdb_t;
typedef struct rocksdb_cache_t rocksdb_cache_t;
typedef struct rocksdb_compactionfilter_t rocksdb_compactionfilter_t;
typedef struct rocksdb_compactionfiltercontext_t
rocksdb_compactionfiltercontext_t;
typedef struct rocksdb_compactionfilterfactory_t
rocksdb_compactionfilterfactory_t;
typedef struct rocksdb_compactionfilterv2_t
rocksdb_compactionfilterv2_t;
typedef struct rocksdb_compactionfilterfactoryv2_t
rocksdb_compactionfilterfactoryv2_t;
typedef struct rocksdb_comparator_t rocksdb_comparator_t;
typedef struct rocksdb_env_t rocksdb_env_t;
typedef struct rocksdb_fifo_compaction_options_t rocksdb_fifo_compaction_options_t;
typedef struct rocksdb_filelock_t rocksdb_filelock_t;
typedef struct rocksdb_filterpolicy_t rocksdb_filterpolicy_t;
typedef struct rocksdb_flushoptions_t rocksdb_flushoptions_t;
typedef struct rocksdb_iterator_t rocksdb_iterator_t;
typedef struct rocksdb_logger_t rocksdb_logger_t;
typedef struct rocksdb_mergeoperator_t rocksdb_mergeoperator_t;
typedef struct rocksdb_options_t rocksdb_options_t;
typedef struct rocksdb_randomfile_t rocksdb_randomfile_t;
typedef struct rocksdb_readoptions_t rocksdb_readoptions_t;
typedef struct rocksdb_seqfile_t rocksdb_seqfile_t;
typedef struct rocksdb_slicetransform_t rocksdb_slicetransform_t;
typedef struct rocksdb_snapshot_t rocksdb_snapshot_t;
typedef struct rocksdb_writablefile_t rocksdb_writablefile_t;
typedef struct rocksdb_writebatch_t rocksdb_writebatch_t;
typedef struct rocksdb_writeoptions_t rocksdb_writeoptions_t;
typedef struct rocksdb_universal_compaction_options_t rocksdb_universal_compaction_options_t;
typedef struct rocksdb_livefiles_t rocksdb_livefiles_t;
typedef struct rocksdb_column_family_handle_t rocksdb_column_family_handle_t;
/* DB operations */
extern rocksdb_t* rocksdb_open(
const rocksdb_options_t* options,
const char* name,
char** errptr);
extern rocksdb_t* rocksdb_open_for_read_only(
const rocksdb_options_t* options,
const char* name,
unsigned char error_if_log_file_exist,
char** errptr);
extern rocksdb_t* rocksdb_open_column_families(
const rocksdb_options_t* options,
const char* name,
int num_column_families,
const char** column_family_names,
const rocksdb_options_t** column_family_options,
rocksdb_column_family_handle_t** column_family_handles,
char** errptr);
extern rocksdb_t* rocksdb_open_for_read_only_column_families(
const rocksdb_options_t* options,
const char* name,
int num_column_families,
const char** column_family_names,
const rocksdb_options_t** column_family_options,
rocksdb_column_family_handle_t** column_family_handles,
unsigned char error_if_log_file_exist,
char** errptr);
char** rocksdb_list_column_families(
const rocksdb_options_t* options,
const char* name,
size_t* lencf,
char** errptr);
void rocksdb_list_column_families_destroy(char** list, size_t len);
extern rocksdb_column_family_handle_t* rocksdb_create_column_family(
rocksdb_t* db,
const rocksdb_options_t* column_family_options,
const char* column_family_name,
char** errptr);
extern void rocksdb_drop_column_family(
rocksdb_t* db,
rocksdb_column_family_handle_t* handle,
char** errptr);
extern void rocksdb_column_family_handle_destroy(rocksdb_column_family_handle_t*);
extern void rocksdb_close(rocksdb_t* db);
extern void rocksdb_put(
rocksdb_t* db,
const rocksdb_writeoptions_t* options,
const char* key, size_t keylen,
const char* val, size_t vallen,
char** errptr);
extern void rocksdb_put_cf(
rocksdb_t* db,
const rocksdb_writeoptions_t* options,
rocksdb_column_family_handle_t* column_family,
const char* key, size_t keylen,
const char* val, size_t vallen,
char** errptr);
extern void rocksdb_delete(
rocksdb_t* db,
const rocksdb_writeoptions_t* options,
const char* key, size_t keylen,
char** errptr);
void rocksdb_delete_cf(
rocksdb_t* db,
const rocksdb_writeoptions_t* options,
rocksdb_column_family_handle_t* column_family,
const char* key, size_t keylen,
char** errptr);
extern void rocksdb_merge(
rocksdb_t* db,
const rocksdb_writeoptions_t* options,
const char* key, size_t keylen,
const char* val, size_t vallen,
char** errptr);
extern void rocksdb_merge_cf(
rocksdb_t* db,
const rocksdb_writeoptions_t* options,
rocksdb_column_family_handle_t* column_family,
const char* key, size_t keylen,
const char* val, size_t vallen,
char** errptr);
extern void rocksdb_write(
rocksdb_t* db,
const rocksdb_writeoptions_t* options,
rocksdb_writebatch_t* batch,
char** errptr);
/* Returns NULL if not found. A malloc()ed array otherwise.
Stores the length of the array in *vallen. */
extern char* rocksdb_get(
rocksdb_t* db,
const rocksdb_readoptions_t* options,
const char* key, size_t keylen,
size_t* vallen,
char** errptr);
extern char* rocksdb_get_cf(
rocksdb_t* db,
const rocksdb_readoptions_t* options,
rocksdb_column_family_handle_t* column_family,
const char* key, size_t keylen,
size_t* vallen,
char** errptr);
extern rocksdb_iterator_t* rocksdb_create_iterator(
rocksdb_t* db,
const rocksdb_readoptions_t* options);
extern rocksdb_iterator_t* rocksdb_create_iterator_cf(
rocksdb_t* db,
const rocksdb_readoptions_t* options,
rocksdb_column_family_handle_t* column_family);
extern const rocksdb_snapshot_t* rocksdb_create_snapshot(
rocksdb_t* db);
extern void rocksdb_release_snapshot(
rocksdb_t* db,
const rocksdb_snapshot_t* snapshot);
/* Returns NULL if property name is unknown.
Else returns a pointer to a malloc()-ed null-terminated value. */
extern char* rocksdb_property_value(
rocksdb_t* db,
const char* propname);
extern char* rocksdb_property_value_cf(
rocksdb_t* db,
rocksdb_column_family_handle_t* column_family,
const char* propname);
extern void rocksdb_approximate_sizes(
rocksdb_t* db,
int num_ranges,
const char* const* range_start_key, const size_t* range_start_key_len,
const char* const* range_limit_key, const size_t* range_limit_key_len,
uint64_t* sizes);
extern void rocksdb_approximate_sizes_cf(
rocksdb_t* db,
rocksdb_column_family_handle_t* column_family,
int num_ranges,
const char* const* range_start_key, const size_t* range_start_key_len,
const char* const* range_limit_key, const size_t* range_limit_key_len,
uint64_t* sizes);
extern void rocksdb_compact_range(
rocksdb_t* db,
const char* start_key, size_t start_key_len,
const char* limit_key, size_t limit_key_len);
extern void rocksdb_compact_range_cf(
rocksdb_t* db,
rocksdb_column_family_handle_t* column_family,
const char* start_key, size_t start_key_len,
const char* limit_key, size_t limit_key_len);
extern void rocksdb_delete_file(
rocksdb_t* db,
const char* name);
extern const rocksdb_livefiles_t* rocksdb_livefiles(
rocksdb_t* db);
extern void rocksdb_flush(
rocksdb_t* db,
const rocksdb_flushoptions_t* options,
char** errptr);
extern void rocksdb_disable_file_deletions(
rocksdb_t* db,
char** errptr);
extern void rocksdb_enable_file_deletions(
rocksdb_t* db,
unsigned char force,
char** errptr);
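
As a rough illustration of the error-handling convention described at the top of this header, the following standalone sketch opens a database, writes one key, and reads it back through the C API. The path and the key/value strings are invented for the example, and error handling is kept minimal.

#include <stdio.h>
#include <stdlib.h>
#include "rocksdb/c.h"

int main(void) {
  char* err = NULL;  /* must be NULL (or malloc()ed) on entry to every call */
  rocksdb_options_t* options = rocksdb_options_create();
  rocksdb_options_set_create_if_missing(options, 1);

  rocksdb_t* db = rocksdb_open(options, "/tmp/rocksdb_c_example", &err);
  if (err != NULL) {                /* non-NULL means the call failed */
    fprintf(stderr, "open failed: %s\n", err);
    free(err);                      /* error strings are malloc()ed by the library */
    return 1;
  }

  rocksdb_writeoptions_t* wopts = rocksdb_writeoptions_create();
  rocksdb_put(db, wopts, "key", 3, "value", 5, &err);
  if (err != NULL) { fprintf(stderr, "put failed: %s\n", err); free(err); err = NULL; }

  rocksdb_readoptions_t* ropts = rocksdb_readoptions_create();
  size_t vallen = 0;
  char* val = rocksdb_get(db, ropts, "key", 3, &vallen, &err);
  if (val != NULL) {
    printf("got %.*s\n", (int)vallen, val);
    free(val);                      /* the returned value is malloc()ed */
  }

  rocksdb_readoptions_destroy(ropts);
  rocksdb_writeoptions_destroy(wopts);
  rocksdb_close(db);
  rocksdb_options_destroy(options);
  return 0;
}
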
/* Management operations */
extern void rocksdb_destroy_db(
const rocksdb_options_t* options,
const char* name,
char** errptr);
extern void rocksdb_repair_db(
const rocksdb_options_t* options,
const char* name,
char** errptr);
/* Iterator */
extern void rocksdb_iter_destroy(rocksdb_iterator_t*);
extern unsigned char rocksdb_iter_valid(const rocksdb_iterator_t*);
extern void rocksdb_iter_seek_to_first(rocksdb_iterator_t*);
extern void rocksdb_iter_seek_to_last(rocksdb_iterator_t*);
extern void rocksdb_iter_seek(rocksdb_iterator_t*, const char* k, size_t klen);
extern void rocksdb_iter_next(rocksdb_iterator_t*);
extern void rocksdb_iter_prev(rocksdb_iterator_t*);
extern const char* rocksdb_iter_key(const rocksdb_iterator_t*, size_t* klen);
extern const char* rocksdb_iter_value(const rocksdb_iterator_t*, size_t* vlen);
extern void rocksdb_iter_get_error(const rocksdb_iterator_t*, char** errptr);
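
A minimal scan over the whole database using the iterator functions above might look like the following sketch; scan_all is a made-up helper name and db is assumed to have been opened elsewhere.

#include <stdio.h>
#include <stdlib.h>
#include "rocksdb/c.h"

/* Print every key/value pair in the database (illustrative only). */
static void scan_all(rocksdb_t* db) {
  rocksdb_readoptions_t* ropts = rocksdb_readoptions_create();
  rocksdb_iterator_t* it = rocksdb_create_iterator(db, ropts);

  for (rocksdb_iter_seek_to_first(it); rocksdb_iter_valid(it); rocksdb_iter_next(it)) {
    size_t klen, vlen;
    const char* k = rocksdb_iter_key(it, &klen);   /* not null-terminated */
    const char* v = rocksdb_iter_value(it, &vlen);
    printf("%.*s => %.*s\n", (int)klen, k, (int)vlen, v);
  }

  char* err = NULL;
  rocksdb_iter_get_error(it, &err);                /* check for errors after the loop */
  if (err != NULL) {
    fprintf(stderr, "iteration error: %s\n", err);
    free(err);
  }

  rocksdb_iter_destroy(it);
  rocksdb_readoptions_destroy(ropts);
}
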
/* Write batch */
extern rocksdb_writebatch_t* rocksdb_writebatch_create();
extern rocksdb_writebatch_t* rocksdb_writebatch_create_from(const char* rep,
size_t size);
extern void rocksdb_writebatch_destroy(rocksdb_writebatch_t*);
extern void rocksdb_writebatch_clear(rocksdb_writebatch_t*);
extern int rocksdb_writebatch_count(rocksdb_writebatch_t*);
extern void rocksdb_writebatch_put(
rocksdb_writebatch_t*,
const char* key, size_t klen,
const char* val, size_t vlen);
extern void rocksdb_writebatch_put_cf(
rocksdb_writebatch_t*,
rocksdb_column_family_handle_t* column_family,
const char* key, size_t klen,
const char* val, size_t vlen);
extern void rocksdb_writebatch_merge(
rocksdb_writebatch_t*,
const char* key, size_t klen,
const char* val, size_t vlen);
extern void rocksdb_writebatch_merge_cf(
rocksdb_writebatch_t*,
rocksdb_column_family_handle_t* column_family,
const char* key, size_t klen,
const char* val, size_t vlen);
extern void rocksdb_writebatch_delete(
rocksdb_writebatch_t*,
const char* key, size_t klen);
extern void rocksdb_writebatch_delete_cf(
rocksdb_writebatch_t*,
rocksdb_column_family_handle_t* column_family,
const char* key, size_t klen);
extern void rocksdb_writebatch_iterate(
rocksdb_writebatch_t*,
void* state,
void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
void (*deleted)(void*, const char* k, size_t klen));
extern const char* rocksdb_writebatch_data(rocksdb_writebatch_t*, size_t *size);
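
The write-batch functions above collect several updates that are then applied atomically by a single rocksdb_write call. A sketch, with batched_update as an invented helper name and hard-coded keys:

#include <stdlib.h>
#include "rocksdb/c.h"

/* Apply two puts and a delete as one atomic write (illustrative only). */
static void batched_update(rocksdb_t* db) {
  rocksdb_writebatch_t* batch = rocksdb_writebatch_create();
  rocksdb_writebatch_put(batch, "alpha", 5, "1", 1);
  rocksdb_writebatch_put(batch, "beta", 4, "2", 1);
  rocksdb_writebatch_delete(batch, "gamma", 5);

  char* err = NULL;
  rocksdb_writeoptions_t* wopts = rocksdb_writeoptions_create();
  rocksdb_write(db, wopts, batch, &err);   /* all three updates succeed or fail together */
  if (err != NULL) {
    free(err);
  }

  rocksdb_writeoptions_destroy(wopts);
  rocksdb_writebatch_destroy(batch);
}
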
/* Options */
extern rocksdb_options_t* rocksdb_options_create();
extern void rocksdb_options_destroy(rocksdb_options_t*);
extern void rocksdb_options_increase_parallelism(
rocksdb_options_t* opt, int total_threads);
extern void rocksdb_options_optimize_for_point_lookup(
rocksdb_options_t* opt);
extern void rocksdb_options_optimize_level_style_compaction(
rocksdb_options_t* opt, uint64_t memtable_memory_budget);
extern void rocksdb_options_optimize_universal_style_compaction(
rocksdb_options_t* opt, uint64_t memtable_memory_budget);
extern void rocksdb_options_set_compaction_filter(
rocksdb_options_t*,
rocksdb_compactionfilter_t*);
extern void rocksdb_options_set_compaction_filter_factory(
rocksdb_options_t*, rocksdb_compactionfilterfactory_t*);
extern void rocksdb_options_set_compaction_filter_factory_v2(
rocksdb_options_t*,
rocksdb_compactionfilterfactoryv2_t*);
extern void rocksdb_options_set_comparator(
rocksdb_options_t*,
rocksdb_comparator_t*);
extern void rocksdb_options_set_merge_operator(
rocksdb_options_t*,
rocksdb_mergeoperator_t*);
extern void rocksdb_options_set_compression_per_level(
rocksdb_options_t* opt,
int* level_values,
size_t num_levels);
extern void rocksdb_options_set_filter_policy(
rocksdb_options_t*,
rocksdb_filterpolicy_t*);
extern void rocksdb_options_set_create_if_missing(
rocksdb_options_t*, unsigned char);
extern void rocksdb_options_set_create_missing_column_families(
rocksdb_options_t*, unsigned char);
extern void rocksdb_options_set_error_if_exists(
rocksdb_options_t*, unsigned char);
extern void rocksdb_options_set_paranoid_checks(
rocksdb_options_t*, unsigned char);
extern void rocksdb_options_set_env(rocksdb_options_t*, rocksdb_env_t*);
extern void rocksdb_options_set_info_log(rocksdb_options_t*, rocksdb_logger_t*);
extern void rocksdb_options_set_info_log_level(rocksdb_options_t*, int);
extern void rocksdb_options_set_write_buffer_size(rocksdb_options_t*, size_t);
extern void rocksdb_options_set_max_open_files(rocksdb_options_t*, int);
extern void rocksdb_options_set_cache(rocksdb_options_t*, rocksdb_cache_t*);
extern void rocksdb_options_set_cache_compressed(rocksdb_options_t*, rocksdb_cache_t*);
extern void rocksdb_options_set_block_size(rocksdb_options_t*, size_t);
extern void rocksdb_options_set_block_restart_interval(rocksdb_options_t*, int);
extern void rocksdb_options_set_compression_options(
rocksdb_options_t*, int, int, int);
extern void rocksdb_options_set_whole_key_filtering(rocksdb_options_t*, unsigned char);
extern void rocksdb_options_set_prefix_extractor(
rocksdb_options_t*, rocksdb_slicetransform_t*);
extern void rocksdb_options_set_num_levels(rocksdb_options_t*, int);
extern void rocksdb_options_set_level0_file_num_compaction_trigger(
rocksdb_options_t*, int);
extern void rocksdb_options_set_level0_slowdown_writes_trigger(
rocksdb_options_t*, int);
extern void rocksdb_options_set_level0_stop_writes_trigger(
rocksdb_options_t*, int);
extern void rocksdb_options_set_max_mem_compaction_level(
rocksdb_options_t*, int);
extern void rocksdb_options_set_target_file_size_base(
rocksdb_options_t*, uint64_t);
extern void rocksdb_options_set_target_file_size_multiplier(
rocksdb_options_t*, int);
extern void rocksdb_options_set_max_bytes_for_level_base(
rocksdb_options_t*, uint64_t);
extern void rocksdb_options_set_max_bytes_for_level_multiplier(
rocksdb_options_t*, int);
extern void rocksdb_options_set_expanded_compaction_factor(
rocksdb_options_t*, int);
extern void rocksdb_options_set_max_grandparent_overlap_factor(
rocksdb_options_t*, int);
extern void rocksdb_options_set_max_bytes_for_level_multiplier_additional(
rocksdb_options_t*, int* level_values, size_t num_levels);
extern void rocksdb_options_enable_statistics(rocksdb_options_t*);
extern void rocksdb_options_set_max_write_buffer_number(rocksdb_options_t*, int);
extern void rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t*, int);
extern void rocksdb_options_set_max_background_compactions(rocksdb_options_t*, int);
extern void rocksdb_options_set_max_background_flushes(rocksdb_options_t*, int);
extern void rocksdb_options_set_max_log_file_size(rocksdb_options_t*, size_t);
extern void rocksdb_options_set_log_file_time_to_roll(rocksdb_options_t*, size_t);
extern void rocksdb_options_set_keep_log_file_num(rocksdb_options_t*, size_t);
extern void rocksdb_options_set_soft_rate_limit(rocksdb_options_t*, double);
extern void rocksdb_options_set_hard_rate_limit(rocksdb_options_t*, double);
extern void rocksdb_options_set_rate_limit_delay_max_milliseconds(
rocksdb_options_t*, unsigned int);
extern void rocksdb_options_set_max_manifest_file_size(
rocksdb_options_t*, size_t);
extern void rocksdb_options_set_no_block_cache(
rocksdb_options_t*, unsigned char);
extern void rocksdb_options_set_table_cache_numshardbits(
rocksdb_options_t*, int);
extern void rocksdb_options_set_table_cache_remove_scan_count_limit(
rocksdb_options_t*, int);
extern void rocksdb_options_set_arena_block_size(
rocksdb_options_t*, size_t);
extern void rocksdb_options_set_use_fsync(
rocksdb_options_t*, int);
extern void rocksdb_options_set_db_stats_log_interval(
rocksdb_options_t*, int);
extern void rocksdb_options_set_db_log_dir(
rocksdb_options_t*, const char*);
extern void rocksdb_options_set_wal_dir(
rocksdb_options_t*, const char*);
extern void rocksdb_options_set_WAL_ttl_seconds(
rocksdb_options_t*, uint64_t);
extern void rocksdb_options_set_WAL_size_limit_MB(
rocksdb_options_t*, uint64_t);
extern void rocksdb_options_set_manifest_preallocation_size(
rocksdb_options_t*, size_t);
extern void rocksdb_options_set_purge_redundant_kvs_while_flush(
rocksdb_options_t*, unsigned char);
extern void rocksdb_options_set_allow_os_buffer(
rocksdb_options_t*, unsigned char);
extern void rocksdb_options_set_allow_mmap_reads(
rocksdb_options_t*, unsigned char);
extern void rocksdb_options_set_allow_mmap_writes(
rocksdb_options_t*, unsigned char);
extern void rocksdb_options_set_is_fd_close_on_exec(
rocksdb_options_t*, unsigned char);
extern void rocksdb_options_set_skip_log_error_on_recovery(
rocksdb_options_t*, unsigned char);
extern void rocksdb_options_set_stats_dump_period_sec(
rocksdb_options_t*, unsigned int);
extern void rocksdb_options_set_block_size_deviation(
rocksdb_options_t*, int);
extern void rocksdb_options_set_advise_random_on_open(
rocksdb_options_t*, unsigned char);
extern void rocksdb_options_set_access_hint_on_compaction_start(
rocksdb_options_t*, int);
extern void rocksdb_options_set_use_adaptive_mutex(
rocksdb_options_t*, unsigned char);
extern void rocksdb_options_set_bytes_per_sync(
rocksdb_options_t*, uint64_t);
extern void rocksdb_options_set_verify_checksums_in_compaction(
rocksdb_options_t*, unsigned char);
extern void rocksdb_options_set_filter_deletes(
rocksdb_options_t*, unsigned char);
extern void rocksdb_options_set_max_sequential_skip_in_iterations(
rocksdb_options_t*, uint64_t);
extern void rocksdb_options_set_disable_data_sync(rocksdb_options_t*, int);
extern void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t*, int);
extern void rocksdb_options_set_disable_seek_compaction(rocksdb_options_t*, int);
extern void rocksdb_options_set_delete_obsolete_files_period_micros(
rocksdb_options_t*, uint64_t);
extern void rocksdb_options_set_source_compaction_factor(rocksdb_options_t*, int);
extern void rocksdb_options_prepare_for_bulk_load(rocksdb_options_t*);
extern void rocksdb_options_set_memtable_vector_rep(rocksdb_options_t*);
extern void rocksdb_options_set_hash_skip_list_rep(rocksdb_options_t*, size_t, int32_t, int32_t);
extern void rocksdb_options_set_hash_link_list_rep(rocksdb_options_t*, size_t);
extern void rocksdb_options_set_plain_table_factory(rocksdb_options_t*, uint32_t, int, double, size_t);
extern void rocksdb_options_set_max_bytes_for_level_base(rocksdb_options_t* opt, uint64_t n);
extern void rocksdb_options_set_stats_dump_period_sec(rocksdb_options_t* opt, unsigned int sec);
extern void rocksdb_options_set_min_level_to_compress(rocksdb_options_t* opt, int level);
extern void rocksdb_options_set_memtable_prefix_bloom_bits(
rocksdb_options_t*, uint32_t);
extern void rocksdb_options_set_memtable_prefix_bloom_probes(
rocksdb_options_t*, uint32_t);
extern void rocksdb_options_set_max_successive_merges(
rocksdb_options_t*, size_t);
extern void rocksdb_options_set_min_partial_merge_operands(
rocksdb_options_t*, uint32_t);
extern void rocksdb_options_set_bloom_locality(
rocksdb_options_t*, uint32_t);
extern void rocksdb_options_set_allow_thread_local(
rocksdb_options_t*, unsigned char);
extern void rocksdb_options_set_inplace_update_support(
rocksdb_options_t*, unsigned char);
extern void rocksdb_options_set_inplace_update_num_locks(
rocksdb_options_t*, size_t);
enum {
rocksdb_no_compression = 0,
rocksdb_snappy_compression = 1,
rocksdb_zlib_compression = 2,
rocksdb_bz2_compression = 3,
rocksdb_lz4_compression = 4,
rocksdb_lz4hc_compression = 5
};
extern void rocksdb_options_set_compression(rocksdb_options_t*, int);
enum {
rocksdb_level_compaction = 0,
rocksdb_universal_compaction = 1,
rocksdb_fifo_compaction = 2
};
extern void rocksdb_options_set_compaction_style(rocksdb_options_t*, int);
extern void rocksdb_options_set_universal_compaction_options(rocksdb_options_t*, rocksdb_universal_compaction_options_t*);
extern void rocksdb_options_set_fifo_compaction_options(rocksdb_options_t* opt,
rocksdb_fifo_compaction_options_t* fifo);
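
A typical setup sequence using the option setters above might look like the sketch below. make_options is an invented helper and every numeric value is purely illustrative, not a tuning recommendation.

#include "rocksdb/c.h"

/* Build an options object before rocksdb_open (values are illustrative). */
static rocksdb_options_t* make_options(void) {
  rocksdb_options_t* opts = rocksdb_options_create();
  rocksdb_options_set_create_if_missing(opts, 1);
  rocksdb_options_increase_parallelism(opts, 4);           /* background threads */
  rocksdb_options_set_write_buffer_size(opts, 64 << 20);   /* 64 MB memtable */
  rocksdb_options_set_max_open_files(opts, 512);
  rocksdb_options_set_compression(opts, rocksdb_snappy_compression);
  rocksdb_options_set_compaction_style(opts, rocksdb_level_compaction);
  return opts;
}
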
/* Compaction Filter */
extern rocksdb_compactionfilter_t* rocksdb_compactionfilter_create(
void* state,
void (*destructor)(void*),
unsigned char (*filter)(
void*,
int level,
const char* key, size_t key_length,
const char* existing_value, size_t value_length,
char** new_value, size_t *new_value_length,
unsigned char* value_changed),
const char* (*name)(void*));
extern void rocksdb_compactionfilter_destroy(rocksdb_compactionfilter_t*);
/* Compaction Filter Context */
extern unsigned char rocksdb_compactionfiltercontext_is_full_compaction(
rocksdb_compactionfiltercontext_t* context);
extern unsigned char rocksdb_compactionfiltercontext_is_manual_compaction(
rocksdb_compactionfiltercontext_t* context);
/* Compaction Filter Factory */
extern rocksdb_compactionfilterfactory_t*
rocksdb_compactionfilterfactory_create(
void* state, void (*destructor)(void*),
rocksdb_compactionfilter_t* (*create_compaction_filter)(
void*, rocksdb_compactionfiltercontext_t* context),
const char* (*name)(void*));
extern void rocksdb_compactionfilterfactory_destroy(
rocksdb_compactionfilterfactory_t*);
/* Compaction Filter V2 */
extern rocksdb_compactionfilterv2_t* rocksdb_compactionfilterv2_create(
void* state,
void (*destructor)(void*),
// num_keys specifies the number of array entries in every *list parameter.
// New values added to the new_values_list should be malloc'd and will be
// freed by the caller. Specify true in the to_delete_list to remove an
// entry during compaction; false to keep it.
void (*filter)(
void*, int level, size_t num_keys,
const char* const* keys_list, const size_t* keys_list_sizes,
const char* const* existing_values_list, const size_t* existing_values_list_sizes,
char** new_values_list, size_t* new_values_list_sizes,
unsigned char* to_delete_list),
const char* (*name)(void*));
extern void rocksdb_compactionfilterv2_destroy(rocksdb_compactionfilterv2_t*);
/* Compaction Filter Factory V2 */
extern rocksdb_compactionfilterfactoryv2_t* rocksdb_compactionfilterfactoryv2_create(
void* state,
rocksdb_slicetransform_t* prefix_extractor,
void (*destructor)(void*),
rocksdb_compactionfilterv2_t* (*create_compaction_filter_v2)(
void*, const rocksdb_compactionfiltercontext_t* context),
const char* (*name)(void*));
extern void rocksdb_compactionfilterfactoryv2_destroy(rocksdb_compactionfilterfactoryv2_t*);
/* Comparator */
extern rocksdb_comparator_t* rocksdb_comparator_create(
void* state,
void (*destructor)(void*),
int (*compare)(
void*,
const char* a, size_t alen,
const char* b, size_t blen),
const char* (*name)(void*));
extern void rocksdb_comparator_destroy(rocksdb_comparator_t*);
/* Filter policy */
extern rocksdb_filterpolicy_t* rocksdb_filterpolicy_create(
void* state,
void (*destructor)(void*),
char* (*create_filter)(
void*,
const char* const* key_array, const size_t* key_length_array,
int num_keys,
size_t* filter_length),
unsigned char (*key_may_match)(
void*,
const char* key, size_t length,
const char* filter, size_t filter_length),
void (*delete_filter)(
void*,
const char* filter, size_t filter_length),
const char* (*name)(void*));
extern void rocksdb_filterpolicy_destroy(rocksdb_filterpolicy_t*);
extern rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom(
int bits_per_key);
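
To enable the built-in bloom filter from the C API, create the policy and hand it to the options, as in this sketch. enable_bloom is an invented helper, 10 bits per key is only a common starting point, and the lifetime of the policy object is not addressed here.

#include "rocksdb/c.h"

/* Attach a bloom filter policy to the options (illustrative only). */
static void enable_bloom(rocksdb_options_t* opts) {
  rocksdb_filterpolicy_t* bloom = rocksdb_filterpolicy_create_bloom(10);
  rocksdb_options_set_filter_policy(opts, bloom);
}
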
/* Merge Operator */
extern rocksdb_mergeoperator_t* rocksdb_mergeoperator_create(
void* state,
void (*destructor)(void*),
char* (*full_merge)(
void*,
const char* key, size_t key_length,
const char* existing_value, size_t existing_value_length,
const char* const* operands_list, const size_t* operands_list_length,
int num_operands,
unsigned char* success, size_t* new_value_length),
char* (*partial_merge)(
void*,
const char* key, size_t key_length,
const char* const* operands_list, const size_t* operands_list_length,
int num_operands,
unsigned char* success, size_t* new_value_length),
void (*delete_value)(
void*,
const char* value, size_t value_length),
const char* (*name)(void*));
extern void rocksdb_mergeoperator_destroy(rocksdb_mergeoperator_t*);
/* Read options */
extern rocksdb_readoptions_t* rocksdb_readoptions_create();
extern void rocksdb_readoptions_destroy(rocksdb_readoptions_t*);
extern void rocksdb_readoptions_set_verify_checksums(
rocksdb_readoptions_t*,
unsigned char);
extern void rocksdb_readoptions_set_fill_cache(
rocksdb_readoptions_t*, unsigned char);
extern void rocksdb_readoptions_set_snapshot(
rocksdb_readoptions_t*,
const rocksdb_snapshot_t*);
extern void rocksdb_readoptions_set_read_tier(
rocksdb_readoptions_t*, int);
extern void rocksdb_readoptions_set_tailing(
rocksdb_readoptions_t*, unsigned char);
/* Write options */
extern rocksdb_writeoptions_t* rocksdb_writeoptions_create();
extern void rocksdb_writeoptions_destroy(rocksdb_writeoptions_t*);
extern void rocksdb_writeoptions_set_sync(
rocksdb_writeoptions_t*, unsigned char);
extern void rocksdb_writeoptions_disable_WAL(rocksdb_writeoptions_t* opt, int disable);
/* Flush options */
extern rocksdb_flushoptions_t* rocksdb_flushoptions_create();
extern void rocksdb_flushoptions_destroy(rocksdb_flushoptions_t*);
extern void rocksdb_flushoptions_set_wait(
rocksdb_flushoptions_t*, unsigned char);
/* Cache */
extern rocksdb_cache_t* rocksdb_cache_create_lru(size_t capacity);
extern void rocksdb_cache_destroy(rocksdb_cache_t* cache);
/* Env */
extern rocksdb_env_t* rocksdb_create_default_env();
extern void rocksdb_env_set_background_threads(rocksdb_env_t* env, int n);
extern void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n);
extern void rocksdb_env_destroy(rocksdb_env_t*);
/* SliceTransform */
extern rocksdb_slicetransform_t* rocksdb_slicetransform_create(
void* state,
void (*destructor)(void*),
char* (*transform)(
void*,
const char* key, size_t length,
size_t* dst_length),
unsigned char (*in_domain)(
void*,
const char* key, size_t length),
unsigned char (*in_range)(
void*,
const char* key, size_t length),
const char* (*name)(void*));
extern rocksdb_slicetransform_t* rocksdb_slicetransform_create_fixed_prefix(size_t);
extern void rocksdb_slicetransform_destroy(rocksdb_slicetransform_t*);
/* Universal Compaction options */
enum {
rocksdb_similar_size_compaction_stop_style = 0,
rocksdb_total_size_compaction_stop_style = 1
};
extern rocksdb_universal_compaction_options_t* rocksdb_universal_compaction_options_create();
extern void rocksdb_universal_compaction_options_set_size_ratio(
rocksdb_universal_compaction_options_t*, int);
extern void rocksdb_universal_compaction_options_set_min_merge_width(
rocksdb_universal_compaction_options_t*, int);
extern void rocksdb_universal_compaction_options_set_max_merge_width(
rocksdb_universal_compaction_options_t*, int);
extern void rocksdb_universal_compaction_options_set_max_size_amplification_percent(
rocksdb_universal_compaction_options_t*, int);
extern void rocksdb_universal_compaction_options_set_compression_size_percent(
rocksdb_universal_compaction_options_t*, int);
extern void rocksdb_universal_compaction_options_set_stop_style(
rocksdb_universal_compaction_options_t*, int);
extern void rocksdb_universal_compaction_options_destroy(
rocksdb_universal_compaction_options_t*);
extern rocksdb_fifo_compaction_options_t* rocksdb_fifo_compaction_options_create();
extern void rocksdb_fifo_compaction_options_set_max_table_files_size(
rocksdb_fifo_compaction_options_t* fifo_opts, uint64_t size);
extern void rocksdb_fifo_compaction_options_destroy(
rocksdb_fifo_compaction_options_t* fifo_opts);
extern int rocksdb_livefiles_count(
const rocksdb_livefiles_t*);
extern const char* rocksdb_livefiles_name(
const rocksdb_livefiles_t*,
int index);
extern int rocksdb_livefiles_level(
const rocksdb_livefiles_t*,
int index);
extern size_t rocksdb_livefiles_size(
const rocksdb_livefiles_t*,
int index);
extern const char* rocksdb_livefiles_smallestkey(
const rocksdb_livefiles_t*,
int index,
size_t* size);
extern const char* rocksdb_livefiles_largestkey(
const rocksdb_livefiles_t*,
int index,
size_t* size);
extern void rocksdb_livefiles_destroy(
const rocksdb_livefiles_t*);
#ifdef __cplusplus
} /* end extern "C" */
#endif
#endif /* STORAGE_ROCKSDB_INCLUDE_C_H_ */
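
Putting the column-family functions together, one way to open a database with an extra column family and write into it is sketched below. The path and the "users" family name are invented, and create_missing_column_families is set so the extra family is created on first open.

#include <stdio.h>
#include <stdlib.h>
#include "rocksdb/c.h"

int main(void) {
  char* err = NULL;
  rocksdb_options_t* db_opts = rocksdb_options_create();
  rocksdb_options_set_create_if_missing(db_opts, 1);
  rocksdb_options_set_create_missing_column_families(db_opts, 1);

  /* Every open must list the "default" column family; "users" is made up. */
  const char* cf_names[2] = {"default", "users"};
  const rocksdb_options_t* cf_opts[2] = {db_opts, db_opts};
  rocksdb_column_family_handle_t* cf_handles[2] = {NULL, NULL};

  rocksdb_t* db = rocksdb_open_column_families(
      db_opts, "/tmp/rocksdb_cf_example", 2, cf_names, cf_opts, cf_handles, &err);
  if (err != NULL) {
    fprintf(stderr, "open failed: %s\n", err);
    free(err);
    return 1;
  }

  rocksdb_writeoptions_t* wopts = rocksdb_writeoptions_create();
  rocksdb_put_cf(db, wopts, cf_handles[1], "id42", 4, "alice", 5, &err);
  if (err != NULL) { free(err); }

  rocksdb_writeoptions_destroy(wopts);
  rocksdb_column_family_handle_destroy(cf_handles[0]);
  rocksdb_column_family_handle_destroy(cf_handles[1]);
  rocksdb_close(db);
  rocksdb_options_destroy(db_opts);
  return 0;
}
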

140
include/rocksdb/cache.h Normal file

@@ -0,0 +1,140 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// A Cache is an interface that maps keys to values. It has internal
// synchronization and may be safely accessed concurrently from
// multiple threads. It may automatically evict entries to make room
// for new entries. Values have a specified charge against the cache
// capacity. For example, a cache where the values are variable
// length strings, may use the length of the string as the charge for
// the string.
//
// A builtin cache implementation with a least-recently-used eviction
// policy is provided. Clients may use their own implementations if
// they want something more sophisticated (like scan-resistance, a
// custom eviction policy, variable cache sizing, etc.)
#ifndef STORAGE_ROCKSDB_INCLUDE_CACHE_H_
#define STORAGE_ROCKSDB_INCLUDE_CACHE_H_
#include <memory>
#include <stdint.h>
#include "rocksdb/slice.h"
namespace rocksdb {
using std::shared_ptr;
class Cache;
// Create a new cache with a fixed size capacity. The cache is sharded
// to 2^numShardBits shards, by hash of the key. The total capacity
// is divided and evenly assigned to each shard. Inside each shard,
// the eviction is done in two passes: first try to free space by
// evicting, in least-recently-used order, entries that are among the
// removeScanCountLimit least recently used entries and are not referenced
// by anything other than the cache itself. If not enough space is freed,
// further free entries in least-recently-used order.
//
// The functions without parameter numShardBits and/or removeScanCountLimit
// use default values. removeScanCountLimit's default value is 0, which
// means a strict LRU order inside each shard.
extern shared_ptr<Cache> NewLRUCache(size_t capacity);
extern shared_ptr<Cache> NewLRUCache(size_t capacity, int numShardBits);
extern shared_ptr<Cache> NewLRUCache(size_t capacity, int numShardBits,
int removeScanCountLimit);
class Cache {
public:
Cache() { }
// Destroys all existing entries by calling the "deleter"
// function that was passed to the constructor.
virtual ~Cache();
// Opaque handle to an entry stored in the cache.
struct Handle { };
// Insert a mapping from key->value into the cache and assign it
// the specified charge against the total cache capacity.
//
// Returns a handle that corresponds to the mapping. The caller
// must call this->Release(handle) when the returned mapping is no
// longer needed.
//
// When the inserted entry is no longer needed, the key and
// value will be passed to "deleter".
virtual Handle* Insert(const Slice& key, void* value, size_t charge,
void (*deleter)(const Slice& key, void* value)) = 0;
// If the cache has no mapping for "key", returns nullptr.
//
// Else return a handle that corresponds to the mapping. The caller
// must call this->Release(handle) when the returned mapping is no
// longer needed.
virtual Handle* Lookup(const Slice& key) = 0;
// Release a mapping returned by a previous Lookup().
// REQUIRES: handle must not have been released yet.
// REQUIRES: handle must have been returned by a method on *this.
virtual void Release(Handle* handle) = 0;
// Return the value encapsulated in a handle returned by a
// successful Lookup().
// REQUIRES: handle must not have been released yet.
// REQUIRES: handle must have been returned by a method on *this.
virtual void* Value(Handle* handle) = 0;
// If the cache contains entry for key, erase it. Note that the
// underlying entry will be kept around until all existing handles
// to it have been released.
virtual void Erase(const Slice& key) = 0;
// Return a new numeric id. May be used by multiple clients who are
// sharing the same cache to partition the key space. Typically the
// client will allocate a new id at startup and prepend the id to
// its cache keys.
virtual uint64_t NewId() = 0;
// returns the maximum configured capacity of the cache
virtual size_t GetCapacity() const = 0;
// returns the memory size for the entries residing in the cache.
virtual size_t GetUsage() const = 0;
// Call this on shutdown if you want to speed it up. Cache will disown
// any underlying data and will not free it on delete. This call will leak
// memory - call this only if you're shutting down the process.
// Any attempt to use the cache after this call will fail terribly.
// Always delete the DB object before calling this method!
virtual void DisownData() {
// default implementation is noop
};
// Apply callback to all entries in the cache
// If thread_safe is true, it will also lock the accesses. Otherwise, it will
// access the cache without the lock held
virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t),
bool thread_safe) = 0;
private:
void LRU_Remove(Handle* e);
void LRU_Append(Handle* e);
void Unref(Handle* e);
struct Rep;
Rep* rep_;
// No copying allowed
Cache(const Cache&);
void operator=(const Cache&);
};
} // namespace rocksdb
#endif  // STORAGE_ROCKSDB_INCLUDE_CACHE_H_
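
Most clients simply hand the result of NewLRUCache to the DB options, but the Cache interface above can also be used directly. A sketch under that assumption, with a std::string payload, a matching deleter, and invented names throughout:

#include <cassert>
#include <memory>
#include <string>
#include "rocksdb/cache.h"
#include "rocksdb/slice.h"

using rocksdb::Cache;
using rocksdb::NewLRUCache;
using rocksdb::Slice;

// Deleter invoked by the cache when the entry is evicted or erased.
static void DeleteString(const Slice& /*key*/, void* value) {
  delete static_cast<std::string*>(value);
}

int main() {
  std::shared_ptr<Cache> cache = NewLRUCache(8 << 20 /* 8 MB capacity */);

  std::string* value = new std::string("cached payload");
  // Charge the entry by its payload size against the cache capacity.
  Cache::Handle* h =
      cache->Insert(Slice("a-key"), value, value->size(), DeleteString);
  cache->Release(h);  // release the handle returned by Insert

  Cache::Handle* found = cache->Lookup(Slice("a-key"));
  if (found != nullptr) {
    std::string* v = static_cast<std::string*>(cache->Value(found));
    assert(*v == "cached payload");
    cache->Release(found);  // every successful Lookup must be released
  }
  return 0;
}
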

198
include/rocksdb/compaction_filter.h Normal file

@@ -0,0 +1,198 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
#define STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
#include <string>
#include <vector>
namespace rocksdb {
class Slice;
class SliceTransform;
// Context information of a compaction run
struct CompactionFilterContext {
// Does this compaction run include all data files
bool is_full_compaction;
// Is this compaction requested by the client (true),
// or is it occurring as an automatic compaction process
bool is_manual_compaction;
};
// CompactionFilter allows an application to modify/delete a key-value at
// the time of compaction.
class CompactionFilter {
public:
// Context information of a compaction run
struct Context {
// Does this compaction run include all data files
bool is_full_compaction;
// Is this compaction requested by the client (true),
// or is it occurring as an automatic compaction process
bool is_manual_compaction;
};
virtual ~CompactionFilter() {}
// The compaction process invokes this
// method for each kv pair that is being compacted. A return value
// of false indicates that the kv pair should be preserved in the
// output of this compaction run and a return value of true
// indicates that this key-value pair should be removed from the
// output of the compaction. The application can inspect
// the existing value of the key and make a decision based on it.
//
// When the value is to be preserved, the application has the option
// to modify the existing_value and pass it back through new_value.
// value_changed needs to be set to true in this case.
//
// If multithreaded compaction is being used *and* a single CompactionFilter
// instance was supplied via Options::compaction_filter, this method may be
// called from different threads concurrently. The application must ensure
// that the call is thread-safe.
//
// If the CompactionFilter was created by a factory, then it will only ever
// be used by a single thread that is doing the compaction run, and this
// call does not need to be thread-safe. However, multiple filters may be
// in existence and operating concurrently.
virtual bool Filter(int level,
const Slice& key,
const Slice& existing_value,
std::string* new_value,
bool* value_changed) const = 0;
// Returns a name that identifies this compaction filter.
// The name will be printed to LOG file on start up for diagnosis.
virtual const char* Name() const = 0;
};
// CompactionFilterV2 buffers kv pairs sharing the same prefix and lets the
// application layer make individual decisions for all the kv pairs in the
// buffer.
class CompactionFilterV2 {
public:
virtual ~CompactionFilterV2() {}
// The compaction process invokes this method for all the kv pairs
// sharing the same prefix. It is a "roll-up" version of CompactionFilter.
//
// Each entry in the return vector indicates if the corresponding kv should
// be preserved in the output of this compaction run. The application can
// inspect the existing values of the keys and make decisions based on them.
//
// When a value is to be preserved, the application has the option
// to modify the entry in existing_values and pass it back through an entry
// in new_values. A corresponding values_changed entry needs to be set to
// true in this case. Note that the new_values vector contains only changed
// values, i.e. new_values.size() <= values_changed.size().
//
typedef std::vector<Slice> SliceVector;
virtual std::vector<bool> Filter(int level,
const SliceVector& keys,
const SliceVector& existing_values,
std::vector<std::string>* new_values,
std::vector<bool>* values_changed)
const = 0;
// Returns a name that identifies this compaction filter.
// The name will be printed to LOG file on start up for diagnosis.
virtual const char* Name() const = 0;
};
// Each compaction will create a new CompactionFilter allowing the
// application to know about different compactions
class CompactionFilterFactory {
public:
virtual ~CompactionFilterFactory() { }
virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
const CompactionFilter::Context& context) = 0;
// Returns a name that identifies this compaction filter factory.
virtual const char* Name() const = 0;
};
// Default implementation of CompactionFilterFactory which does not
// return any filter
class DefaultCompactionFilterFactory : public CompactionFilterFactory {
public:
virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
const CompactionFilter::Context& context) override {
return std::unique_ptr<CompactionFilter>(nullptr);
}
virtual const char* Name() const override {
return "DefaultCompactionFilterFactory";
}
};
// Each compaction will create a new CompactionFilterV2
//
// CompactionFilterFactoryV2 enables application to specify a prefix and use
// CompactionFilterV2 to filter kv-pairs in batches. Each batch contains all
// the kv-pairs sharing the same prefix.
//
// This is useful for applications that need to group kv-pairs in the
// compaction filter to make a purge/no-purge decision. For example, suppose
// the key prefix is a user id and the rest of the key represents the type of
// value. This batching filter comes in handy if the application's compaction
// filter requires knowledge of all types of values for any user id.
//
class CompactionFilterFactoryV2 {
public:
// NOTE: CompactionFilterFactoryV2 will not delete prefix_extractor
explicit CompactionFilterFactoryV2(const SliceTransform* prefix_extractor)
: prefix_extractor_(prefix_extractor) { }
virtual ~CompactionFilterFactoryV2() { }
virtual std::unique_ptr<CompactionFilterV2> CreateCompactionFilterV2(
const CompactionFilterContext& context) = 0;
// Returns a name that identifies this compaction filter factory.
virtual const char* Name() const = 0;
const SliceTransform* GetPrefixExtractor() const {
return prefix_extractor_;
}
void SetPrefixExtractor(const SliceTransform* prefix_extractor) {
prefix_extractor_ = prefix_extractor;
}
private:
// Prefix extractor for compaction filter v2
// Keys sharing the same prefix will be buffered internally.
// Client can implement a Filter callback function to operate on the buffer
const SliceTransform* prefix_extractor_;
};
// Default implementation of CompactionFilterFactoryV2 which does not
// return any filter
class DefaultCompactionFilterFactoryV2 : public CompactionFilterFactoryV2 {
public:
explicit DefaultCompactionFilterFactoryV2()
: CompactionFilterFactoryV2(nullptr) { }
virtual std::unique_ptr<CompactionFilterV2>
CreateCompactionFilterV2(
const CompactionFilterContext& context) override {
return std::unique_ptr<CompactionFilterV2>(nullptr);
}
virtual const char* Name() const override {
return "DefaultCompactionFilterFactoryV2";
}
};
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
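
As an illustration of the single-key interface, the sketch below defines a compaction filter that drops entries whose existing value is empty, plus a factory that hands one fresh filter to each compaction run. Both class names are invented, and wiring the factory into the DB options is left out.

#include <memory>
#include <string>
#include "rocksdb/compaction_filter.h"
#include "rocksdb/slice.h"

namespace example {

// Removes any key whose existing value is empty; everything else is kept as-is.
class DropEmptyValueFilter : public rocksdb::CompactionFilter {
 public:
  virtual bool Filter(int /*level*/, const rocksdb::Slice& /*key*/,
                      const rocksdb::Slice& existing_value,
                      std::string* /*new_value*/,
                      bool* value_changed) const override {
    *value_changed = false;          // the value is never rewritten
    return existing_value.empty();   // true == drop from the compaction output
  }
  virtual const char* Name() const override { return "DropEmptyValueFilter"; }
};

// Hands one fresh filter to every compaction run.
class DropEmptyValueFilterFactory : public rocksdb::CompactionFilterFactory {
 public:
  virtual std::unique_ptr<rocksdb::CompactionFilter> CreateCompactionFilter(
      const rocksdb::CompactionFilter::Context& /*context*/) override {
    return std::unique_ptr<rocksdb::CompactionFilter>(new DropEmptyValueFilter);
  }
  virtual const char* Name() const override {
    return "DropEmptyValueFilterFactory";
  }
};

}  // namespace example
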

67
include/rocksdb/comparator.h Normal file

@@ -0,0 +1,67 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_
#define STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_
#include <string>
namespace rocksdb {
class Slice;
// A Comparator object provides a total order across slices that are
// used as keys in an sstable or a database. A Comparator implementation
// must be thread-safe since rocksdb may invoke its methods concurrently
// from multiple threads.
class Comparator {
public:
virtual ~Comparator();
// Three-way comparison. Returns value:
// < 0 iff "a" < "b",
// == 0 iff "a" == "b",
// > 0 iff "a" > "b"
virtual int Compare(const Slice& a, const Slice& b) const = 0;
// The name of the comparator. Used to check for comparator
// mismatches (i.e., a DB created with one comparator is
// accessed using a different comparator).
//
// The client of this package should switch to a new name whenever
// the comparator implementation changes in a way that will cause
// the relative ordering of any two keys to change.
//
// Names starting with "rocksdb." are reserved and should not be used
// by any clients of this package.
virtual const char* Name() const = 0;
// Advanced functions: these are used to reduce the space requirements
// for internal data structures like index blocks.
// If *start < limit, changes *start to a short string in [start,limit).
// Simple comparator implementations may return with *start unchanged,
// i.e., an implementation of this method that does nothing is correct.
virtual void FindShortestSeparator(
std::string* start,
const Slice& limit) const = 0;
// Changes *key to a short string >= *key.
// Simple comparator implementations may return with *key unchanged,
// i.e., an implementation of this method that does nothing is correct.
virtual void FindShortSuccessor(std::string* key) const = 0;
};
// Return a builtin comparator that uses lexicographic byte-wise
// ordering. The result remains the property of this module and
// must not be deleted.
extern const Comparator* BytewiseComparator();
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_
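
A custom comparator only needs the four virtual methods above. The sketch below reverses the built-in byte-wise order and leaves the two space-saving hooks as no-ops, which the comments above say is always correct; the class name is invented, and it would be attached through the comparator field of the DB options.

#include <string>
#include "rocksdb/comparator.h"
#include "rocksdb/slice.h"

namespace example {

// Orders keys in reverse byte-wise order by negating the built-in comparison.
class ReverseBytewiseComparator : public rocksdb::Comparator {
 public:
  virtual int Compare(const rocksdb::Slice& a,
                      const rocksdb::Slice& b) const override {
    return -rocksdb::BytewiseComparator()->Compare(a, b);
  }
  // A distinct name, because the key ordering differs from the default.
  virtual const char* Name() const override {
    return "example.ReverseBytewiseComparator";
  }
  // Doing nothing here is allowed; it only costs some extra index space.
  virtual void FindShortestSeparator(std::string* /*start*/,
                                     const rocksdb::Slice& /*limit*/) const override {}
  virtual void FindShortSuccessor(std::string* /*key*/) const override {}
};

}  // namespace example
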

507
include/rocksdb/db.h Normal file

@@ -0,0 +1,507 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_ROCKSDB_INCLUDE_DB_H_
#define STORAGE_ROCKSDB_INCLUDE_DB_H_
#include <stdint.h>
#include <stdio.h>
#include <memory>
#include <vector>
#include <string>
#include <unordered_map>
#include "rocksdb/version.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "rocksdb/types.h"
#include "rocksdb/transaction_log.h"
namespace rocksdb {
using std::unique_ptr;
class ColumnFamilyHandle {
public:
virtual ~ColumnFamilyHandle() {}
};
extern const std::string kDefaultColumnFamilyName;
struct ColumnFamilyDescriptor {
std::string name;
ColumnFamilyOptions options;
ColumnFamilyDescriptor()
: name(kDefaultColumnFamilyName), options(ColumnFamilyOptions()) {}
ColumnFamilyDescriptor(const std::string& _name,
const ColumnFamilyOptions& _options)
: name(_name), options(_options) {}
};
static const int kMajorVersion = __ROCKSDB_MAJOR__;
static const int kMinorVersion = __ROCKSDB_MINOR__;
struct Options;
struct ReadOptions;
struct WriteOptions;
struct FlushOptions;
struct TableProperties;
class WriteBatch;
class Env;
// Metadata associated with each SST file.
struct LiveFileMetaData {
std::string column_family_name; // Name of the column family
std::string db_path;
std::string name; // Name of the file
int level; // Level at which this file resides.
size_t size; // File size in bytes.
std::string smallestkey; // Smallest user defined key in the file.
std::string largestkey; // Largest user defined key in the file.
SequenceNumber smallest_seqno; // smallest seqno in file
SequenceNumber largest_seqno; // largest seqno in file
};
// Abstract handle to particular state of a DB.
// A Snapshot is an immutable object and can therefore be safely
// accessed from multiple threads without any external synchronization.
class Snapshot {
protected:
virtual ~Snapshot();
};
// A range of keys
struct Range {
Slice start; // Included in the range
Slice limit; // Not included in the range
Range() { }
Range(const Slice& s, const Slice& l) : start(s), limit(l) { }
};
// A collection of table properties objects, where
// key: is the table's file name.
// value: the table properties object of the given table.
typedef std::unordered_map<std::string, std::shared_ptr<const TableProperties>>
TablePropertiesCollection;
// A DB is a persistent ordered map from keys to values.
// A DB is safe for concurrent access from multiple threads without
// any external synchronization.
class DB {
public:
// Open the database with the specified "name".
// Stores a pointer to a heap-allocated database in *dbptr and returns
// OK on success.
// Stores nullptr in *dbptr and returns a non-OK status on error.
// Caller should delete *dbptr when it is no longer needed.
static Status Open(const Options& options,
const std::string& name,
DB** dbptr);
// Open the database for read only. All DB interfaces
// that modify data, like put/delete, will return error.
// If the db is opened in read only mode, then no compactions
// will happen.
static Status OpenForReadOnly(const Options& options,
const std::string& name, DB** dbptr,
bool error_if_log_file_exist = false);
// Open the database for read only with column families. When opening the DB
// read-only, you can specify only a subset of the column families in the
// database that should be opened. However, you always need to specify the
// default column family. The default column family name is 'default' and it is
// stored in rocksdb::kDefaultColumnFamilyName.
static Status OpenForReadOnly(
const DBOptions& db_options, const std::string& name,
const std::vector<ColumnFamilyDescriptor>& column_families,
std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
bool error_if_log_file_exist = false);
// Open DB with column families.
// db_options specify database specific options
// column_families is the vector of all column families in the database,
// containing column family name and options. You need to open ALL column
// families in the database. To get the list of column families, you can use
// ListColumnFamilies(). Also, you can open only a subset of column families
// for read-only access.
// The default column family name is 'default' and it's stored
// in rocksdb::kDefaultColumnFamilyName.
// If everything is OK, handles will on return be the same size
// as column_families --- handles[i] will be a handle that you
// will use to operate on column family column_family[i]
static Status Open(const DBOptions& db_options, const std::string& name,
const std::vector<ColumnFamilyDescriptor>& column_families,
std::vector<ColumnFamilyHandle*>* handles, DB** dbptr);
// ListColumnFamilies will open the DB specified by argument name
// and return the list of all column families in that DB
// through column_families argument. The ordering of
// column families in column_families is unspecified.
static Status ListColumnFamilies(const DBOptions& db_options,
const std::string& name,
std::vector<std::string>* column_families);
DB() { }
virtual ~DB();
// Create a column_family and return the handle of column family
// through the argument handle.
virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
const std::string& column_family_name,
ColumnFamilyHandle** handle);
// Drop a column family specified by column_family handle. This call
// only records a drop record in the manifest and prevents the column
// family from flushing and compacting.
virtual Status DropColumnFamily(ColumnFamilyHandle* column_family);
// Set the database entry for "key" to "value".
// If "key" already exists, it will be overwritten.
// Returns OK on success, and a non-OK status on error.
// Note: consider setting options.sync = true.
virtual Status Put(const WriteOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value) = 0;
virtual Status Put(const WriteOptions& options, const Slice& key,
const Slice& value) {
return Put(options, DefaultColumnFamily(), key, value);
}
// Remove the database entry (if any) for "key". Returns OK on
// success, and a non-OK status on error. It is not an error if "key"
// did not exist in the database.
// Note: consider setting options.sync = true.
virtual Status Delete(const WriteOptions& options,
ColumnFamilyHandle* column_family,
const Slice& key) = 0;
virtual Status Delete(const WriteOptions& options, const Slice& key) {
return Delete(options, DefaultColumnFamily(), key);
}
// Merge the database entry for "key" with "value". Returns OK on success,
// and a non-OK status on error. The semantics of this operation is
// determined by the user provided merge_operator when opening DB.
// Note: consider setting options.sync = true.
virtual Status Merge(const WriteOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value) = 0;
virtual Status Merge(const WriteOptions& options, const Slice& key,
const Slice& value) {
return Merge(options, DefaultColumnFamily(), key, value);
}
// Apply the specified updates to the database.
// Returns OK on success, non-OK on failure.
// Note: consider setting options.sync = true.
virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0;
// If the database contains an entry for "key" store the
// corresponding value in *value and return OK.
//
// If there is no entry for "key" leave *value unchanged and return
// a status for which Status::IsNotFound() returns true.
//
// May return some other Status on an error.
virtual Status Get(const ReadOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
std::string* value) = 0;
virtual Status Get(const ReadOptions& options, const Slice& key, std::string* value) {
return Get(options, DefaultColumnFamily(), key, value);
}
// If keys[i] does not exist in the database, then the i'th returned
// status will be one for which Status::IsNotFound() is true, and
// (*values)[i] will be set to some arbitrary value (often ""). Otherwise,
// the i'th returned status will have Status::ok() true, and (*values)[i]
// will store the value associated with keys[i].
//
// (*values) will always be resized to be the same size as (keys).
// Similarly, the number of returned statuses will be the number of keys.
// Note: keys will not be "de-duplicated". Duplicate keys will return
// duplicate values in order.
virtual std::vector<Status> MultiGet(
const ReadOptions& options,
const std::vector<ColumnFamilyHandle*>& column_family,
const std::vector<Slice>& keys, std::vector<std::string>* values) = 0;
virtual std::vector<Status> MultiGet(const ReadOptions& options,
const std::vector<Slice>& keys,
std::vector<std::string>* values) {
return MultiGet(options, std::vector<ColumnFamilyHandle*>(
keys.size(), DefaultColumnFamily()),
keys, values);
}
// If the key definitely does not exist in the database, then this method
// returns false, else true. If the caller wants to obtain value when the key
// is found in memory, a bool for 'value_found' must be passed. 'value_found'
// will be true on return if value has been set properly.
// This check is potentially lighter-weight than invoking DB::Get(). One way
// to make this lighter weight is to avoid doing any IOs.
// Default implementation here returns true and sets 'value_found' to false
virtual bool KeyMayExist(const ReadOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
std::string* value, bool* value_found = nullptr) {
if (value_found != nullptr) {
*value_found = false;
}
return true;
}
virtual bool KeyMayExist(const ReadOptions& options, const Slice& key,
std::string* value, bool* value_found = nullptr) {
return KeyMayExist(options, DefaultColumnFamily(), key, value, value_found);
}
// Return a heap-allocated iterator over the contents of the database.
// The result of NewIterator() is initially invalid (caller must
// call one of the Seek methods on the iterator before using it).
//
// Caller should delete the iterator when it is no longer needed.
// The returned iterator should be deleted before this db is deleted.
virtual Iterator* NewIterator(const ReadOptions& options,
ColumnFamilyHandle* column_family) = 0;
virtual Iterator* NewIterator(const ReadOptions& options) {
return NewIterator(options, DefaultColumnFamily());
}
// Returns iterators from a consistent database state across multiple
// column families. Iterators are heap allocated and need to be deleted
// before the db is deleted
virtual Status NewIterators(
const ReadOptions& options,
const std::vector<ColumnFamilyHandle*>& column_families,
std::vector<Iterator*>* iterators) = 0;
// Return a handle to the current DB state. Iterators created with
// this handle will all observe a stable snapshot of the current DB
// state. The caller must call ReleaseSnapshot(result) when the
// snapshot is no longer needed.
//
// nullptr will be returned if the DB fails to take a snapshot or does
// not support snapshot.
virtual const Snapshot* GetSnapshot() = 0;
// Release a previously acquired snapshot. The caller must not
// use "snapshot" after this call.
virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0;
// DB implementations can export properties about their state
// via this method. If "property" is a valid property understood by this
// DB implementation, fills "*value" with its current value and returns
// true. Otherwise returns false.
//
//
// Valid property names include:
//
// "rocksdb.num-files-at-level<N>" - return the number of files at level <N>,
// where <N> is an ASCII representation of a level number (e.g. "0").
// "rocksdb.stats" - returns a multi-line string that describes statistics
// about the internal operation of the DB.
// "rocksdb.sstables" - returns a multi-line string that describes all
// of the sstables that make up the db contents.
virtual bool GetProperty(ColumnFamilyHandle* column_family,
const Slice& property, std::string* value) = 0;
virtual bool GetProperty(const Slice& property, std::string* value) {
return GetProperty(DefaultColumnFamily(), property, value);
}
// Similar to GetProperty(), but only works for a subset of properties whose
// return value is an integer. Return the value by integer.
virtual bool GetIntProperty(ColumnFamilyHandle* column_family,
const Slice& property, uint64_t* value) = 0;
virtual bool GetIntProperty(const Slice& property, uint64_t* value) {
return GetIntProperty(DefaultColumnFamily(), property, value);
}
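  //
  // Example (illustrative sketch; assumes a DB* named db):
  //
  //   std::string stats;
  //   if (db->GetProperty("rocksdb.stats", &stats)) {
  //     // 'stats' now holds a multi-line statistics dump.
  //   }
  //   std::string num_files;
  //   db->GetProperty("rocksdb.num-files-at-level0", &num_files);
  //   // GetIntProperty() works the same way for integer-valued properties.
  //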
  // For each i in [0,n-1], store in "sizes[i]" the approximate
// file system space used by keys in "[range[i].start .. range[i].limit)".
//
// Note that the returned sizes measure file system space usage, so
// if the user data compresses by a factor of ten, the returned
// sizes will be one-tenth the size of the corresponding user data size.
//
// The results may not include the sizes of recently written data.
virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
const Range* range, int n,
uint64_t* sizes) = 0;
virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes) {
GetApproximateSizes(DefaultColumnFamily(), range, n, sizes);
}
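  //
  // Example (illustrative sketch; assumes a DB* named db and the Range struct
  // declared earlier in this header):
  //
  //   rocksdb::Range ranges[2] = {rocksdb::Range("a", "c"),
  //                               rocksdb::Range("x", "z")};
  //   uint64_t sizes[2];
  //   db->GetApproximateSizes(ranges, 2, sizes);
  //   // sizes[0] and sizes[1] now hold approximate on-disk sizes in bytes.
  //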
// Compact the underlying storage for the key range [*begin,*end].
  // The actual compaction interval might be a superset of [*begin, *end].
// In particular, deleted and overwritten versions are discarded,
// and the data is rearranged to reduce the cost of operations
// needed to access the data. This operation should typically only
// be invoked by users who understand the underlying implementation.
//
// begin==nullptr is treated as a key before all keys in the database.
// end==nullptr is treated as a key after all keys in the database.
// Therefore the following call will compact the entire database:
// db->CompactRange(nullptr, nullptr);
// Note that after the entire database is compacted, all data are pushed
// down to the last level containing any data. If the total data size
// after compaction is reduced, that level might not be appropriate for
  // hosting all the files. In this case, the client can set reduce_level
  // to true to move the files back to the minimum level capable of holding
  // the data set, or to a given level (specified by a non-negative
  // target_level).
// Compaction outputs should be placed in options.db_paths[target_path_id].
// Behavior is undefined if target_path_id is out of range.
virtual Status CompactRange(ColumnFamilyHandle* column_family,
const Slice* begin, const Slice* end,
bool reduce_level = false, int target_level = -1,
uint32_t target_path_id = 0) = 0;
virtual Status CompactRange(const Slice* begin, const Slice* end,
bool reduce_level = false, int target_level = -1,
uint32_t target_path_id = 0) {
return CompactRange(DefaultColumnFamily(), begin, end, reduce_level,
target_level, target_path_id);
}
// Number of levels used for this DB.
virtual int NumberLevels(ColumnFamilyHandle* column_family) = 0;
virtual int NumberLevels() { return NumberLevels(DefaultColumnFamily()); }
// Maximum level to which a new compacted memtable is pushed if it
// does not create overlap.
virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) = 0;
virtual int MaxMemCompactionLevel() {
return MaxMemCompactionLevel(DefaultColumnFamily());
}
// Number of files in level-0 that would stop writes.
virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) = 0;
virtual int Level0StopWriteTrigger() {
return Level0StopWriteTrigger(DefaultColumnFamily());
}
// Get DB name -- the exact same name that was provided as an argument to
// DB::Open()
virtual const std::string& GetName() const = 0;
// Get Env object from the DB
virtual Env* GetEnv() const = 0;
// Get DB Options that we use
virtual const Options& GetOptions(ColumnFamilyHandle* column_family)
const = 0;
virtual const Options& GetOptions() const {
return GetOptions(DefaultColumnFamily());
}
// Flush all mem-table data.
virtual Status Flush(const FlushOptions& options,
ColumnFamilyHandle* column_family) = 0;
virtual Status Flush(const FlushOptions& options) {
return Flush(options, DefaultColumnFamily());
}
// The sequence number of the most recent transaction.
virtual SequenceNumber GetLatestSequenceNumber() const = 0;
#ifndef ROCKSDB_LITE
// Prevent file deletions. Compactions will continue to occur,
// but no obsolete files will be deleted. Calling this multiple
  // times has the same effect as calling it once.
virtual Status DisableFileDeletions() = 0;
// Allow compactions to delete obsolete files.
// If force == true, the call to EnableFileDeletions() will guarantee that
// file deletions are enabled after the call, even if DisableFileDeletions()
// was called multiple times before.
// If force == false, EnableFileDeletions will only enable file deletion
// after it's been called at least as many times as DisableFileDeletions(),
// enabling the two methods to be called by two threads concurrently without
// synchronization -- i.e., file deletions will be enabled only after both
// threads call EnableFileDeletions()
virtual Status EnableFileDeletions(bool force = true) = 0;
  // GetLiveFiles followed by GetSortedWalFiles can generate a lossless backup.
  // THIS METHOD IS DEPRECATED. Use GetLiveFilesMetaData to get more
  // detailed information on the live files.
// Retrieve the list of all files in the database. The files are
// relative to the dbname and are not absolute paths. The valid size of the
// manifest file is returned in manifest_file_size. The manifest file is an
// ever growing file, but only the portion specified by manifest_file_size is
// valid for this snapshot.
// Setting flush_memtable to true does Flush before recording the live files.
  // Setting flush_memtable to false is useful when we don't want to wait for
  // the flush, which may in turn have to wait for compaction to complete,
  // taking an indeterminate amount of time.
  //
  // In case you have multiple column families, even if flush_memtable is true,
  // you still need to call GetSortedWalFiles after GetLiveFiles to compensate
  // for new data that arrived in already-flushed column families while other
  // column families were flushing.
virtual Status GetLiveFiles(std::vector<std::string>&,
uint64_t* manifest_file_size,
bool flush_memtable = true) = 0;
// Retrieve the sorted list of all wal files with earliest file first
virtual Status GetSortedWalFiles(VectorLogPtr& files) = 0;
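  //
  // Example (illustrative sketch of the backup workflow described above;
  // assumes a DB* named db and the VectorLogPtr type declared in
  // rocksdb/transaction_log.h):
  //
  //   db->DisableFileDeletions();
  //   std::vector<std::string> live_files;
  //   uint64_t manifest_size = 0;
  //   db->GetLiveFiles(live_files, &manifest_size, true /* flush_memtable */);
  //   rocksdb::VectorLogPtr wal_files;
  //   db->GetSortedWalFiles(wal_files);
  //   // ... copy live_files (only manifest_size bytes of the manifest) and
  //   // the wal_files to the backup location ...
  //   db->EnableFileDeletions(false /* force */);
  //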
// Sets iter to an iterator that is positioned at a write-batch containing
  // seq_number. If the sequence number is non-existent, it returns an iterator
  // at the first available seq_no after the requested seq_no.
  // Returns Status::OK if the iterator is valid.
  // Must set WAL_ttl_seconds or WAL_size_limit_MB to large values to
  // use this API, else the WAL files will get
  // cleared aggressively and the iterator might become invalid before
  // an update is read.
virtual Status GetUpdatesSince(
SequenceNumber seq_number, unique_ptr<TransactionLogIterator>* iter,
const TransactionLogIterator::ReadOptions&
read_options = TransactionLogIterator::ReadOptions()) = 0;
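  //
  // Example (illustrative sketch; assumes a DB* named db and the
  // TransactionLogIterator / BatchResult interfaces declared in
  // rocksdb/transaction_log.h):
  //
  //   std::unique_ptr<rocksdb::TransactionLogIterator> iter;
  //   rocksdb::Status s = db->GetUpdatesSince(100 /* seq_number */, &iter);
  //   while (s.ok() && iter->Valid()) {
  //     rocksdb::BatchResult batch = iter->GetBatch();
  //     // batch.sequence is the first sequence number of the write batch
  //     // held in batch.writeBatchPtr.
  //     iter->Next();
  //   }
  //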
// Delete the file name from the db directory and update the internal state to
  // reflect that. Supports deletion of sst and log files only. 'name' must be
  // a path relative to the db directory, e.g. 000001.sst, /archive/000003.log.
virtual Status DeleteFile(std::string name) = 0;
// Returns a list of all table files with their level, start key
// and end key
virtual void GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {}
#endif // ROCKSDB_LITE
  // Stores in 'identity' the globally unique ID created at database creation
  // time by invoking Env::GenerateUniqueId(). Returns Status::OK if the
  // identity could be set properly.
virtual Status GetDbIdentity(std::string& identity) = 0;
// Returns default column family handle
virtual ColumnFamilyHandle* DefaultColumnFamily() const = 0;
#ifndef ROCKSDB_LITE
virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
TablePropertiesCollection* props) = 0;
virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) {
return GetPropertiesOfAllTables(DefaultColumnFamily(), props);
}
#endif // ROCKSDB_LITE
private:
// No copying allowed
DB(const DB&);
void operator=(const DB&);
};
// Destroy the contents of the specified database.
// Be very careful using this method.
Status DestroyDB(const std::string& name, const Options& options);
#ifndef ROCKSDB_LITE
// If a DB cannot be opened, you may attempt to call this method to
// resurrect as much of the contents of the database as possible.
// Some data may be lost, so be careful when calling this function
// on a database that contains important information.
Status RepairDB(const std::string& dbname, const Options& options);
#endif
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_INCLUDE_DB_H_

798
include/rocksdb/env.h Normal file
View File

@@ -0,0 +1,798 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// An Env is an interface used by the rocksdb implementation to access
// operating system functionality like the filesystem etc. Callers
// may wish to provide a custom Env object when opening a database to
// get fine-grained control; e.g., to rate limit file system operations.
//
// All Env implementations are safe for concurrent access from
// multiple threads without any external synchronization.
#ifndef STORAGE_ROCKSDB_INCLUDE_ENV_H_
#define STORAGE_ROCKSDB_INCLUDE_ENV_H_
#include <cstdarg>
#include <string>
#include <memory>
#include <vector>
#include <stdint.h>
#include "rocksdb/status.h"
namespace rocksdb {
class FileLock;
class Logger;
class RandomAccessFile;
class SequentialFile;
class Slice;
class WritableFile;
class RandomRWFile;
class Directory;
struct DBOptions;
class RateLimiter;
using std::unique_ptr;
using std::shared_ptr;
// Options while opening a file to read/write
struct EnvOptions {
// construct with default Options
EnvOptions();
// construct from Options
explicit EnvOptions(const DBOptions& options);
// If true, then allow caching of data in environment buffers
bool use_os_buffer = true;
// If true, then use mmap to read data
bool use_mmap_reads = false;
// If true, then use mmap to write data
bool use_mmap_writes = true;
// If true, set the FD_CLOEXEC on open fd.
bool set_fd_cloexec = true;
// Allows OS to incrementally sync files to disk while they are being
// written, in the background. Issue one request for every bytes_per_sync
// written. 0 turns it off.
// Default: 0
uint64_t bytes_per_sync = 0;
// If true, we will preallocate the file with FALLOC_FL_KEEP_SIZE flag, which
// means that file size won't change as part of preallocation.
// If false, preallocation will also change the file size. This option will
// improve the performance in workloads where you sync the data on every
// write. By default, we set it to true for MANIFEST writes and false for
// WAL writes
bool fallocate_with_keep_size = true;
// If not nullptr, write rate limiting is enabled for flush and compaction
RateLimiter* rate_limiter = nullptr;
};
class Env {
public:
Env() { }
virtual ~Env();
// Return a default environment suitable for the current operating
// system. Sophisticated users may wish to provide their own Env
// implementation instead of relying on this default environment.
//
// The result of Default() belongs to rocksdb and must never be deleted.
static Env* Default();
// Create a brand new sequentially-readable file with the specified name.
// On success, stores a pointer to the new file in *result and returns OK.
// On failure stores nullptr in *result and returns non-OK. If the file does
// not exist, returns a non-OK status.
//
// The returned file will only be accessed by one thread at a time.
virtual Status NewSequentialFile(const std::string& fname,
unique_ptr<SequentialFile>* result,
const EnvOptions& options)
= 0;
// Create a brand new random access read-only file with the
// specified name. On success, stores a pointer to the new file in
// *result and returns OK. On failure stores nullptr in *result and
// returns non-OK. If the file does not exist, returns a non-OK
// status.
//
// The returned file may be concurrently accessed by multiple threads.
virtual Status NewRandomAccessFile(const std::string& fname,
unique_ptr<RandomAccessFile>* result,
const EnvOptions& options)
= 0;
// Create an object that writes to a new file with the specified
// name. Deletes any existing file with the same name and creates a
// new file. On success, stores a pointer to the new file in
// *result and returns OK. On failure stores nullptr in *result and
// returns non-OK.
//
// The returned file will only be accessed by one thread at a time.
virtual Status NewWritableFile(const std::string& fname,
unique_ptr<WritableFile>* result,
const EnvOptions& options) = 0;
// Create an object that both reads and writes to a file on
// specified offsets (random access). If file already exists,
// does not overwrite it. On success, stores a pointer to the
// new file in *result and returns OK. On failure stores nullptr
// in *result and returns non-OK.
virtual Status NewRandomRWFile(const std::string& fname,
unique_ptr<RandomRWFile>* result,
const EnvOptions& options) = 0;
// Create an object that represents a directory. Will fail if directory
// doesn't exist. If the directory exists, it will open the directory
// and create a new Directory object.
//
// On success, stores a pointer to the new Directory in
// *result and returns OK. On failure stores nullptr in *result and
// returns non-OK.
virtual Status NewDirectory(const std::string& name,
unique_ptr<Directory>* result) = 0;
// Returns true iff the named file exists.
virtual bool FileExists(const std::string& fname) = 0;
// Store in *result the names of the children of the specified directory.
// The names are relative to "dir".
  // Original contents of *result are dropped.
virtual Status GetChildren(const std::string& dir,
std::vector<std::string>* result) = 0;
// Delete the named file.
virtual Status DeleteFile(const std::string& fname) = 0;
// Create the specified directory. Returns error if directory exists.
virtual Status CreateDir(const std::string& dirname) = 0;
  // Creates a directory if it is missing. Returns OK if the directory exists
  // or was successfully created.
virtual Status CreateDirIfMissing(const std::string& dirname) = 0;
// Delete the specified directory.
virtual Status DeleteDir(const std::string& dirname) = 0;
// Store the size of fname in *file_size.
virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) = 0;
// Store the last modification time of fname in *file_mtime.
virtual Status GetFileModificationTime(const std::string& fname,
uint64_t* file_mtime) = 0;
// Rename file src to target.
virtual Status RenameFile(const std::string& src,
const std::string& target) = 0;
// Lock the specified file. Used to prevent concurrent access to
// the same db by multiple processes. On failure, stores nullptr in
// *lock and returns non-OK.
//
// On success, stores a pointer to the object that represents the
// acquired lock in *lock and returns OK. The caller should call
// UnlockFile(*lock) to release the lock. If the process exits,
// the lock will be automatically released.
//
// If somebody else already holds the lock, finishes immediately
// with a failure. I.e., this call does not wait for existing locks
// to go away.
//
// May create the named file if it does not already exist.
virtual Status LockFile(const std::string& fname, FileLock** lock) = 0;
// Release the lock acquired by a previous successful call to LockFile.
// REQUIRES: lock was returned by a successful LockFile() call
// REQUIRES: lock has not already been unlocked.
virtual Status UnlockFile(FileLock* lock) = 0;
// Priority for scheduling job in thread pool
enum Priority { LOW, HIGH, TOTAL };
// Priority for scheduling job in thread pool
enum IOPriority {
IO_LOW = 0,
IO_HIGH = 1,
IO_TOTAL = 2
};
// Arrange to run "(*function)(arg)" once in a background thread, in
// the thread pool specified by pri. By default, jobs go to the 'LOW'
// priority thread pool.
// "function" may run in an unspecified thread. Multiple functions
// added to the same Env may run concurrently in different threads.
// I.e., the caller may not assume that background work items are
// serialized.
virtual void Schedule(
void (*function)(void* arg),
void* arg,
Priority pri = LOW) = 0;
// Start a new thread, invoking "function(arg)" within the new thread.
// When "function(arg)" returns, the thread will be destroyed.
virtual void StartThread(void (*function)(void* arg), void* arg) = 0;
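  //
  // Example (illustrative sketch; 'BackgroundWork', 'ctx', and 'env' are
  // hypothetical names used only for illustration):
  //
  //   static void BackgroundWork(void* arg) {
  //     // ... do work on the state pointed to by arg ...
  //   }
  //   ...
  //   env->Schedule(&BackgroundWork, &ctx, Env::HIGH);  // run in a pool thread
  //   env->StartThread(&BackgroundWork, &ctx);          // run in a new thread
  //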
// Wait for all threads started by StartThread to terminate.
virtual void WaitForJoin() {}
  // Get thread pool queue length for a specific thread pool.
virtual unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const {
return 0;
}
// *path is set to a temporary directory that can be used for testing. It may
  // or may not have just been created. The directory may or may not differ
// between runs of the same process, but subsequent calls will return the
// same directory.
virtual Status GetTestDirectory(std::string* path) = 0;
// Create and return a log file for storing informational messages.
virtual Status NewLogger(const std::string& fname,
shared_ptr<Logger>* result) = 0;
// Returns the number of micro-seconds since some fixed point in time. Only
// useful for computing deltas of time.
virtual uint64_t NowMicros() = 0;
// Returns the number of nano-seconds since some fixed point in time. Only
// useful for computing deltas of time in one run.
// Default implementation simply relies on NowMicros
virtual uint64_t NowNanos() {
return NowMicros() * 1000;
}
  // Sleep/delay the thread for the prescribed number of micro-seconds.
virtual void SleepForMicroseconds(int micros) = 0;
// Get the current host name.
virtual Status GetHostName(char* name, uint64_t len) = 0;
// Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC).
virtual Status GetCurrentTime(int64_t* unix_time) = 0;
// Get full directory name for this db.
virtual Status GetAbsolutePath(const std::string& db_path,
std::string* output_path) = 0;
// The number of background worker threads of a specific thread pool
// for this environment. 'LOW' is the default pool.
// default number: 1
virtual void SetBackgroundThreads(int number, Priority pri = LOW) = 0;
// Converts seconds-since-Jan-01-1970 to a printable string
virtual std::string TimeToString(uint64_t time) = 0;
// Generates a unique id that can be used to identify a db
virtual std::string GenerateUniqueId();
// OptimizeForLogWrite will create a new EnvOptions object that is a copy of
// the EnvOptions in the parameters, but is optimized for writing log files.
  // Default implementation returns a copy of the same object.
virtual EnvOptions OptimizeForLogWrite(const EnvOptions& env_options) const;
// OptimizeForManifestWrite will create a new EnvOptions object that is a copy
// of the EnvOptions in the parameters, but is optimized for writing manifest
  // files. Default implementation returns a copy of the same object.
virtual EnvOptions OptimizeForManifestWrite(const EnvOptions& env_options)
const;
private:
// No copying allowed
Env(const Env&);
void operator=(const Env&);
};
// A file abstraction for reading sequentially through a file
class SequentialFile {
public:
SequentialFile() { }
virtual ~SequentialFile();
// Read up to "n" bytes from the file. "scratch[0..n-1]" may be
// written by this routine. Sets "*result" to the data that was
// read (including if fewer than "n" bytes were successfully read).
// May set "*result" to point at data in "scratch[0..n-1]", so
// "scratch[0..n-1]" must be live when "*result" is used.
// If an error was encountered, returns a non-OK status.
//
// REQUIRES: External synchronization
virtual Status Read(size_t n, Slice* result, char* scratch) = 0;
// Skip "n" bytes from the file. This is guaranteed to be no
  // slower than reading the same data, but may be faster.
//
// If end of file is reached, skipping will stop at the end of the
// file, and Skip will return OK.
//
// REQUIRES: External synchronization
virtual Status Skip(uint64_t n) = 0;
// Remove any kind of caching of data from the offset to offset+length
// of this file. If the length is 0, then it refers to the end of file.
// If the system is not caching the file contents, then this is a noop.
virtual Status InvalidateCache(size_t offset, size_t length) {
return Status::NotSupported("InvalidateCache not supported.");
}
};
// A file abstraction for randomly reading the contents of a file.
class RandomAccessFile {
public:
RandomAccessFile() { }
virtual ~RandomAccessFile();
// Read up to "n" bytes from the file starting at "offset".
// "scratch[0..n-1]" may be written by this routine. Sets "*result"
// to the data that was read (including if fewer than "n" bytes were
// successfully read). May set "*result" to point at data in
// "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
// "*result" is used. If an error was encountered, returns a non-OK
// status.
//
// Safe for concurrent use by multiple threads.
virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const = 0;
  // Tries to get a unique ID for this file that will be the same each time
// the file is opened (and will stay the same while the file is open).
// Furthermore, it tries to make this ID at most "max_size" bytes. If such an
// ID can be created this function returns the length of the ID and places it
// in "id"; otherwise, this function returns 0, in which case "id"
// may not have been modified.
//
// This function guarantees, for IDs from a given environment, two unique ids
  // cannot be made equal to each other by adding arbitrary bytes to one of
// them. That is, no unique ID is the prefix of another.
//
// This function guarantees that the returned ID will not be interpretable as
// a single varint.
//
// Note: these IDs are only valid for the duration of the process.
virtual size_t GetUniqueId(char* id, size_t max_size) const {
return 0; // Default implementation to prevent issues with backwards
// compatibility.
};
enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED };
virtual void Hint(AccessPattern pattern) {}
// Remove any kind of caching of data from the offset to offset+length
// of this file. If the length is 0, then it refers to the end of file.
// If the system is not caching the file contents, then this is a noop.
virtual Status InvalidateCache(size_t offset, size_t length) {
return Status::NotSupported("InvalidateCache not supported.");
}
};
// A file abstraction for sequential writing. The implementation
// must provide buffering since callers may append small fragments
// at a time to the file.
class WritableFile {
public:
WritableFile()
: last_preallocated_block_(0),
preallocation_block_size_(0),
io_priority_(Env::IO_TOTAL) {
}
virtual ~WritableFile();
virtual Status Append(const Slice& data) = 0;
virtual Status Close() = 0;
virtual Status Flush() = 0;
virtual Status Sync() = 0; // sync data
/*
* Sync data and/or metadata as well.
* By default, sync only data.
* Override this method for environments where we need to sync
* metadata as well.
*/
virtual Status Fsync() {
return Sync();
}
/*
* Change the priority in rate limiter if rate limiting is enabled.
* If rate limiting is not enabled, this call has no effect.
*/
virtual void SetIOPriority(Env::IOPriority pri) {
io_priority_ = pri;
}
/*
* Get the size of valid data in the file.
*/
virtual uint64_t GetFileSize() {
return 0;
}
/*
* Get and set the default pre-allocation block size for writes to
* this file. If non-zero, then Allocate will be used to extend the
* underlying storage of a file (generally via fallocate) if the Env
* instance supports it.
*/
void SetPreallocationBlockSize(size_t size) {
preallocation_block_size_ = size;
}
virtual void GetPreallocationStatus(size_t* block_size,
size_t* last_allocated_block) {
*last_allocated_block = last_preallocated_block_;
*block_size = preallocation_block_size_;
}
// For documentation, refer to RandomAccessFile::GetUniqueId()
virtual size_t GetUniqueId(char* id, size_t max_size) const {
    return 0; // Default implementation to prevent issues with backwards
              // compatibility.
  }
// Remove any kind of caching of data from the offset to offset+length
// of this file. If the length is 0, then it refers to the end of file.
// If the system is not caching the file contents, then this is a noop.
// This call has no effect on dirty pages in the cache.
virtual Status InvalidateCache(size_t offset, size_t length) {
return Status::NotSupported("InvalidateCache not supported.");
}
protected:
// PrepareWrite performs any necessary preparation for a write
// before the write actually occurs. This allows for pre-allocation
// of space on devices where it can result in less file
// fragmentation and/or less waste from over-zealous filesystem
// pre-allocation.
void PrepareWrite(size_t offset, size_t len) {
if (preallocation_block_size_ == 0) {
return;
}
// If this write would cross one or more preallocation blocks,
    // determine what the last preallocation block necessary to
// cover this write would be and Allocate to that point.
const auto block_size = preallocation_block_size_;
size_t new_last_preallocated_block =
(offset + len + block_size - 1) / block_size;
if (new_last_preallocated_block > last_preallocated_block_) {
size_t num_spanned_blocks =
new_last_preallocated_block - last_preallocated_block_;
Allocate(block_size * last_preallocated_block_,
block_size * num_spanned_blocks);
last_preallocated_block_ = new_last_preallocated_block;
}
}
/*
* Pre-allocate space for a file.
*/
virtual Status Allocate(off_t offset, off_t len) {
return Status::OK();
}
// Sync a file range with disk.
// offset is the starting byte of the file range to be synchronized.
// nbytes specifies the length of the range to be synchronized.
// This asks the OS to initiate flushing the cached data to disk,
// without waiting for completion.
// Default implementation does nothing.
virtual Status RangeSync(off_t offset, off_t nbytes) {
return Status::OK();
}
private:
size_t last_preallocated_block_;
size_t preallocation_block_size_;
// No copying allowed
WritableFile(const WritableFile&);
void operator=(const WritableFile&);
protected:
Env::IOPriority io_priority_;
};
// A file abstraction for random reading and writing.
class RandomRWFile {
public:
RandomRWFile() {}
virtual ~RandomRWFile() {}
// Write data from Slice data to file starting from offset
// Returns IOError on failure, but does not guarantee
// atomicity of a write. Returns OK status on success.
//
// Safe for concurrent use.
virtual Status Write(uint64_t offset, const Slice& data) = 0;
// Read up to "n" bytes from the file starting at "offset".
// "scratch[0..n-1]" may be written by this routine. Sets "*result"
// to the data that was read (including if fewer than "n" bytes were
// successfully read). May set "*result" to point at data in
// "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
// "*result" is used. If an error was encountered, returns a non-OK
// status.
//
// Safe for concurrent use by multiple threads.
virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const = 0;
virtual Status Close() = 0; // closes the file
virtual Status Sync() = 0; // sync data
/*
* Sync data and/or metadata as well.
* By default, sync only data.
* Override this method for environments where we need to sync
* metadata as well.
*/
virtual Status Fsync() {
return Sync();
}
/*
* Pre-allocate space for a file.
*/
virtual Status Allocate(off_t offset, off_t len) {
return Status::OK();
}
private:
// No copying allowed
RandomRWFile(const RandomRWFile&);
void operator=(const RandomRWFile&);
};
// A Directory object represents a collection of files and implements
// filesystem operations that can be executed on directories.
class Directory {
public:
virtual ~Directory() {}
// Fsync directory
virtual Status Fsync() = 0;
};
enum InfoLogLevel : unsigned char {
DEBUG_LEVEL = 0,
INFO_LEVEL,
WARN_LEVEL,
ERROR_LEVEL,
FATAL_LEVEL,
NUM_INFO_LOG_LEVELS,
};
// An interface for writing log messages.
class Logger {
public:
enum { DO_NOT_SUPPORT_GET_LOG_FILE_SIZE = -1 };
explicit Logger(const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL)
: log_level_(log_level) {}
virtual ~Logger();
// Write an entry to the log file with the specified format.
virtual void Logv(const char* format, va_list ap) = 0;
// Write an entry to the log file with the specified log level
// and format. Any log with level under the internal log level
// of *this (see @SetInfoLogLevel and @GetInfoLogLevel) will not be
// printed.
void Logv(const InfoLogLevel log_level, const char* format, va_list ap) {
static const char* kInfoLogLevelNames[5] = {"DEBUG", "INFO", "WARN",
"ERROR", "FATAL"};
if (log_level < log_level_) {
return;
}
if (log_level == InfoLogLevel::INFO_LEVEL) {
      // Don't print the log level if it is INFO level.
      // This is to avoid an unexpected performance regression after the log
      // level feature was added. All logs written before the feature existed
      // are INFO level, and we don't want to add extra cost to that existing
      // logging.
Logv(format, ap);
} else {
char new_format[500];
snprintf(new_format, sizeof(new_format) - 1, "[%s] %s",
kInfoLogLevelNames[log_level], format);
Logv(new_format, ap);
}
}
virtual size_t GetLogFileSize() const {
return DO_NOT_SUPPORT_GET_LOG_FILE_SIZE;
}
// Flush to the OS buffers
virtual void Flush() {}
virtual InfoLogLevel GetInfoLogLevel() const { return log_level_; }
virtual void SetInfoLogLevel(const InfoLogLevel log_level) {
log_level_ = log_level;
}
private:
// No copying allowed
Logger(const Logger&);
void operator=(const Logger&);
InfoLogLevel log_level_;
};
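//
// Example (illustrative sketch of a minimal Logger that writes everything to
// stderr; 'StderrLogger' is a hypothetical name, and the implementation file
// would need <cstdio>):
//
//   class StderrLogger : public Logger {
//    public:
//     virtual void Logv(const char* format, va_list ap) {
//       vfprintf(stderr, format, ap);
//       fprintf(stderr, "\n");
//     }
//   };
//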
// Identifies a locked file.
class FileLock {
public:
FileLock() { }
virtual ~FileLock();
private:
// No copying allowed
FileLock(const FileLock&);
void operator=(const FileLock&);
};
extern void LogFlush(const shared_ptr<Logger>& info_log);
extern void Log(const InfoLogLevel log_level,
const shared_ptr<Logger>& info_log, const char* format, ...);
// a set of log functions with different log levels.
extern void Debug(const shared_ptr<Logger>& info_log, const char* format, ...);
extern void Info(const shared_ptr<Logger>& info_log, const char* format, ...);
extern void Warn(const shared_ptr<Logger>& info_log, const char* format, ...);
extern void Error(const shared_ptr<Logger>& info_log, const char* format, ...);
extern void Fatal(const shared_ptr<Logger>& info_log, const char* format, ...);
// Log the specified data to *info_log if info_log is non-nullptr.
// The default info log level is InfoLogLevel::ERROR.
extern void Log(const shared_ptr<Logger>& info_log, const char* format, ...)
# if defined(__GNUC__) || defined(__clang__)
__attribute__((__format__ (__printf__, 2, 3)))
# endif
;
extern void LogFlush(Logger *info_log);
extern void Log(const InfoLogLevel log_level, Logger* info_log,
const char* format, ...);
// The default info log level is InfoLogLevel::ERROR.
extern void Log(Logger* info_log, const char* format, ...)
# if defined(__GNUC__) || defined(__clang__)
__attribute__((__format__ (__printf__, 2, 3)))
# endif
;
// a set of log functions with different log levels.
extern void Debug(Logger* info_log, const char* format, ...);
extern void Info(Logger* info_log, const char* format, ...);
extern void Warn(Logger* info_log, const char* format, ...);
extern void Error(Logger* info_log, const char* format, ...);
extern void Fatal(Logger* info_log, const char* format, ...);
// A utility routine: write "data" to the named file.
extern Status WriteStringToFile(Env* env, const Slice& data,
const std::string& fname,
bool should_sync = false);
// A utility routine: read contents of named file into *data
extern Status ReadFileToString(Env* env, const std::string& fname,
std::string* data);
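//
// Example (illustrative sketch; the file path is only for illustration):
//
//   rocksdb::Env* env = rocksdb::Env::Default();
//   rocksdb::Status s =
//       rocksdb::WriteStringToFile(env, "hello", "/tmp/example_file", true);
//   std::string data;
//   if (s.ok()) {
//     s = rocksdb::ReadFileToString(env, "/tmp/example_file", &data);
//   }
//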
// An implementation of Env that forwards all calls to another Env.
// May be useful to clients who wish to override just part of the
// functionality of another Env.
class EnvWrapper : public Env {
public:
// Initialize an EnvWrapper that delegates all calls to *t
explicit EnvWrapper(Env* t) : target_(t) { }
virtual ~EnvWrapper();
// Return the target to which this Env forwards all calls
Env* target() const { return target_; }
// The following text is boilerplate that forwards all methods to target()
Status NewSequentialFile(const std::string& f,
unique_ptr<SequentialFile>* r,
const EnvOptions& options) {
return target_->NewSequentialFile(f, r, options);
}
Status NewRandomAccessFile(const std::string& f,
unique_ptr<RandomAccessFile>* r,
const EnvOptions& options) {
return target_->NewRandomAccessFile(f, r, options);
}
Status NewWritableFile(const std::string& f, unique_ptr<WritableFile>* r,
const EnvOptions& options) {
return target_->NewWritableFile(f, r, options);
}
Status NewRandomRWFile(const std::string& f, unique_ptr<RandomRWFile>* r,
const EnvOptions& options) {
return target_->NewRandomRWFile(f, r, options);
}
virtual Status NewDirectory(const std::string& name,
unique_ptr<Directory>* result) {
return target_->NewDirectory(name, result);
}
bool FileExists(const std::string& f) { return target_->FileExists(f); }
Status GetChildren(const std::string& dir, std::vector<std::string>* r) {
return target_->GetChildren(dir, r);
}
Status DeleteFile(const std::string& f) { return target_->DeleteFile(f); }
Status CreateDir(const std::string& d) { return target_->CreateDir(d); }
Status CreateDirIfMissing(const std::string& d) {
return target_->CreateDirIfMissing(d);
}
Status DeleteDir(const std::string& d) { return target_->DeleteDir(d); }
Status GetFileSize(const std::string& f, uint64_t* s) {
return target_->GetFileSize(f, s);
}
Status GetFileModificationTime(const std::string& fname,
uint64_t* file_mtime) {
return target_->GetFileModificationTime(fname, file_mtime);
}
Status RenameFile(const std::string& s, const std::string& t) {
return target_->RenameFile(s, t);
}
Status LockFile(const std::string& f, FileLock** l) {
return target_->LockFile(f, l);
}
Status UnlockFile(FileLock* l) { return target_->UnlockFile(l); }
void Schedule(void (*f)(void*), void* a, Priority pri) {
return target_->Schedule(f, a, pri);
}
void StartThread(void (*f)(void*), void* a) {
return target_->StartThread(f, a);
}
void WaitForJoin() { return target_->WaitForJoin(); }
virtual unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const {
return target_->GetThreadPoolQueueLen(pri);
}
virtual Status GetTestDirectory(std::string* path) {
return target_->GetTestDirectory(path);
}
virtual Status NewLogger(const std::string& fname,
shared_ptr<Logger>* result) {
return target_->NewLogger(fname, result);
}
uint64_t NowMicros() {
return target_->NowMicros();
}
void SleepForMicroseconds(int micros) {
target_->SleepForMicroseconds(micros);
}
Status GetHostName(char* name, uint64_t len) {
return target_->GetHostName(name, len);
}
Status GetCurrentTime(int64_t* unix_time) {
return target_->GetCurrentTime(unix_time);
}
Status GetAbsolutePath(const std::string& db_path,
std::string* output_path) {
return target_->GetAbsolutePath(db_path, output_path);
}
void SetBackgroundThreads(int num, Priority pri) {
return target_->SetBackgroundThreads(num, pri);
}
std::string TimeToString(uint64_t time) {
return target_->TimeToString(time);
}
private:
Env* target_;
};
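//
// Example (illustrative sketch of overriding a single Env method;
// 'CountingEnv' is a hypothetical name):
//
//   class CountingEnv : public EnvWrapper {
//    public:
//     explicit CountingEnv(Env* base) : EnvWrapper(base), num_writable_(0) {}
//     Status NewWritableFile(const std::string& f,
//                            unique_ptr<WritableFile>* r,
//                            const EnvOptions& options) {
//       ++num_writable_;  // count file creations, then forward to the target
//       return EnvWrapper::NewWritableFile(f, r, options);
//     }
//     int num_writable_;
//   };
//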
// Returns a new environment that stores its data in memory and delegates
// all non-file-storage tasks to base_env. The caller must delete the result
// when it is no longer needed.
// *base_env must remain live while the result is in use.
Env* NewMemEnv(Env* base_env);
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_INCLUDE_ENV_H_

74
include/rocksdb/filter_policy.h Normal file
View File

@@ -0,0 +1,74 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// A database can be configured with a custom FilterPolicy object.
// This object is responsible for creating a small filter from a set
// of keys. These filters are stored in rocksdb and are consulted
// automatically by rocksdb to decide whether or not to read some
// information from disk. In many cases, a filter can cut down the
// number of disk seeks from a handful to a single disk seek per
// DB::Get() call.
//
// Most people will want to use the builtin bloom filter support (see
// NewBloomFilterPolicy() below).
#ifndef STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_
#define STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_
#include <string>
namespace rocksdb {
class Slice;
class FilterPolicy {
public:
virtual ~FilterPolicy();
// Return the name of this policy. Note that if the filter encoding
// changes in an incompatible way, the name returned by this method
// must be changed. Otherwise, old incompatible filters may be
// passed to methods of this type.
virtual const char* Name() const = 0;
// keys[0,n-1] contains a list of keys (potentially with duplicates)
// that are ordered according to the user supplied comparator.
// Append a filter that summarizes keys[0,n-1] to *dst.
//
// Warning: do not change the initial contents of *dst. Instead,
// append the newly constructed filter to *dst.
virtual void CreateFilter(const Slice* keys, int n, std::string* dst)
const = 0;
// "filter" contains the data appended by a preceding call to
// CreateFilter() on this class. This method must return true if
// the key was in the list of keys passed to CreateFilter().
// This method may return true or false if the key was not on the
// list, but it should aim to return false with a high probability.
virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0;
};
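//
// Example (illustrative sketch of a degenerate policy that stores nothing and
// never rules a key out; 'AlwaysMatchPolicy' is a hypothetical name -- most
// users should simply use NewBloomFilterPolicy() declared below):
//
//   class AlwaysMatchPolicy : public FilterPolicy {
//    public:
//     virtual const char* Name() const { return "AlwaysMatchPolicy"; }
//     virtual void CreateFilter(const Slice* keys, int n,
//                               std::string* dst) const {
//       // Append nothing: the empty filter encodes no information.
//     }
//     virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const {
//       return true;  // never claims a key is definitely absent
//     }
//   };
//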
// Return a new filter policy that uses a bloom filter with approximately
// the specified number of bits per key. A good value for bits_per_key
// is 10, which yields a filter with ~ 1% false positive rate.
//
// Callers must delete the result after any database that is using the
// result has been closed.
//
// Note: if you are using a custom comparator that ignores some parts
// of the keys being compared, you must not use NewBloomFilterPolicy()
// and must provide your own FilterPolicy that also ignores the
// corresponding parts of the keys. For example, if the comparator
// ignores trailing spaces, it would be incorrect to use a
// FilterPolicy (like NewBloomFilterPolicy) that does not ignore
// trailing spaces in keys.
extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key);
}
#endif // STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_

58
include/rocksdb/flush_block_policy.h Normal file
View File

@@ -0,0 +1,58 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#include <string>
namespace rocksdb {
class Slice;
class BlockBuilder;
struct Options;
// FlushBlockPolicy provides a configurable way to determine when to flush a
// block in block-based tables.
class FlushBlockPolicy {
public:
  // Keep track of the key/value sequence and return true if the table builder
  // should flush the current data block.
virtual bool Update(const Slice& key,
const Slice& value) = 0;
virtual ~FlushBlockPolicy() { }
};
class FlushBlockPolicyFactory {
public:
// Return the name of the flush block policy.
virtual const char* Name() const = 0;
// Return a new block flush policy that flushes data blocks by data size.
// FlushBlockPolicy may need to access the metadata of the data block
// builder to determine when to flush the blocks.
//
// Callers must delete the result after any database that is using the
// result has been closed.
virtual FlushBlockPolicy* NewFlushBlockPolicy(
const Options& options, const BlockBuilder& data_block_builder) const = 0;
virtual ~FlushBlockPolicyFactory() { }
};
class FlushBlockBySizePolicyFactory : public FlushBlockPolicyFactory {
public:
FlushBlockBySizePolicyFactory() {}
virtual const char* Name() const override {
return "FlushBlockBySizePolicyFactory";
}
virtual FlushBlockPolicy* NewFlushBlockPolicy(
const Options& options,
const BlockBuilder& data_block_builder) const override;
};
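//
// Example (illustrative sketch of a policy that cuts a block after a fixed
// number of entries; 'FlushEveryN' is a hypothetical name; a corresponding
// FlushBlockPolicyFactory would return new FlushEveryN(...) from
// NewFlushBlockPolicy()):
//
//   class FlushEveryN : public FlushBlockPolicy {
//    public:
//     explicit FlushEveryN(size_t n) : n_(n), count_(0) {}
//     virtual bool Update(const Slice& key, const Slice& value) {
//       if (++count_ >= n_) {
//         count_ = 0;
//         return true;   // ask the table builder to flush the current block
//       }
//       return false;
//     }
//    private:
//     const size_t n_;
//     size_t count_;
//   };
//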
}  // namespace rocksdb

34
include/rocksdb/iostats_context.h Normal file
View File

@@ -0,0 +1,34 @@
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#ifndef INCLUDE_ROCKSDB_IOSTATS_CONTEXT_H_
#define INCLUDE_ROCKSDB_IOSTATS_CONTEXT_H_
#include <stdint.h>
#include <string>
// A thread local context for gathering io-stats efficiently and transparently.
namespace rocksdb {
struct IOStatsContext {
  // reset all io-stats counters to zero
void Reset();
std::string ToString() const;
// the thread pool id
uint64_t thread_pool_id;
  // number of bytes that have been written.
  uint64_t bytes_written;
  // number of bytes that have been read.
  uint64_t bytes_read;
};
extern __thread IOStatsContext iostats_context;
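//
// Example (illustrative sketch; assumes the per-thread counters are enabled
// in this build):
//
//   rocksdb::iostats_context.Reset();
//   // ... perform reads / writes on this thread ...
//   uint64_t written = rocksdb::iostats_context.bytes_written;
//   uint64_t read = rocksdb::iostats_context.bytes_read;
//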
} // namespace rocksdb
#endif // INCLUDE_ROCKSDB_IOSTATS_CONTEXT_H_

106
include/rocksdb/iterator.h Normal file
View File

@@ -0,0 +1,106 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// An iterator yields a sequence of key/value pairs from a source.
// The following class defines the interface. Multiple implementations
// are provided by this library. In particular, iterators are provided
// to access the contents of a Table or a DB.
//
// Multiple threads can invoke const methods on an Iterator without
// external synchronization, but if any of the threads may call a
// non-const method, all threads accessing the same Iterator must use
// external synchronization.
#ifndef STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_
#define STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
namespace rocksdb {
class Iterator {
public:
Iterator();
virtual ~Iterator();
// An iterator is either positioned at a key/value pair, or
// not valid. This method returns true iff the iterator is valid.
virtual bool Valid() const = 0;
// Position at the first key in the source. The iterator is Valid()
// after this call iff the source is not empty.
virtual void SeekToFirst() = 0;
// Position at the last key in the source. The iterator is
// Valid() after this call iff the source is not empty.
virtual void SeekToLast() = 0;
  // Position at the first key in the source that is at or past target.
// The iterator is Valid() after this call iff the source contains
// an entry that comes at or past target.
virtual void Seek(const Slice& target) = 0;
// Moves to the next entry in the source. After this call, Valid() is
// true iff the iterator was not positioned at the last entry in the source.
// REQUIRES: Valid()
virtual void Next() = 0;
// Moves to the previous entry in the source. After this call, Valid() is
// true iff the iterator was not positioned at the first entry in source.
// REQUIRES: Valid()
virtual void Prev() = 0;
// Return the key for the current entry. The underlying storage for
// the returned slice is valid only until the next modification of
// the iterator.
// REQUIRES: Valid()
virtual Slice key() const = 0;
// Return the value for the current entry. The underlying storage for
// the returned slice is valid only until the next modification of
// the iterator.
  // REQUIRES: Valid()
virtual Slice value() const = 0;
// If an error has occurred, return it. Else return an ok status.
// If non-blocking IO is requested and this operation cannot be
// satisfied without doing some IO, then this returns Status::Incomplete().
virtual Status status() const = 0;
// Clients are allowed to register function/arg1/arg2 triples that
// will be invoked when this iterator is destroyed.
//
// Note that unlike all of the preceding methods, this method is
// not abstract and therefore clients should not override it.
typedef void (*CleanupFunction)(void* arg1, void* arg2);
void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2);
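  //
  // Example (illustrative sketch; 'DeleteState' and 'state' are hypothetical
  // names):
  //
  //   static void DeleteState(void* arg1, void* /*arg2*/) {
  //     delete static_cast<std::string*>(arg1);
  //   }
  //   ...
  //   iter->RegisterCleanup(&DeleteState, state /* a std::string* */, nullptr);
  //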
private:
struct Cleanup {
CleanupFunction function;
void* arg1;
void* arg2;
Cleanup* next;
};
Cleanup cleanup_;
// No copying allowed
Iterator(const Iterator&);
void operator=(const Iterator&);
};
// Return an empty iterator (yields nothing).
extern Iterator* NewEmptyIterator();
// Return an empty iterator with the specified status.
extern Iterator* NewErrorIterator(const Status& status);
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_

37
include/rocksdb/ldb_tool.h Normal file
View File

@@ -0,0 +1,37 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#ifndef ROCKSDB_LITE
#pragma once
#include <string>
#include "rocksdb/options.h"
namespace rocksdb {
// An interface for converting a slice to a readable string
class SliceFormatter {
public:
virtual ~SliceFormatter() {}
virtual std::string Format(const Slice& s) const = 0;
};
// Options for customizing ldb tool (beyond the DB Options)
struct LDBOptions {
// Create LDBOptions with default values for all fields
LDBOptions();
// Key formatter that converts a slice to a readable string.
// Default: Slice::ToString()
std::shared_ptr<SliceFormatter> key_formatter;
};
class LDBTool {
public:
  void Run(int argc, char** argv, Options db_options = Options(),
const LDBOptions& ldb_options = LDBOptions());
};
} // namespace rocksdb
#endif // ROCKSDB_LITE

287
include/rocksdb/memtablerep.h Normal file
View File

@@ -0,0 +1,287 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// This file contains the interface that must be implemented by any collection
// to be used as the backing store for a MemTable. Such a collection must
// satisfy the following properties:
// (1) It does not store duplicate items.
// (2) It uses MemTableRep::KeyComparator to compare items for iteration and
// equality.
// (3) It can be accessed concurrently by multiple readers and can support
//     writes during reads. However, it needn't support multiple concurrent
//     writes.
// (4) Items are never deleted.
// The liberal use of assertions is encouraged to enforce (1).
//
// The factory will be passed an Arena object when a new MemTableRep is
// requested. The API for this object is in rocksdb/arena.h.
//
// Users can implement their own memtable representations. We include three
// types built in:
// - SkipListRep: This is the default; it is backed by a skip list.
// - HashSkipListRep: The memtable rep that is best used for keys that are
// structured like "prefix:suffix" where iteration within a prefix is
// common and iteration across different prefixes is rare. It is backed by
// a hash map where each bucket is a skip list.
// - VectorRep: This is backed by an unordered std::vector. On iteration, the
// vector is sorted. It is intelligent about sorting; once the MarkReadOnly()
// has been called, the vector will only be sorted once. It is optimized for
// random-write-heavy workloads.
//
// The last two implementations are designed for situations in which
// iteration over the entire collection is rare since doing so requires all the
// keys to be copied into a sorted data structure.
#pragma once
#include <memory>
#include <stdint.h>
namespace rocksdb {
class Arena;
class LookupKey;
class Slice;
class SliceTransform;
class Logger;
typedef void* KeyHandle;
class MemTableRep {
public:
// KeyComparator provides a means to compare keys, which are internal keys
// concatenated with values.
class KeyComparator {
public:
// Compare a and b. Return a negative value if a is less than b, 0 if they
// are equal, and a positive value if a is greater than b
virtual int operator()(const char* prefix_len_key1,
const char* prefix_len_key2) const = 0;
virtual int operator()(const char* prefix_len_key,
const Slice& key) const = 0;
virtual ~KeyComparator() { }
};
explicit MemTableRep(Arena* arena) : arena_(arena) {}
  // Allocate a buffer of len bytes for storing the key. The idea is that a
  // specific memtable representation knows its underlying data structure
  // better. By allowing it to allocate memory, it can possibly put correlated
  // data in consecutive memory areas to make processor prefetching more
  // efficient.
virtual KeyHandle Allocate(const size_t len, char** buf);
// Insert key into the collection. (The caller will pack key and value into a
// single buffer and pass that in as the parameter to Insert).
// REQUIRES: nothing that compares equal to key is currently in the
// collection.
virtual void Insert(KeyHandle handle) = 0;
// Returns true iff an entry that compares equal to key is in the collection.
virtual bool Contains(const char* key) const = 0;
// Notify this table rep that it will no longer be added to. By default, does
// nothing.
virtual void MarkReadOnly() { }
  // Look up the key in the mem table: starting from the first key in the mem
  // table whose user_key matches the one in k, call callback_func() with
  // callback_args directly forwarded as the first parameter and the mem table
  // key as the second parameter. If the callback returns false, terminate;
  // otherwise, continue with the next key.
  // It is safe for Get() to terminate after having visited all the potential
  // keys for k.user_key(); it is also safe not to.
  //
  // Default:
  // The default implementation dynamically constructs an iterator, seeks to
  // the key, and invokes the callback function.
virtual void Get(const LookupKey& k, void* callback_args,
bool (*callback_func)(void* arg, const char* entry));
// Report an approximation of how much memory has been used other than memory
// that was allocated through the arena.
virtual size_t ApproximateMemoryUsage() = 0;
virtual ~MemTableRep() { }
// Iteration over the contents of a skip collection
class Iterator {
public:
// Initialize an iterator over the specified collection.
// The returned iterator is not valid.
// explicit Iterator(const MemTableRep* collection);
virtual ~Iterator() {}
// Returns true iff the iterator is positioned at a valid node.
virtual bool Valid() const = 0;
// Returns the key at the current position.
// REQUIRES: Valid()
virtual const char* key() const = 0;
// Advances to the next position.
// REQUIRES: Valid()
virtual void Next() = 0;
// Advances to the previous position.
// REQUIRES: Valid()
virtual void Prev() = 0;
// Advance to the first entry with a key >= target
virtual void Seek(const Slice& internal_key, const char* memtable_key) = 0;
// Position at the first entry in collection.
// Final state of iterator is Valid() iff collection is not empty.
virtual void SeekToFirst() = 0;
// Position at the last entry in collection.
// Final state of iterator is Valid() iff collection is not empty.
virtual void SeekToLast() = 0;
};
// Return an iterator over the keys in this representation.
// arena: If not null, the arena needs to be used to allocate the Iterator.
// When destroying the iterator, the caller will not call "delete"
// but Iterator::~Iterator() directly. The destructor needs to destroy
// all the states but those allocated in arena.
virtual Iterator* GetIterator(Arena* arena = nullptr) = 0;
// Return an iterator that has a special Seek semantics. The result of
// a Seek might only include keys with the same prefix as the target key.
// arena: If not null, the arena needs to be used to allocate the Iterator.
// When destroying the iterator, the caller will not call "delete"
// but Iterator::~Iterator() directly. The destructor needs to destroy
// all the states but those allocated in arena.
virtual Iterator* GetDynamicPrefixIterator(Arena* arena = nullptr) {
return GetIterator(arena);
}
// Return true if the current MemTableRep supports merge operator.
// Default: true
virtual bool IsMergeOperatorSupported() const { return true; }
// Return true if the current MemTableRep supports snapshot
// Default: true
virtual bool IsSnapshotSupported() const { return true; }
protected:
// When *key is an internal key concatenated with the value, returns the
// user key.
virtual Slice UserKey(const char* key) const;
Arena* arena_;
};
// This is the base class for all factories that are used by RocksDB to create
// new MemTableRep objects
class MemTableRepFactory {
public:
virtual ~MemTableRepFactory() {}
virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
Arena*, const SliceTransform*,
Logger* logger) = 0;
virtual const char* Name() const = 0;
};
// This uses a skip list to store keys. It is the default.
class SkipListFactory : public MemTableRepFactory {
public:
virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
Arena*, const SliceTransform*,
Logger* logger) override;
virtual const char* Name() const override { return "SkipListFactory"; }
};
#ifndef ROCKSDB_LITE
// This creates MemTableReps that are backed by an std::vector. On iteration,
// the vector is sorted. This is useful for workloads where iteration is very
// rare and writes are generally not issued after reads begin.
//
// Parameters:
// count: Passed to the constructor of the underlying std::vector of each
//   VectorRep. On initialization, the underlying array will have at least
//   count bytes reserved for usage.
class VectorRepFactory : public MemTableRepFactory {
const size_t count_;
public:
explicit VectorRepFactory(size_t count = 0) : count_(count) { }
virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
Arena*, const SliceTransform*,
Logger* logger) override;
virtual const char* Name() const override {
return "VectorRepFactory";
}
};
// This class contains a fixed array of buckets, each
// pointing to a skiplist (null if the bucket is empty).
// bucket_count: number of fixed array buckets
// skiplist_height: the max height of the skiplist
// skiplist_branching_factor: probabilistic size ratio between adjacent
// link lists in the skiplist
extern MemTableRepFactory* NewHashSkipListRepFactory(
size_t bucket_count = 1000000, int32_t skiplist_height = 4,
int32_t skiplist_branching_factor = 4
);
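//
// Example (illustrative sketch; assumes the memtable_factory and
// prefix_extractor fields declared on Options in rocksdb/options.h, and
// NewFixedPrefixTransform() declared in rocksdb/slice_transform.h):
//
//   rocksdb::Options options;
//   options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(4));
//   options.memtable_factory.reset(rocksdb::NewHashSkipListRepFactory());
//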
// This factory creates memtables based on a hash table:
// it contains a fixed array of buckets, each pointing to either a linked list
// or a skip list if number of entries inside the bucket exceeds
// threshold_use_skiplist.
// @bucket_count: number of fixed array buckets
// @huge_page_tlb_size: if <=0, allocate the hash table bytes from malloc.
// Otherwise from huge page TLB. The user needs to reserve
// huge pages for it to be allocated, like:
// sysctl -w vm.nr_hugepages=20
// See linux doc Documentation/vm/hugetlbpage.txt
// @bucket_entries_logging_threshold: if number of entries in one bucket
// exceeds this number, log about it.
// @if_log_bucket_dist_when_flash: if true, log distribution of number of
// entries when flushing.
// @threshold_use_skiplist: a bucket switches to skip list if number of
// entries exceed this parameter.
extern MemTableRepFactory* NewHashLinkListRepFactory(
size_t bucket_count = 50000, size_t huge_page_tlb_size = 0,
int bucket_entries_logging_threshold = 4096,
bool if_log_bucket_dist_when_flash = true,
uint32_t threshold_use_skiplist = 256);
// This factory creates a cuckoo-hashing based mem-table representation.
// Cuckoo-hash is a closed-hash strategy, in which all key/value pairs
// are stored in the bucket array itself intead of in some data structures
// external to the bucket array. In addition, each key in cuckoo hash
// has a constant number of possible buckets in the bucket array. These
// two properties together makes cuckoo hash more memory efficient and
// a constant worst-case read time. Cuckoo hash is best suitable for
// point-lookup workload.
//
// When inserting a key / value, it first checks whether one of its possible
// buckets is empty. If so, the key / value will be inserted to that vacant
// bucket. Otherwise, one of the keys originally stored in one of these
// possible buckets will be "kicked out" and moved to one of its possible
// buckets (possibly kicking out another victim). In the current
// implementation, such "kick-out" paths are bounded. If it cannot find a
// "kick-out" path for a specific key, this key will be stored in a backup
// structure, and the current memtable will be forced to become immutable.
//
// Note that currently this mem-table representation does not support
// snapshot (i.e., it only queries latest state) and iterators. In addition,
// MultiGet operation might also lose its atomicity due to the lack of
// snapshot support.
//
// Parameters:
// write_buffer_size: the write buffer size in bytes.
// average_data_size: the average size of key + value in bytes. This value
// together with write_buffer_size will be used to compute the number
// of buckets.
// hash_function_count: the number of hash functions that will be used by
//     the cuckoo-hash. The number also equals the number of possible
//     buckets each key will have.
extern MemTableRepFactory* NewHashCuckooRepFactory(
size_t write_buffer_size, size_t average_data_size = 64,
unsigned int hash_function_count = 4);
#endif // ROCKSDB_LITE
} // namespace rocksdb


@@ -0,0 +1,182 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#ifndef STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_
#define STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_
#include <memory>
#include <string>
#include <deque>
#include "rocksdb/slice.h"
namespace rocksdb {
class Slice;
class Logger;
// The Merge Operator
//
// Essentially, a MergeOperator specifies the SEMANTICS of a merge, which only
// the client knows. It could be numeric addition, list append, string
// concatenation, edit data structure, ... , anything.
// The library, on the other hand, is concerned with the exercise of this
// interface, at the right time (during get, iteration, compaction...)
//
// To use merge, the client needs to provide an object implementing one of
// the following interfaces:
// a) AssociativeMergeOperator - for most simple semantics (always take
// two values, and merge them into one value, which is then put back
// into rocksdb); numeric addition and string concatenation are examples;
//
// b) MergeOperator - the generic class for all the more abstract / complex
// operations; one method (FullMerge) to merge a Put/Delete value with a
// merge operand; and another method (PartialMerge) that merges multiple
//  operands together. This is especially useful if your key values have
// complex structures but you would still like to support client-specific
// incremental updates.
//
// AssociativeMergeOperator is simpler to implement. MergeOperator is simply
// more powerful.
//
// Refer to rocksdb-merge wiki for more details and example implementations.
//
class MergeOperator {
public:
virtual ~MergeOperator() {}
// Gives the client a way to express the read -> modify -> write semantics
// key: (IN) The key that's associated with this merge operation.
// Client could multiplex the merge operator based on it
// if the key space is partitioned and different subspaces
// refer to different types of data which have different
// merge operation semantics
// existing: (IN) null indicates that the key does not exist before this op
// operand_list:(IN) the sequence of merge operations to apply, front() first.
// new_value:(OUT) Client is responsible for filling the merge result here
// logger: (IN) Client could use this to log errors during merge.
//
// Return true on success.
// All values passed in will be client-specific values. So if this method
// returns false, it is because client specified bad data or there was
// internal corruption. This will be treated as an error by the library.
//
// Also make use of the *logger for error messages.
virtual bool FullMerge(const Slice& key,
const Slice* existing_value,
const std::deque<std::string>& operand_list,
std::string* new_value,
Logger* logger) const = 0;
// This function performs merge(left_op, right_op)
// when both the operands are themselves merge operation types
// that you would have passed to a DB::Merge() call in the same order
// (i.e.: DB::Merge(key,left_op), followed by DB::Merge(key,right_op)).
//
// PartialMerge should combine them into a single merge operation that is
// saved into *new_value, and then it should return true.
// *new_value should be constructed such that a call to
// DB::Merge(key, *new_value) would yield the same result as a call
// to DB::Merge(key, left_op) followed by DB::Merge(key, right_op).
//
// The default implementation of PartialMergeMulti will use this function
// as a helper, for backward compatibility. Any successor class of
// MergeOperator should either implement PartialMerge or PartialMergeMulti,
// although implementing PartialMergeMulti is suggested as it is in general
// more effective to merge multiple operands at a time instead of two
// operands at a time.
//
// If it is impossible or infeasible to combine the two operations,
// leave new_value unchanged and return false. The library will
// internally keep track of the operations, and apply them in the
// correct order once a base-value (a Put/Delete/End-of-Database) is seen.
//
// TODO: Presently there is no way to differentiate between error/corruption
// and simply "return false". For now, the client should simply return
// false in any case it cannot perform partial-merge, regardless of reason.
// If there is corruption in the data, handle it in the FullMerge() function,
// and return false there. The default implementation of PartialMerge will
// always return false.
virtual bool PartialMerge(const Slice& key, const Slice& left_operand,
const Slice& right_operand, std::string* new_value,
Logger* logger) const {
return false;
}
// This function performs merge when all the operands are themselves merge
// operation types that you would have passed to a DB::Merge() call in the
// same order (front() first)
// (i.e. DB::Merge(key, operand_list[0]), followed by
// DB::Merge(key, operand_list[1]), ...)
//
// PartialMergeMulti should combine them into a single merge operation that is
// saved into *new_value, and then it should return true. *new_value should
// be constructed such that a call to DB::Merge(key, *new_value) would yield
// the same result as sequential individual calls to DB::Merge(key, operand)
// for each operand in operand_list from front() to back().
//
// The PartialMergeMulti function will be called only when the list of
// operands is long enough. The minimum number of operands that will be
// passed to the function is specified by the "min_partial_merge_operands"
// option.
//
// In the default implementation, PartialMergeMulti will invoke PartialMerge
// multiple times, where each time it only merges two operands. Developers
// should either implement PartialMergeMulti, or implement PartialMerge which
// serves as the helper function of the default PartialMergeMulti.
virtual bool PartialMergeMulti(const Slice& key,
const std::deque<Slice>& operand_list,
std::string* new_value, Logger* logger) const;
// The name of the MergeOperator. Used to check for MergeOperator
// mismatches (i.e., a DB created with one MergeOperator is
// accessed using a different MergeOperator)
// TODO: the name is currently not stored persistently and thus
// no checking is enforced. Client is responsible for providing
// consistent MergeOperator between DB opens.
virtual const char* Name() const = 0;
};
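// Illustrative sketch (not part of this header): a MergeOperator with
// list-append semantics, where every operand is appended to a comma-separated
// list stored in the value. The class name ExampleStringListOperator is
// hypothetical; see the rocksdb-merge wiki for production implementations.
class ExampleStringListOperator : public MergeOperator {
 public:
  virtual bool FullMerge(const Slice& key, const Slice* existing_value,
                         const std::deque<std::string>& operand_list,
                         std::string* new_value,
                         Logger* logger) const override {
    new_value->clear();
    if (existing_value != nullptr) {
      new_value->assign(existing_value->data(), existing_value->size());
    }
    for (const std::string& operand : operand_list) {
      if (!new_value->empty()) {
        new_value->append(",");
      }
      new_value->append(operand);
    }
    return true;
  }
  virtual bool PartialMerge(const Slice& key, const Slice& left_operand,
                            const Slice& right_operand, std::string* new_value,
                            Logger* logger) const override {
    // Two adjacent operands can always be collapsed into one under these
    // semantics, so partial merge never has to give up and return false.
    new_value->assign(left_operand.data(), left_operand.size());
    new_value->append(",");
    new_value->append(right_operand.data(), right_operand.size());
    return true;
  }
  virtual const char* Name() const override {
    return "ExampleStringListOperator";
  }
};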
// The simpler, associative merge operator.
class AssociativeMergeOperator : public MergeOperator {
public:
virtual ~AssociativeMergeOperator() {}
// Gives the client a way to express the read -> modify -> write semantics
// key: (IN) The key that's associated with this merge operation.
// existing_value:(IN) null indicates the key does not exist before this op
// value: (IN) the value to update/merge the existing_value with
// new_value: (OUT) Client is responsible for filling the merge result here
// logger: (IN) Client could use this to log errors during merge.
//
// Return true on success.
// All values passed in will be client-specific values. So if this method
// returns false, it is because client specified bad data or there was
// internal corruption. The client should assume that this will be treated
// as an error by the library.
virtual bool Merge(const Slice& key,
const Slice* existing_value,
const Slice& value,
std::string* new_value,
Logger* logger) const = 0;
private:
// Default implementations of the MergeOperator functions
virtual bool FullMerge(const Slice& key,
const Slice* existing_value,
const std::deque<std::string>& operand_list,
std::string* new_value,
Logger* logger) const override;
virtual bool PartialMerge(const Slice& key,
const Slice& left_operand,
const Slice& right_operand,
std::string* new_value,
Logger* logger) const override;
};
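// Illustrative sketch (not part of this header): the "numeric addition"
// associative operator mentioned above. Values are interpreted as 8-byte
// little-endian unsigned integers; the helper functions and the class name
// are hypothetical and kept dependency-free on purpose.
inline unsigned long long ExampleDecodeU64(const char* p) {
  unsigned long long v = 0;
  for (int i = 7; i >= 0; --i) {
    v = (v << 8) | static_cast<unsigned char>(p[i]);
  }
  return v;
}
inline void ExampleEncodeU64(unsigned long long v, std::string* out) {
  out->resize(8);
  for (int i = 0; i < 8; ++i) {
    (*out)[i] = static_cast<char>(v >> (8 * i));
  }
}
class ExampleUint64AddOperator : public AssociativeMergeOperator {
 public:
  virtual bool Merge(const Slice& key, const Slice* existing_value,
                     const Slice& value, std::string* new_value,
                     Logger* logger) const override {
    if (value.size() != 8 ||
        (existing_value != nullptr && existing_value->size() != 8)) {
      return false;  // malformed value; the library treats this as an error
    }
    unsigned long long base =
        existing_value != nullptr ? ExampleDecodeU64(existing_value->data()) : 0;
    ExampleEncodeU64(base + ExampleDecodeU64(value.data()), new_value);
    return true;
  }
  virtual const char* Name() const override {
    return "ExampleUint64AddOperator";
  }
};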
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_

1043
include/rocksdb/options.h Normal file

File diff suppressed because it is too large


@@ -0,0 +1,78 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#ifndef STORAGE_ROCKSDB_INCLUDE_PERF_CONTEXT_H
#define STORAGE_ROCKSDB_INCLUDE_PERF_CONTEXT_H
#include <stdint.h>
#include <string>
namespace rocksdb {
enum PerfLevel {
kDisable = 0, // disable perf stats
kEnableCount = 1, // enable only count stats
kEnableTime = 2 // enable time stats too
};
// set the perf stats level
void SetPerfLevel(PerfLevel level);
// get current perf stats level
PerfLevel GetPerfLevel();
// A thread local context for gathering performance counters efficiently
// and transparently.
struct PerfContext {
void Reset(); // reset all performance counters to zero
std::string ToString() const;
uint64_t user_key_comparison_count; // total number of user key comparisons
uint64_t block_cache_hit_count; // total number of block cache hits
uint64_t block_read_count; // total number of block reads (with IO)
uint64_t block_read_byte; // total number of bytes from block reads
uint64_t block_read_time; // total time spent on block reads
uint64_t block_checksum_time; // total time spent on block checksum
uint64_t block_decompress_time; // total time spent on block decompression
// total number of internal keys skipped over during iteration (overwritten or
// deleted, to be more specific, hidden by a put or delete of the same key)
uint64_t internal_key_skipped_count;
// total number of deletes skipped over during iteration
uint64_t internal_delete_skipped_count;
uint64_t get_snapshot_time; // total time spent on getting snapshot
uint64_t get_from_memtable_time; // total time spent on querying memtables
uint64_t get_from_memtable_count; // number of mem tables queried
// total time spent after Get() finds a key
uint64_t get_post_process_time;
uint64_t get_from_output_files_time; // total time reading from output files
// total time spent on seeking child iters
uint64_t seek_child_seek_time;
// number of seek issued in child iterators
uint64_t seek_child_seek_count;
uint64_t seek_min_heap_time; // total time spent on the merge heap
// total time spent on seeking the internal entries
uint64_t seek_internal_seek_time;
// total time spent on iterating internal entries to find the next user entry
uint64_t find_next_user_entry_time;
// total time spent on pre or post processing when writing a record
uint64_t write_pre_and_post_process_time;
uint64_t write_wal_time; // total time spent on writing to WAL
// total time spent on writing to mem tables
uint64_t write_memtable_time;
};
#if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE)
extern PerfContext perf_context;
#else
extern __thread PerfContext perf_context;
#endif
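// Illustrative sketch (not part of this header): bracketing a read path with
// the thread-local counters above. The actual Get()/Seek() calls are elided;
// the function name is hypothetical.
inline std::string ExampleCollectReadPerfStats() {
  SetPerfLevel(kEnableTime);   // enable both count and time stats
  perf_context.Reset();        // clear this thread's counters
  // ... issue one or more Get()/Seek() calls on this thread ...
  std::string report = perf_context.ToString();
  SetPerfLevel(kDisable);      // stop paying the bookkeeping cost
  return report;
}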
}
#endif


@@ -0,0 +1,60 @@
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include "rocksdb/env.h"
namespace rocksdb {
class RateLimiter {
public:
virtual ~RateLimiter() {}
  // Request tokens to write bytes. If this request cannot be satisfied,
  // the call is blocked. The caller is responsible for making sure
  // bytes < GetSingleBurstBytes()
virtual void Request(const int64_t bytes, const Env::IOPriority pri) = 0;
// Max bytes can be granted in a single burst
virtual int64_t GetSingleBurstBytes() const = 0;
  // Total bytes that go through rate limiter
virtual int64_t GetTotalBytesThrough(
const Env::IOPriority pri = Env::IO_TOTAL) const = 0;
  // Total # of requests that go through rate limiter
virtual int64_t GetTotalRequests(
const Env::IOPriority pri = Env::IO_TOTAL) const = 0;
};
// Create a RateLimiter object, which can be shared among RocksDB instances to
// control write rate of flush and compaction.
// @rate_bytes_per_sec: this is the only parameter you want to set most of the
// time. It controls the total write rate of compaction and flush in bytes per
// second. Currently, RocksDB does not enforce rate limit for anything other
// than flush and compaction, e.g. write to WAL.
// @refill_period_us: this controls how often tokens are refilled. For example,
// when rate_bytes_per_sec is set to 10MB/s and refill_period_us is set to
// 100ms, then 1MB is refilled every 100ms internally. Larger value can lead to
// burstier writes while smaller value introduces more CPU overhead.
// The default should work for most cases.
// @fairness: RateLimiter accepts high-pri requests and low-pri requests.
// A low-pri request is usually blocked in favor of a high-pri request.
// Currently, RocksDB assigns low-pri to requests from compaction and high-pri
// to requests from flush. Low-pri requests can get blocked if flush requests
// come in continuously. This fairness parameter grants low-pri requests
// permission with 1/fairness probability even when high-pri requests exist,
// to avoid starvation. You should be fine leaving it at the default of 10.
extern RateLimiter* NewGenericRateLimiter(
int64_t rate_bytes_per_sec,
int64_t refill_period_us = 100 * 1000,
int32_t fairness = 10);
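// Illustrative usage sketch (commented out, not part of this header). It
// assumes "rocksdb/options.h" is also included and that Options exposes a
// std::shared_ptr<RateLimiter> rate_limiter member, as in this release. One
// limiter can be shared so several DBs draw flush/compaction writes from the
// same budget:
//
//   std::shared_ptr<RateLimiter> limiter(
//       NewGenericRateLimiter(10 * 1024 * 1024 /* 10MB/s */));
//   Options options_a, options_b;
//   options_a.rate_limiter = limiter;
//   options_b.rate_limiter = limiter;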
} // namespace rocksdb

137
include/rocksdb/slice.h Normal file

@@ -0,0 +1,137 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Slice is a simple structure containing a pointer into some external
// storage and a size. The user of a Slice must ensure that the slice
// is not used after the corresponding external storage has been
// deallocated.
//
// Multiple threads can invoke const methods on a Slice without
// external synchronization, but if any of the threads may call a
// non-const method, all threads accessing the same Slice must use
// external synchronization.
#ifndef STORAGE_ROCKSDB_INCLUDE_SLICE_H_
#define STORAGE_ROCKSDB_INCLUDE_SLICE_H_
#include <assert.h>
#include <stddef.h>
#include <string.h>
#include <string>
namespace rocksdb {
class Slice {
public:
// Create an empty slice.
Slice() : data_(""), size_(0) { }
// Create a slice that refers to d[0,n-1].
Slice(const char* d, size_t n) : data_(d), size_(n) { }
// Create a slice that refers to the contents of "s"
/* implicit */
Slice(const std::string& s) : data_(s.data()), size_(s.size()) { }
// Create a slice that refers to s[0,strlen(s)-1]
/* implicit */
Slice(const char* s) : data_(s), size_(strlen(s)) { }
// Return a pointer to the beginning of the referenced data
const char* data() const { return data_; }
// Return the length (in bytes) of the referenced data
size_t size() const { return size_; }
// Return true iff the length of the referenced data is zero
bool empty() const { return size_ == 0; }
// Return the ith byte in the referenced data.
// REQUIRES: n < size()
char operator[](size_t n) const {
assert(n < size());
return data_[n];
}
// Change this slice to refer to an empty array
void clear() { data_ = ""; size_ = 0; }
// Drop the first "n" bytes from this slice.
void remove_prefix(size_t n) {
assert(n <= size());
data_ += n;
size_ -= n;
}
  // Return a string that contains a copy of the referenced data.
std::string ToString(bool hex = false) const {
if (hex) {
std::string result;
char buf[10];
for (size_t i = 0; i < size_; i++) {
snprintf(buf, 10, "%02X", (unsigned char)data_[i]);
result += buf;
}
return result;
} else {
return std::string(data_, size_);
}
}
// Three-way comparison. Returns value:
// < 0 iff "*this" < "b",
// == 0 iff "*this" == "b",
// > 0 iff "*this" > "b"
int compare(const Slice& b) const;
// Return true iff "x" is a prefix of "*this"
bool starts_with(const Slice& x) const {
return ((size_ >= x.size_) &&
(memcmp(data_, x.data_, x.size_) == 0));
}
// private: make these public for rocksdbjni access
const char* data_;
size_t size_;
// Intentionally copyable
};
// A set of Slices that are virtually concatenated together. 'parts' points
// to an array of Slices. The number of elements in the array is 'num_parts'.
struct SliceParts {
SliceParts(const Slice* _parts, int _num_parts) :
parts(_parts), num_parts(_num_parts) { }
SliceParts() : parts(nullptr), num_parts(0) {}
const Slice* parts;
int num_parts;
};
inline bool operator==(const Slice& x, const Slice& y) {
return ((x.size() == y.size()) &&
(memcmp(x.data(), y.data(), x.size()) == 0));
}
inline bool operator!=(const Slice& x, const Slice& y) {
return !(x == y);
}
inline int Slice::compare(const Slice& b) const {
const int min_len = (size_ < b.size_) ? size_ : b.size_;
int r = memcmp(data_, b.data_, min_len);
if (r == 0) {
if (size_ < b.size_) r = -1;
else if (size_ > b.size_) r = +1;
}
return r;
}
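// Illustrative sketch (not part of this header): typical Slice handling. A
// Slice never owns memory, so the backing std::string must outlive it. The
// function name and the "rocksdb." prefix are hypothetical example values.
inline std::string ExampleDescribeSlice(const std::string& backing) {
  Slice s(backing);                    // points into `backing`, copies nothing
  if (s.starts_with(Slice("rocksdb."))) {
    s.remove_prefix(8);                // drop the 8-byte "rocksdb." prefix
  }
  return s.ToString(true /* hex */);   // copy the remainder out as hex
}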
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_INCLUDE_SLICE_H_


@@ -0,0 +1,47 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Class for specifying user-defined functions which perform a
// transformation on a slice. It is not required that every slice
// belong to the domain and/or range of a function. Subclasses should
// define InDomain and InRange to determine which slices are in either
// of these sets respectively.
#ifndef STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_
#define STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_
#include <string>
namespace rocksdb {
class Slice;
class SliceTransform {
public:
virtual ~SliceTransform() {};
// Return the name of this transformation.
virtual const char* Name() const = 0;
// transform a src in domain to a dst in the range
virtual Slice Transform(const Slice& src) const = 0;
  // determine whether src is a valid input to which Transform() can be applied
virtual bool InDomain(const Slice& src) const = 0;
// determine whether dst=Transform(src) for some src
virtual bool InRange(const Slice& dst) const = 0;
};
extern const SliceTransform* NewFixedPrefixTransform(size_t prefix_len);
extern const SliceTransform* NewNoopTransform();
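// Illustrative usage sketch (commented out, not part of this header). It
// assumes "rocksdb/options.h" is also included; a SliceTransform is most
// commonly installed as the prefix extractor, which enables prefix bloom
// filters and the hash-based memtables and indexes:
//
//   Options options;
//   options.prefix_extractor.reset(NewFixedPrefixTransform(8));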
}
#endif // STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_


@@ -0,0 +1,281 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#ifndef STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_
#define STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <string>
#include <memory>
#include <vector>
namespace rocksdb {
/**
 * Keep adding tickers here.
* 1. Any ticker should be added before TICKER_ENUM_MAX.
* 2. Add a readable string in TickersNameMap below for the newly added ticker.
*/
enum Tickers : uint32_t {
// total block cache misses
// REQUIRES: BLOCK_CACHE_MISS == BLOCK_CACHE_INDEX_MISS +
// BLOCK_CACHE_FILTER_MISS +
// BLOCK_CACHE_DATA_MISS;
BLOCK_CACHE_MISS = 0,
// total block cache hit
// REQUIRES: BLOCK_CACHE_HIT == BLOCK_CACHE_INDEX_HIT +
// BLOCK_CACHE_FILTER_HIT +
// BLOCK_CACHE_DATA_HIT;
BLOCK_CACHE_HIT,
// # of blocks added to block cache.
BLOCK_CACHE_ADD,
// # of times cache miss when accessing index block from block cache.
BLOCK_CACHE_INDEX_MISS,
// # of times cache hit when accessing index block from block cache.
BLOCK_CACHE_INDEX_HIT,
// # of times cache miss when accessing filter block from block cache.
BLOCK_CACHE_FILTER_MISS,
// # of times cache hit when accessing filter block from block cache.
BLOCK_CACHE_FILTER_HIT,
// # of times cache miss when accessing data block from block cache.
BLOCK_CACHE_DATA_MISS,
// # of times cache hit when accessing data block from block cache.
BLOCK_CACHE_DATA_HIT,
// # of times bloom filter has avoided file reads.
BLOOM_FILTER_USEFUL,
// # of memtable hits.
MEMTABLE_HIT,
// # of memtable misses.
MEMTABLE_MISS,
/**
* COMPACTION_KEY_DROP_* count the reasons for key drop during compaction
* There are 3 reasons currently.
*/
COMPACTION_KEY_DROP_NEWER_ENTRY, // key was written with a newer value.
COMPACTION_KEY_DROP_OBSOLETE, // The key is obsolete.
COMPACTION_KEY_DROP_USER, // user compaction function has dropped the key.
  // Number of keys written to the database via the Put and Write calls
  NUMBER_KEYS_WRITTEN,
  // Number of keys read
  NUMBER_KEYS_READ,
  // Number of keys updated, if in-place update is enabled
  NUMBER_KEYS_UPDATED,
// Bytes written / read
BYTES_WRITTEN,
BYTES_READ,
NO_FILE_CLOSES,
NO_FILE_OPENS,
NO_FILE_ERRORS,
  // Time system had to wait to do L0-L1 compactions
STALL_L0_SLOWDOWN_MICROS,
// Time system had to wait to move memtable to L1.
STALL_MEMTABLE_COMPACTION_MICROS,
// write throttle because of too many files in L0
STALL_L0_NUM_FILES_MICROS,
RATE_LIMIT_DELAY_MILLIS,
NO_ITERATORS, // number of iterators currently open
// Number of MultiGet calls, keys read, and bytes read
NUMBER_MULTIGET_CALLS,
NUMBER_MULTIGET_KEYS_READ,
NUMBER_MULTIGET_BYTES_READ,
  // Number of delete records that were not required to be
  // written to storage because the key did not exist
NUMBER_FILTERED_DELETES,
NUMBER_MERGE_FAILURES,
SEQUENCE_NUMBER,
// number of times bloom was checked before creating iterator on a
// file, and the number of times the check was useful in avoiding
// iterator creation (and thus likely IOPs).
BLOOM_FILTER_PREFIX_CHECKED,
BLOOM_FILTER_PREFIX_USEFUL,
// Number of times we had to reseek inside an iteration to skip
// over large number of keys with same userkey.
NUMBER_OF_RESEEKS_IN_ITERATION,
  // Record the number of calls to GetUpdatesSince. Useful to keep track of
// transaction log iterator refreshes
GET_UPDATES_SINCE_CALLS,
BLOCK_CACHE_COMPRESSED_MISS, // miss in the compressed block cache
BLOCK_CACHE_COMPRESSED_HIT, // hit in the compressed block cache
WAL_FILE_SYNCED, // Number of times WAL sync is done
WAL_FILE_BYTES, // Number of bytes written to WAL
// Writes can be processed by requesting thread or by the thread at the
// head of the writers queue.
WRITE_DONE_BY_SELF,
WRITE_DONE_BY_OTHER,
  WRITE_TIMEDOUT, // Number of writes that timed out.
WRITE_WITH_WAL, // Number of Write calls that request WAL
COMPACT_READ_BYTES, // Bytes read during compaction
COMPACT_WRITE_BYTES, // Bytes written during compaction
FLUSH_WRITE_BYTES, // Bytes written during flush
  // Number of tables' properties loaded directly from file, without creating
// table reader object.
NUMBER_DIRECT_LOAD_TABLE_PROPERTIES,
NUMBER_SUPERVERSION_ACQUIRES,
NUMBER_SUPERVERSION_RELEASES,
NUMBER_SUPERVERSION_CLEANUPS,
NUMBER_BLOCK_NOT_COMPRESSED,
TICKER_ENUM_MAX
};
// The order of items listed in Tickers should be the same as
// the order listed in TickersNameMap
const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
{BLOCK_CACHE_MISS, "rocksdb.block.cache.miss"},
{BLOCK_CACHE_HIT, "rocksdb.block.cache.hit"},
{BLOCK_CACHE_ADD, "rocksdb.block.cache.add"},
{BLOCK_CACHE_INDEX_MISS, "rocksdb.block.cache.index.miss"},
{BLOCK_CACHE_INDEX_HIT, "rocksdb.block.cache.index.hit"},
{BLOCK_CACHE_FILTER_MISS, "rocksdb.block.cache.filter.miss"},
{BLOCK_CACHE_FILTER_HIT, "rocksdb.block.cache.filter.hit"},
{BLOCK_CACHE_DATA_MISS, "rocksdb.block.cache.data.miss"},
{BLOCK_CACHE_DATA_HIT, "rocksdb.block.cache.data.hit"},
{BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful"},
{MEMTABLE_HIT, "rocksdb.memtable.hit"},
{MEMTABLE_MISS, "rocksdb.memtable.miss"},
{COMPACTION_KEY_DROP_NEWER_ENTRY, "rocksdb.compaction.key.drop.new"},
{COMPACTION_KEY_DROP_OBSOLETE, "rocksdb.compaction.key.drop.obsolete"},
{COMPACTION_KEY_DROP_USER, "rocksdb.compaction.key.drop.user"},
{NUMBER_KEYS_WRITTEN, "rocksdb.number.keys.written"},
{NUMBER_KEYS_READ, "rocksdb.number.keys.read"},
{NUMBER_KEYS_UPDATED, "rocksdb.number.keys.updated"},
{BYTES_WRITTEN, "rocksdb.bytes.written"},
{BYTES_READ, "rocksdb.bytes.read"},
{NO_FILE_CLOSES, "rocksdb.no.file.closes"},
{NO_FILE_OPENS, "rocksdb.no.file.opens"},
{NO_FILE_ERRORS, "rocksdb.no.file.errors"},
{STALL_L0_SLOWDOWN_MICROS, "rocksdb.l0.slowdown.micros"},
{STALL_MEMTABLE_COMPACTION_MICROS, "rocksdb.memtable.compaction.micros"},
{STALL_L0_NUM_FILES_MICROS, "rocksdb.l0.num.files.stall.micros"},
{RATE_LIMIT_DELAY_MILLIS, "rocksdb.rate.limit.delay.millis"},
{NO_ITERATORS, "rocksdb.num.iterators"},
{NUMBER_MULTIGET_CALLS, "rocksdb.number.multiget.get"},
{NUMBER_MULTIGET_KEYS_READ, "rocksdb.number.multiget.keys.read"},
{NUMBER_MULTIGET_BYTES_READ, "rocksdb.number.multiget.bytes.read"},
{NUMBER_FILTERED_DELETES, "rocksdb.number.deletes.filtered"},
{NUMBER_MERGE_FAILURES, "rocksdb.number.merge.failures"},
{SEQUENCE_NUMBER, "rocksdb.sequence.number"},
{BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked"},
{BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful"},
{NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration"},
{GET_UPDATES_SINCE_CALLS, "rocksdb.getupdatessince.calls"},
{BLOCK_CACHE_COMPRESSED_MISS, "rocksdb.block.cachecompressed.miss"},
{BLOCK_CACHE_COMPRESSED_HIT, "rocksdb.block.cachecompressed.hit"},
{WAL_FILE_SYNCED, "rocksdb.wal.synced"},
{WAL_FILE_BYTES, "rocksdb.wal.bytes"},
{WRITE_DONE_BY_SELF, "rocksdb.write.self"},
{WRITE_DONE_BY_OTHER, "rocksdb.write.other"},
{WRITE_TIMEDOUT, "rocksdb.write.timedout"},
{WRITE_WITH_WAL, "rocksdb.write.wal"},
{FLUSH_WRITE_BYTES, "rocksdb.flush.write.bytes"},
{COMPACT_READ_BYTES, "rocksdb.compact.read.bytes"},
{COMPACT_WRITE_BYTES, "rocksdb.compact.write.bytes"},
{NUMBER_DIRECT_LOAD_TABLE_PROPERTIES,
"rocksdb.number.direct.load.table.properties"},
{NUMBER_SUPERVERSION_ACQUIRES, "rocksdb.number.superversion_acquires"},
{NUMBER_SUPERVERSION_RELEASES, "rocksdb.number.superversion_releases"},
{NUMBER_SUPERVERSION_CLEANUPS, "rocksdb.number.superversion_cleanups"},
{NUMBER_BLOCK_NOT_COMPRESSED, "rocksdb.number.block.not_compressed"},
};
/**
 * Keep adding histograms here.
 * Any histogram should have a value less than HISTOGRAM_ENUM_MAX.
* Add a new Histogram by assigning it the current value of HISTOGRAM_ENUM_MAX
* Add a string representation in HistogramsNameMap below
* And increment HISTOGRAM_ENUM_MAX
*/
enum Histograms : uint32_t {
DB_GET = 0,
DB_WRITE,
COMPACTION_TIME,
TABLE_SYNC_MICROS,
COMPACTION_OUTFILE_SYNC_MICROS,
WAL_FILE_SYNC_MICROS,
MANIFEST_FILE_SYNC_MICROS,
// TIME SPENT IN IO DURING TABLE OPEN
TABLE_OPEN_IO_MICROS,
DB_MULTIGET,
READ_BLOCK_COMPACTION_MICROS,
READ_BLOCK_GET_MICROS,
WRITE_RAW_BLOCK_MICROS,
STALL_L0_SLOWDOWN_COUNT,
STALL_MEMTABLE_COMPACTION_COUNT,
STALL_L0_NUM_FILES_COUNT,
HARD_RATE_LIMIT_DELAY_COUNT,
SOFT_RATE_LIMIT_DELAY_COUNT,
NUM_FILES_IN_SINGLE_COMPACTION,
HISTOGRAM_ENUM_MAX,
};
const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
{ DB_GET, "rocksdb.db.get.micros" },
{ DB_WRITE, "rocksdb.db.write.micros" },
{ COMPACTION_TIME, "rocksdb.compaction.times.micros" },
{ TABLE_SYNC_MICROS, "rocksdb.table.sync.micros" },
{ COMPACTION_OUTFILE_SYNC_MICROS, "rocksdb.compaction.outfile.sync.micros" },
{ WAL_FILE_SYNC_MICROS, "rocksdb.wal.file.sync.micros" },
{ MANIFEST_FILE_SYNC_MICROS, "rocksdb.manifest.file.sync.micros" },
{ TABLE_OPEN_IO_MICROS, "rocksdb.table.open.io.micros" },
{ DB_MULTIGET, "rocksdb.db.multiget.micros" },
{ READ_BLOCK_COMPACTION_MICROS, "rocksdb.read.block.compaction.micros" },
{ READ_BLOCK_GET_MICROS, "rocksdb.read.block.get.micros" },
{ WRITE_RAW_BLOCK_MICROS, "rocksdb.write.raw.block.micros" },
{ STALL_L0_SLOWDOWN_COUNT, "rocksdb.l0.slowdown.count"},
{ STALL_MEMTABLE_COMPACTION_COUNT, "rocksdb.memtable.compaction.count"},
{ STALL_L0_NUM_FILES_COUNT, "rocksdb.num.files.stall.count"},
{ HARD_RATE_LIMIT_DELAY_COUNT, "rocksdb.hard.rate.limit.delay.count"},
{ SOFT_RATE_LIMIT_DELAY_COUNT, "rocksdb.soft.rate.limit.delay.count"},
{ NUM_FILES_IN_SINGLE_COMPACTION, "rocksdb.numfiles.in.singlecompaction" },
};
struct HistogramData {
double median;
double percentile95;
double percentile99;
double average;
double standard_deviation;
};
// Analyze the performance of a db
class Statistics {
public:
virtual ~Statistics() {}
virtual uint64_t getTickerCount(uint32_t tickerType) const = 0;
virtual void histogramData(uint32_t type,
HistogramData* const data) const = 0;
virtual void recordTick(uint32_t tickerType, uint64_t count = 0) = 0;
virtual void setTickerCount(uint32_t tickerType, uint64_t count) = 0;
virtual void measureTime(uint32_t histogramType, uint64_t time) = 0;
// String representation of the statistic object.
virtual std::string ToString() const = 0;
// Override this function to disable particular histogram collection
virtual bool HistEnabledForType(uint32_t type) const {
return type < HISTOGRAM_ENUM_MAX;
}
};
// Create a concrete DBStatistics object
std::shared_ptr<Statistics> CreateDBStatistics();
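// Illustrative sketch (not part of this header): the statistics object is
// normally installed as Options::statistics (e.g.
// "options.statistics = CreateDBStatistics();") so the DB populates it; the
// hypothetical helper below then reads one ticker and one histogram back out.
inline std::string ExampleSummarizeStats(
    const std::shared_ptr<Statistics>& stats) {
  if (!stats) {
    return "no statistics configured";
  }
  uint64_t cache_misses = stats->getTickerCount(BLOCK_CACHE_MISS);
  HistogramData get_latency;
  stats->histogramData(DB_GET, &get_latency);
  return "block cache misses: " + std::to_string(cache_misses) +
         ", median Get() micros: " + std::to_string(get_latency.median);
}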
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_

154
include/rocksdb/status.h Normal file

@@ -0,0 +1,154 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// A Status encapsulates the result of an operation. It may indicate success,
// or it may indicate an error with an associated error message.
//
// Multiple threads can invoke const methods on a Status without
// external synchronization, but if any of the threads may call a
// non-const method, all threads accessing the same Status must use
// external synchronization.
#ifndef STORAGE_ROCKSDB_INCLUDE_STATUS_H_
#define STORAGE_ROCKSDB_INCLUDE_STATUS_H_
#include <string>
#include "rocksdb/slice.h"
namespace rocksdb {
class Status {
public:
// Create a success status.
Status() : code_(kOk), state_(nullptr) { }
~Status() { delete[] state_; }
// Copy the specified status.
Status(const Status& s);
void operator=(const Status& s);
// Return a success status.
static Status OK() { return Status(); }
// Return error status of an appropriate type.
static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) {
return Status(kNotFound, msg, msg2);
}
// Fast path for not found without malloc;
static Status NotFound() {
return Status(kNotFound);
}
static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) {
return Status(kCorruption, msg, msg2);
}
static Status NotSupported(const Slice& msg, const Slice& msg2 = Slice()) {
return Status(kNotSupported, msg, msg2);
}
static Status InvalidArgument(const Slice& msg, const Slice& msg2 = Slice()) {
return Status(kInvalidArgument, msg, msg2);
}
static Status IOError(const Slice& msg, const Slice& msg2 = Slice()) {
return Status(kIOError, msg, msg2);
}
static Status MergeInProgress(const Slice& msg, const Slice& msg2 = Slice()) {
return Status(kMergeInProgress, msg, msg2);
}
static Status Incomplete(const Slice& msg, const Slice& msg2 = Slice()) {
return Status(kIncomplete, msg, msg2);
}
static Status ShutdownInProgress(const Slice& msg,
const Slice& msg2 = Slice()) {
return Status(kShutdownInProgress, msg, msg2);
}
static Status TimedOut() {
return Status(kTimedOut);
}
static Status TimedOut(const Slice& msg, const Slice& msg2 = Slice()) {
return Status(kTimedOut, msg, msg2);
}
// Returns true iff the status indicates success.
bool ok() const { return code() == kOk; }
// Returns true iff the status indicates a NotFound error.
bool IsNotFound() const { return code() == kNotFound; }
// Returns true iff the status indicates a Corruption error.
bool IsCorruption() const { return code() == kCorruption; }
// Returns true iff the status indicates a NotSupported error.
bool IsNotSupported() const { return code() == kNotSupported; }
// Returns true iff the status indicates an InvalidArgument error.
bool IsInvalidArgument() const { return code() == kInvalidArgument; }
// Returns true iff the status indicates an IOError.
bool IsIOError() const { return code() == kIOError; }
// Returns true iff the status indicates an MergeInProgress.
bool IsMergeInProgress() const { return code() == kMergeInProgress; }
// Returns true iff the status indicates Incomplete
bool IsIncomplete() const { return code() == kIncomplete; }
  // Returns true iff the status indicates ShutdownInProgress
bool IsShutdownInProgress() const { return code() == kShutdownInProgress; }
bool IsTimedOut() const { return code() == kTimedOut; }
// Return a string representation of this status suitable for printing.
// Returns the string "OK" for success.
std::string ToString() const;
enum Code {
kOk = 0,
kNotFound = 1,
kCorruption = 2,
kNotSupported = 3,
kInvalidArgument = 4,
kIOError = 5,
kMergeInProgress = 6,
kIncomplete = 7,
kShutdownInProgress = 8,
kTimedOut = 9
};
Code code() const {
return code_;
}
private:
// A nullptr state_ (which is always the case for OK) means the message
  // is empty. Otherwise, state_ is a new[] array of the following form:
// state_[0..3] == length of message
// state_[4..] == message
Code code_;
const char* state_;
explicit Status(Code code) : code_(code), state_(nullptr) { }
Status(Code code, const Slice& msg, const Slice& msg2);
static const char* CopyState(const char* s);
};
inline Status::Status(const Status& s) {
code_ = s.code_;
state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_);
}
inline void Status::operator=(const Status& s) {
// The following condition catches both aliasing (when this == &s),
// and the common case where both s and *this are ok.
code_ = s.code_;
if (state_ != s.state_) {
delete[] state_;
state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_);
}
}
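// Illustrative sketch (not part of this header): the usual calling pattern,
// branching on the error class before falling back to the printable form.
// The function name is hypothetical.
inline std::string ExampleDescribeStatus(const Status& s) {
  if (s.ok()) {
    return "OK";
  } else if (s.IsNotFound()) {
    return "key does not exist";
  } else if (s.IsCorruption() || s.IsIOError()) {
    return "persistent failure: " + s.ToString();
  }
  return s.ToString();
}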
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_INCLUDE_STATUS_H_

270
include/rocksdb/table.h Normal file

@@ -0,0 +1,270 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Currently we support two types of tables: plain table and block-based table.
// 1. Block-based table: this is the default table type that we inherited from
//    LevelDB, which was designed for storing data on hard disk or flash
//    devices.
// 2. Plain table: one of RocksDB's SST file formats, optimized
// for low query latency on pure-memory or really low-latency media.
//
// A tutorial of rocksdb table formats is available here:
// https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats
//
// Example code is also available
// https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats#wiki-examples
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "rocksdb/env.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "rocksdb/status.h"
namespace rocksdb {
// -- Block-based Table
class FlushBlockPolicyFactory;
class RandomAccessFile;
class TableBuilder;
class TableReader;
class WritableFile;
struct EnvOptions;
struct Options;
using std::unique_ptr;
enum ChecksumType : char {
kNoChecksum = 0x0, // not yet supported. Will fail
kCRC32c = 0x1,
kxxHash = 0x2,
};
// For advanced user only
struct BlockBasedTableOptions {
  // @flush_block_policy_factory creates the instances of flush block policy,
  // which provides a configurable way to determine when to flush a block in
  // the block based tables. If not set, table builder will use the default
  // block flush policy, which cuts blocks by block size (please refer to
  // `FlushBlockBySizePolicy`).
std::shared_ptr<FlushBlockPolicyFactory> flush_block_policy_factory;
// TODO(kailiu) Temporarily disable this feature by making the default value
// to be false.
//
// Indicating if we'd put index/filter blocks to the block cache.
// If not specified, each "table reader" object will pre-load index/filter
// block during table initialization.
bool cache_index_and_filter_blocks = false;
// The index type that will be used for this table.
enum IndexType : char {
// A space efficient index block that is optimized for
// binary-search-based index.
kBinarySearch,
// The hash index, if enabled, will do the hash lookup when
// `Options.prefix_extractor` is provided.
kHashSearch,
};
IndexType index_type = kBinarySearch;
// Influence the behavior when kHashSearch is used.
// if false, stores a precise prefix to block range mapping
// if true, does not store prefix and allows prefix hash collision
// (less memory consumption)
bool hash_index_allow_collision = true;
// Use the specified checksum type. Newly created table files will be
// protected with this checksum type. Old table files will still be readable,
  // even though they have a different checksum type.
ChecksumType checksum = kCRC32c;
};
// Table properties that are specific to block-based tables.
struct BlockBasedTablePropertyNames {
  // The value of this property is a fixed int32 number.
static const std::string kIndexType;
};
// Create default block based table factory.
extern TableFactory* NewBlockBasedTableFactory(
const BlockBasedTableOptions& table_options = BlockBasedTableOptions());
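// Illustrative sketch (not part of this header): tuning the block-based
// format. kHashSearch additionally requires Options::prefix_extractor to be
// set on the DB that uses this factory; the function name is hypothetical.
inline TableFactory* ExampleNewTunedBlockBasedFactory() {
  BlockBasedTableOptions table_options;
  table_options.cache_index_and_filter_blocks = true;  // charge them to block cache
  table_options.index_type = BlockBasedTableOptions::kHashSearch;
  table_options.checksum = kxxHash;                    // instead of the default kCRC32c
  return NewBlockBasedTableFactory(table_options);
}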
#ifndef ROCKSDB_LITE
enum EncodingType : char {
// Always write full keys without any special encoding.
kPlain,
// Find opportunity to write the same prefix once for multiple rows.
// In some cases, when a key follows a previous key with the same prefix,
// instead of writing out the full key, it just writes out the size of the
// shared prefix, as well as other bytes, to save some bytes.
//
// When using this option, the user is required to use the same prefix
// extractor to make sure the same prefix will be extracted from the same key.
// The Name() value of the prefix extractor will be stored in the file. When
// reopening the file, the name of the options.prefix_extractor given will be
// bitwise compared to the prefix extractors stored in the file. An error
// will be returned if the two don't match.
kPrefix,
};
// Table properties that are specific to plain tables.
struct PlainTablePropertyNames {
static const std::string kPrefixExtractorName;
static const std::string kEncodingType;
static const std::string kBloomVersion;
static const std::string kNumBloomBlocks;
};
const uint32_t kPlainTableVariableLength = 0;
struct PlainTableOptions {
  // @user_key_len: plain table has optimization for fixed-size keys, which can be
// specified via user_key_len. Alternatively, you can pass
// `kPlainTableVariableLength` if your keys have variable
// lengths.
uint32_t user_key_len = kPlainTableVariableLength;
  // @bloom_bits_per_key: the number of bits used for bloom filter per prefix. You
// may disable it by passing a zero.
int bloom_bits_per_key = 10;
// @hash_table_ratio: the desired utilization of the hash table used for prefix
// hashing. hash_table_ratio = number of prefixes / #buckets
// in the hash table
double hash_table_ratio = 0.75;
  // @index_sparseness: inside each prefix, one index record is built for every
  //                    index_sparseness keys, for binary search inside each
  //                    hash bucket.
// For encoding type kPrefix, the value will be used when
// writing to determine an interval to rewrite the full key.
// It will also be used as a suggestion and satisfied when
// possible.
size_t index_sparseness = 16;
// @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc.
// Otherwise from huge page TLB. The user needs to reserve
// huge pages for it to be allocated, like:
// sysctl -w vm.nr_hugepages=20
// See linux doc Documentation/vm/hugetlbpage.txt
size_t huge_page_tlb_size = 0;
// @encoding_type: how to encode the keys. See enum EncodingType above for
// the choices. The value will determine how to encode keys
// when writing to a new SST file. This value will be stored
// inside the SST file which will be used when reading from the
// file, which makes it possible for users to choose different
// encoding type when reopening a DB. Files with different
// encoding types can co-exist in the same DB and can be read.
EncodingType encoding_type = kPlain;
  // @full_scan_mode: mode for reading the whole file record by record without
  //                  using the index.
bool full_scan_mode = false;
// @store_index_in_file: compute plain table index and bloom filter during
// file building and store it in file. When reading
// file, index will be mmaped instead of recomputation.
bool store_index_in_file = false;
};
// -- Plain Table with prefix-only seek
// For this factory, you need to set Options.prefix_extractor properly to make it
// work. Lookup starts with a prefix hash lookup for the key prefix. Inside the
// hash bucket found, a binary search is executed for hash conflicts. Finally,
// a linear search is used.
extern TableFactory* NewPlainTableFactory(const PlainTableOptions& options =
PlainTableOptions());
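// Illustrative sketch (not part of this header): a plain-table configuration
// for fixed-size keys. It must be paired with Options::prefix_extractor on
// the DB side; the function name and the 16-byte key length are hypothetical
// example values.
inline TableFactory* ExampleNewFixedKeyPlainTableFactory() {
  PlainTableOptions plain_options;
  plain_options.user_key_len = 16;         // every user key is exactly 16 bytes
  plain_options.bloom_bits_per_key = 10;   // keep the per-prefix bloom filter
  plain_options.encoding_type = kPrefix;   // delta-encode shared key prefixes
  plain_options.store_index_in_file = true;
  return NewPlainTableFactory(plain_options);
}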
struct CuckooTablePropertyNames {
static const std::string kEmptyKey;
static const std::string kValueLength;
static const std::string kNumHashTable;
static const std::string kMaxNumBuckets;
static const std::string kIsLastLevel;
};
#endif // ROCKSDB_LITE
// A base class for table factories.
class TableFactory {
public:
virtual ~TableFactory() {}
// The type of the table.
//
// The client of this package should switch to a new name whenever
// the table format implementation changes.
//
// Names starting with "rocksdb." are reserved and should not be used
// by any clients of this package.
virtual const char* Name() const = 0;
  // Returns a table reader object that can fetch data from the file specified
  // in parameter file. It's the caller's responsibility to make sure
  // the file is in the correct format.
  //
  // NewTableReader() is called in two places:
  // (1) TableCache::FindTable() calls the function on a table cache miss
  //     and caches the table object returned.
  // (2) SstFileReader (for SST Dump) opens the table and dumps the table
  //     contents using the iterator of the table.
  // options and soptions are options. options is the general options.
  // Multiple configured objects can be accessed from there, including but not
  // limited to block cache and key comparators.
// file is a file handler to handle the file for the table
// file_size is the physical file size of the file
// table_reader is the output table reader
virtual Status NewTableReader(
const Options& options, const EnvOptions& soptions,
const InternalKeyComparator& internal_comparator,
unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
unique_ptr<TableReader>* table_reader) const = 0;
// Return a table builder to write to a file for this table type.
//
// It is called in several places:
// (1) When flushing memtable to a level-0 output file, it creates a table
// builder (In DBImpl::WriteLevel0Table(), by calling BuildTable())
// (2) During compaction, it gets the builder for writing compaction output
// files in DBImpl::OpenCompactionOutputFile().
// (3) When recovering from transaction logs, it creates a table builder to
// write to a level-0 output file (In DBImpl::WriteLevel0TableForRecovery,
// by calling BuildTable())
// (4) When running Repairer, it creates a table builder to convert logs to
// SST files (In Repairer::ConvertLogToTable() by calling BuildTable())
//
  // options is the general options. Multiple configured objects can be
  // accessed from there, including but not limited to compression options.
// file is a handle of a writable file. It is the caller's responsibility to
// keep the file open and close the file after closing the table builder.
// compression_type is the compression type to use in this table.
virtual TableBuilder* NewTableBuilder(
const Options& options, const InternalKeyComparator& internal_comparator,
WritableFile* file, CompressionType compression_type) const = 0;
};
#ifndef ROCKSDB_LITE
// Create a special table factory that can open both the block based table
// format and the plain table format, based on the setting inside the SST
// files. It should be used to convert a DB from one table format to another.
// @table_factory_to_write: the table factory used when writing to new files.
// @block_based_table_factory: block based table factory to use. If NULL, use
// a default one.
// @plain_table_factory: plain table factory to use. If NULL, use a default one.
extern TableFactory* NewAdaptiveTableFactory(
std::shared_ptr<TableFactory> table_factory_to_write = nullptr,
std::shared_ptr<TableFactory> block_based_table_factory = nullptr,
std::shared_ptr<TableFactory> plain_table_factory = nullptr);
#endif // ROCKSDB_LITE
} // namespace rocksdb


@@ -0,0 +1,127 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <string>
#include <map>
#include "rocksdb/status.h"
namespace rocksdb {
// -- Table Properties
// Other than basic table properties, each table may also have the user
// collected properties.
// The values of the user-collected properties are encoded as raw bytes --
// users have to interpret these values by themselves.
// Note: To do prefix seek/scan in `UserCollectedProperties`, you can do
// something similar to:
//
// UserCollectedProperties props = ...;
// for (auto pos = props.lower_bound(prefix);
// pos != props.end() && pos->first.compare(0, prefix.size(), prefix) == 0;
// ++pos) {
// ...
// }
typedef std::map<const std::string, std::string> UserCollectedProperties;
// TableProperties contains a bunch of read-only properties of its associated
// table.
struct TableProperties {
public:
// the total size of all data blocks.
uint64_t data_size = 0;
// the size of index block.
uint64_t index_size = 0;
// the size of filter block.
uint64_t filter_size = 0;
// total raw key size
uint64_t raw_key_size = 0;
// total raw value size
uint64_t raw_value_size = 0;
// the number of blocks in this table
uint64_t num_data_blocks = 0;
// the number of entries in this table
uint64_t num_entries = 0;
// format version, reserved for backward compatibility
uint64_t format_version = 0;
// If 0, key is variable length. Otherwise number of bytes for each key.
uint64_t fixed_key_len = 0;
// The name of the filter policy used in this table.
// If no filter policy is used, `filter_policy_name` will be an empty string.
std::string filter_policy_name;
// user collected properties
UserCollectedProperties user_collected_properties;
// convert this object to a human readable form
// @prop_delim: delimiter for each property.
std::string ToString(const std::string& prop_delim = "; ",
const std::string& kv_delim = "=") const;
};
// table properties' human-readable names in the property block.
struct TablePropertiesNames {
static const std::string kDataSize;
static const std::string kIndexSize;
static const std::string kFilterSize;
static const std::string kRawKeySize;
static const std::string kRawValueSize;
static const std::string kNumDataBlocks;
static const std::string kNumEntries;
static const std::string kFormatVersion;
static const std::string kFixedKeyLen;
static const std::string kFilterPolicy;
};
extern const std::string kPropertiesBlock;
// `TablePropertiesCollector` provides the mechanism for users to collect
// their own interested properties. This class is essentially a collection
// of callback functions that will be invoked during table building.
// It is constructed with TablePropertiesCollectorFactory. The methods don't
// need to be thread-safe, as we will create exactly one
// TablePropertiesCollector object per table and then call it sequentially.
class TablePropertiesCollector {
public:
virtual ~TablePropertiesCollector() {}
// Add() will be called when a new key/value pair is inserted into the table.
// @params key the original key that is inserted into the table.
// @params value the original value that is inserted into the table.
virtual Status Add(const Slice& key, const Slice& value) = 0;
// Finish() will be called when a table has already been built and is ready
// for writing the properties block.
// @params properties User will add their collected statistics to
// `properties`.
virtual Status Finish(UserCollectedProperties* properties) = 0;
// Return the human-readable properties, where the key is property name and
// the value is the human-readable form of value.
virtual UserCollectedProperties GetReadableProperties() const = 0;
// The name of the properties collector can be used for debugging purpose.
virtual const char* Name() const = 0;
};
// Constructs TablePropertiesCollector instances. Internally, a new
// TablePropertiesCollector is created for each new table.
class TablePropertiesCollectorFactory {
public:
virtual ~TablePropertiesCollectorFactory() {}
// has to be thread-safe
virtual TablePropertiesCollector* CreateTablePropertiesCollector() = 0;
// The name of the properties collector can be used for debugging purpose.
virtual const char* Name() const = 0;
};
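// Illustrative sketch (not part of this header): a minimal collector that
// records how many entries and how many raw value bytes went into each table.
// The class names are hypothetical.
class ExampleCountingCollector : public TablePropertiesCollector {
 public:
  virtual Status Add(const Slice& key, const Slice& value) override {
    ++num_entries_;
    value_bytes_ += value.size();
    return Status::OK();
  }
  virtual Status Finish(UserCollectedProperties* properties) override {
    (*properties)["example.num.entries"] = std::to_string(num_entries_);
    (*properties)["example.value.bytes"] = std::to_string(value_bytes_);
    return Status::OK();
  }
  virtual UserCollectedProperties GetReadableProperties() const override {
    UserCollectedProperties readable;
    readable["example.num.entries"] = std::to_string(num_entries_);
    readable["example.value.bytes"] = std::to_string(value_bytes_);
    return readable;
  }
  virtual const char* Name() const override {
    return "ExampleCountingCollector";
  }

 private:
  uint64_t num_entries_ = 0;
  uint64_t value_bytes_ = 0;
};
class ExampleCountingCollectorFactory : public TablePropertiesCollectorFactory {
 public:
  virtual TablePropertiesCollector* CreateTablePropertiesCollector() override {
    return new ExampleCountingCollector;
  }
  virtual const char* Name() const override {
    return "ExampleCountingCollectorFactory";
  }
};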
// Extra properties
// Below is a list of non-basic properties that are collected by the database
// itself, especially some properties regarding the internal keys (which are
// unknown to `table`).
extern uint64_t GetDeletedKeys(const UserCollectedProperties& props);
} // namespace rocksdb


@@ -0,0 +1,104 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#ifndef STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_
#define STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_
#include "rocksdb/status.h"
#include "rocksdb/types.h"
#include "rocksdb/write_batch.h"
#include <memory>
#include <vector>
namespace rocksdb {
class LogFile;
typedef std::vector<std::unique_ptr<LogFile>> VectorLogPtr;
enum WalFileType {
/* Indicates that WAL file is in archive directory. WAL files are moved from
* the main db directory to archive directory once they are not live and stay
* there until cleaned up. Files are cleaned depending on archive size
* (Options::WAL_size_limit_MB) and time since last cleaning
* (Options::WAL_ttl_seconds).
*/
kArchivedLogFile = 0,
/* Indicates that WAL file is live and resides in the main db directory */
kAliveLogFile = 1
} ;
class LogFile {
public:
LogFile() {}
virtual ~LogFile() {}
// Returns log file's pathname relative to the main db dir
// Eg. For a live-log-file = /000003.log
// For an archived-log-file = /archive/000003.log
virtual std::string PathName() const = 0;
// Primary identifier for log file.
// This is directly proportional to creation time of the log file
virtual uint64_t LogNumber() const = 0;
// Log file can be either alive or archived
virtual WalFileType Type() const = 0;
// Starting sequence number of writebatch written in this log file
virtual SequenceNumber StartSequence() const = 0;
// Size of log file on disk in Bytes
virtual uint64_t SizeFileBytes() const = 0;
};
struct BatchResult {
SequenceNumber sequence = 0;
std::unique_ptr<WriteBatch> writeBatchPtr;
};
// A TransactionLogIterator is used to iterate over the transactions in a db.
// One run of the iterator is continuous, i.e. the iterator will stop at the
// beginning of any gap in sequences
class TransactionLogIterator {
public:
TransactionLogIterator() {}
virtual ~TransactionLogIterator() {}
// An iterator is either positioned at a WriteBatch or not valid.
// This method returns true if the iterator is valid.
// Can read data from a valid iterator.
virtual bool Valid() = 0;
// Moves the iterator to the next WriteBatch.
// REQUIRES: Valid() to be true.
virtual void Next() = 0;
// Returns ok if the iterator is valid.
// Returns the Error when something has gone wrong.
virtual Status status() = 0;
  // If valid, returns the current write_batch and the sequence number of the
// earliest transaction contained in the batch.
// ONLY use if Valid() is true and status() is OK.
virtual BatchResult GetBatch() = 0;
// The read options for TransactionLogIterator.
struct ReadOptions {
// If true, all data read from underlying storage will be
// verified against corresponding checksums.
// Default: true
bool verify_checksums_;
ReadOptions() : verify_checksums_(true) {}
explicit ReadOptions(bool verify_checksums)
: verify_checksums_(verify_checksums) {}
};
};
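// Illustrative usage sketch (commented out, not part of this header). It
// assumes "rocksdb/db.h" is included and `db` is an open DB*; GetUpdatesSince
// is the DB entry point that hands back this iterator, and `start_seq` is a
// caller-chosen sequence number.
//
//   std::unique_ptr<TransactionLogIterator> iter;
//   Status s = db->GetUpdatesSince(start_seq, &iter);
//   if (s.ok()) {
//     for (; iter->Valid(); iter->Next()) {
//       BatchResult batch = iter->GetBatch();
//       // batch.sequence is the first sequence number in batch.writeBatchPtr
//     }
//     s = iter->status();  // distinguishes end-of-log from an error
//   }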
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_

20
include/rocksdb/types.h Normal file

@@ -0,0 +1,20 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#ifndef STORAGE_ROCKSDB_INCLUDE_TYPES_H_
#define STORAGE_ROCKSDB_INCLUDE_TYPES_H_
#include <stdint.h>
namespace rocksdb {
// Define all public custom types here.
// Represents a sequence number in a WAL file.
typedef uint64_t SequenceNumber;
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_INCLUDE_TYPES_H_


@@ -0,0 +1,84 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#ifndef STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H
#define STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H
#include <stdint.h>
#include <climits>
#include <vector>
namespace rocksdb {
//
// Algorithm used to make a compaction request stop picking new files
// into a single compaction run
//
enum CompactionStopStyle {
kCompactionStopStyleSimilarSize, // pick files of similar size
kCompactionStopStyleTotalSize // total size of picked files > next file
};
class CompactionOptionsUniversal {
public:
  // Percentage flexibility while comparing file size. If the candidate file(s)
  // size is 1% smaller than the next file's size, then include next file into
  // this candidate set. Default: 1
unsigned int size_ratio;
// The minimum number of files in a single compaction run. Default: 2
unsigned int min_merge_width;
// The maximum number of files in a single compaction run. Default: UINT_MAX
unsigned int max_merge_width;
// The size amplification is defined as the amount (in percentage) of
// additional storage needed to store a single byte of data in the database.
// For example, a size amplification of 2% means that a database that
// contains 100 bytes of user-data may occupy up to 102 bytes of
// physical storage. By this definition, a fully compacted database has
// a size amplification of 0%. Rocksdb uses the following heuristic
// to calculate size amplification: it assumes that all files excluding
// the earliest file contribute to the size amplification.
// Default: 200, which means that a 100 byte database could require up to
// 300 bytes of storage.
unsigned int max_size_amplification_percent;
// If this option is set to be -1 (the default value), all the output files
// will follow the compression type specified.
//
// If this option is not negative, we will try to make sure compressed
// size is just above this value. In normal cases, at least this percentage
// of data will be compressed.
// When we are compacting to a new file, here is the criterion for whether
// it needs to be compressed: assuming the following is the list of files
// sorted by generation time:
// A1...An B1...Bm C1...Ct
// where A1 is the newest and Ct is the oldest, and we are going to compact
// B1...Bm, we calculate the total size of all the files as total_size, as
// well as the total size of C1...Ct as total_C, the compaction output file
// will be compressed iff
// total_C / total_size < this percentage
// Default: -1
int compression_size_percent;
// The algorithm used to stop picking files into a single compaction run
// Default: kCompactionStopStyleTotalSize
CompactionStopStyle stop_style;
// Default set of parameters
CompactionOptionsUniversal()
: size_ratio(1),
min_merge_width(2),
max_merge_width(UINT_MAX),
max_size_amplification_percent(200),
compression_size_percent(-1),
stop_style(kCompactionStopStyleTotalSize) {}
};
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H
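
A short sketch (editorial illustration, not part of the header) of how these fields are typically plugged into an Options object; compaction_style, kCompactionStyleUniversal and compaction_options_universal are assumed to come from rocksdb/options.h, and the values simply restate the defaults documented above.

#include <climits>
#include "rocksdb/options.h"

// Illustrative helper: build Options that use universal compaction.
rocksdb::Options MakeUniversalCompactionOptions() {
  rocksdb::Options options;
  options.compaction_style = rocksdb::kCompactionStyleUniversal;
  rocksdb::CompactionOptionsUniversal univ;
  univ.size_ratio = 1;                        // 1% size flexibility
  univ.min_merge_width = 2;                   // at least two files per run
  univ.max_merge_width = UINT_MAX;            // no upper bound on files per run
  univ.max_size_amplification_percent = 200;  // tolerate up to 3x space usage
  univ.compression_size_percent = -1;         // compress all output files
  univ.stop_style = rocksdb::kCompactionStopStyleTotalSize;
  options.compaction_options_universal = univ;
  return options;
}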

View File

@@ -0,0 +1,252 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#ifndef ROCKSDB_LITE
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
#include <string>
#include <map>
#include <vector>
#include "rocksdb/utilities/stackable_db.h"
#include "rocksdb/env.h"
#include "rocksdb/status.h"
namespace rocksdb {
struct BackupableDBOptions {
// Where to keep the backup files. Has to be different than dbname_
// Best to set this to dbname_ + "/backups"
// Required
std::string backup_dir;
// Backup Env object. It will be used for backup file I/O. If it's
// nullptr, backups will be written out using the DB's Env. If it's
// non-nullptr, backup's I/O will be performed using this object.
// If you want to have backups on HDFS, use HDFS Env here!
// Default: nullptr
Env* backup_env;
// If share_table_files == true, backup will assume that table files with
// same name have the same contents. This enables incremental backups and
// avoids unnecessary data copies.
// If share_table_files == false, each backup will be on its own and will
// not share any data with other backups.
// default: true
bool share_table_files;
// Backup info and error messages will be written to info_log
// if non-nullptr.
// Default: nullptr
Logger* info_log;
// If sync == true, we can guarantee you'll get a consistent backup even
// on a machine crash/reboot. Backup process is slower with sync enabled.
// If sync == false, we don't guarantee anything on machine reboot. However,
// chances are some of the backups are consistent.
// Default: true
bool sync;
// If true, it will delete whatever backups there are already
// Default: false
bool destroy_old_data;
// If false, we won't back up log files. This option can be useful for backing
// up in-memory databases where log files are persisted, but table files are in
// memory.
// Default: true
bool backup_log_files;
// Max bytes that can be transferred in a second during backup.
// If 0, go as fast as you can
// Default: 0
uint64_t backup_rate_limit;
// Max bytes that can be transferred in a second during restore.
// If 0, go as fast as you can
// Default: 0
uint64_t restore_rate_limit;
// Only used if share_table_files is set to true. If true, will consider that
// backups can come from different databases, hence an SST file is not uniquely
// identified by its name, but by the triple (file name, crc32, file length)
// Default: false
// Note: this is an experimental option, and you'll need to set it manually
// *turn it on only if you know what you're doing*
bool share_files_with_checksum;
void Dump(Logger* logger) const;
explicit BackupableDBOptions(const std::string& _backup_dir,
Env* _backup_env = nullptr,
bool _share_table_files = true,
Logger* _info_log = nullptr, bool _sync = true,
bool _destroy_old_data = false,
bool _backup_log_files = true,
uint64_t _backup_rate_limit = 0,
uint64_t _restore_rate_limit = 0)
: backup_dir(_backup_dir),
backup_env(_backup_env),
share_table_files(_share_table_files),
info_log(_info_log),
sync(_sync),
destroy_old_data(_destroy_old_data),
backup_log_files(_backup_log_files),
backup_rate_limit(_backup_rate_limit),
restore_rate_limit(_restore_rate_limit),
share_files_with_checksum(false) {
assert(share_table_files || !share_files_with_checksum);
}
};
struct RestoreOptions {
// If true, restore won't overwrite the existing log files in wal_dir. It will
// also move all log files from archive directory to wal_dir. Use this option
// in combination with BackupableDBOptions::backup_log_files = false for
// persisting in-memory databases.
// Default: false
bool keep_log_files;
explicit RestoreOptions(bool _keep_log_files = false)
: keep_log_files(_keep_log_files) {}
};
typedef uint32_t BackupID;
struct BackupInfo {
BackupID backup_id;
int64_t timestamp;
uint64_t size;
BackupInfo() {}
BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size)
: backup_id(_backup_id), timestamp(_timestamp), size(_size) {}
};
class BackupEngineReadOnly {
public:
virtual ~BackupEngineReadOnly() {}
static BackupEngineReadOnly* NewReadOnlyBackupEngine(
Env* db_env, const BackupableDBOptions& options);
// You can GetBackupInfo safely, even with other BackupEngine performing
// backups on the same directory
virtual void GetBackupInfo(std::vector<BackupInfo>* backup_info) = 0;
// Restoring DB from backup is NOT safe when there is another BackupEngine
// running that might call DeleteBackup() or PurgeOldBackups(). It is the caller's
// responsibility to synchronize the operation, i.e. don't delete the backup
// when you're restoring from it
virtual Status RestoreDBFromBackup(
BackupID backup_id, const std::string& db_dir, const std::string& wal_dir,
const RestoreOptions& restore_options = RestoreOptions()) = 0;
virtual Status RestoreDBFromLatestBackup(
const std::string& db_dir, const std::string& wal_dir,
const RestoreOptions& restore_options = RestoreOptions()) = 0;
};
// Please see the documentation in BackupableDB and RestoreBackupableDB
class BackupEngine {
public:
virtual ~BackupEngine() {}
static BackupEngine* NewBackupEngine(Env* db_env,
const BackupableDBOptions& options);
virtual Status CreateNewBackup(DB* db, bool flush_before_backup = false) = 0;
virtual Status PurgeOldBackups(uint32_t num_backups_to_keep) = 0;
virtual Status DeleteBackup(BackupID backup_id) = 0;
virtual void StopBackup() = 0;
virtual void GetBackupInfo(std::vector<BackupInfo>* backup_info) = 0;
virtual Status RestoreDBFromBackup(
BackupID backup_id, const std::string& db_dir, const std::string& wal_dir,
const RestoreOptions& restore_options = RestoreOptions()) = 0;
virtual Status RestoreDBFromLatestBackup(
const std::string& db_dir, const std::string& wal_dir,
const RestoreOptions& restore_options = RestoreOptions()) = 0;
};
// Stack your DB with BackupableDB to be able to backup the DB
class BackupableDB : public StackableDB {
public:
// BackupableDBOptions have to be the same as the ones used in a previous
// incarnation of the DB
//
// BackupableDB owns the pointer `DB* db` now. You should not delete it or
// use it after the invocation of BackupableDB
BackupableDB(DB* db, const BackupableDBOptions& options);
virtual ~BackupableDB();
// Captures the state of the database in the latest backup
// NOT a thread safe call
Status CreateNewBackup(bool flush_before_backup = false);
// Returns info about backups in backup_info
void GetBackupInfo(std::vector<BackupInfo>* backup_info);
// deletes old backups, keeping latest num_backups_to_keep alive
Status PurgeOldBackups(uint32_t num_backups_to_keep);
// deletes a specific backup
Status DeleteBackup(BackupID backup_id);
// Call this from another thread if you want to stop the backup
// that is currently happening. It will return immediately and will
// not wait for the backup to stop.
// The backup will stop ASAP and the call to CreateNewBackup will
// return Status::Incomplete(). It will not clean up after itself, but
// the state will remain consistent. The state will be cleaned up
// next time you create BackupableDB or RestoreBackupableDB.
void StopBackup();
private:
BackupEngine* backup_engine_;
};
// Use this class to access information about backups and restore from them
class RestoreBackupableDB {
public:
RestoreBackupableDB(Env* db_env, const BackupableDBOptions& options);
~RestoreBackupableDB();
// Returns info about backups in backup_info
void GetBackupInfo(std::vector<BackupInfo>* backup_info);
// restore from backup with backup_id
// IMPORTANT -- if options_.share_table_files == true and you restore DB
// from some backup that is not the latest, and you start creating new
// backups from the new DB, they will probably fail
//
// Example: Let's say you have backups 1, 2, 3, 4, 5 and you restore 3.
// If you add new data to the DB and try creating a new backup now, the
// database will diverge from backups 4 and 5 and the new backup will fail.
// If you want to create new backup, you will first have to delete backups 4
// and 5.
Status RestoreDBFromBackup(BackupID backup_id, const std::string& db_dir,
const std::string& wal_dir,
const RestoreOptions& restore_options =
RestoreOptions());
// restore from the latest backup
Status RestoreDBFromLatestBackup(const std::string& db_dir,
const std::string& wal_dir,
const RestoreOptions& restore_options =
RestoreOptions());
// deletes old backups, keeping latest num_backups_to_keep alive
Status PurgeOldBackups(uint32_t num_backups_to_keep);
// deletes a specific backup
Status DeleteBackup(BackupID backup_id);
private:
BackupEngine* backup_engine_;
};
} // namespace rocksdb
#endif // ROCKSDB_LITE
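
A minimal sketch (editorial illustration, not part of the header) of the backup/restore flow described above; the /tmp paths are illustrative, BackupableDB takes ownership of the wrapped DB, and the restore goes into a separate directory so no other BackupEngine is touching the backups while restoring.

#include "rocksdb/db.h"
#include "rocksdb/utilities/backupable_db.h"

// Illustrative helper: take one backup, then restore the latest one elsewhere.
rocksdb::Status BackupAndRestore() {
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/rocksdb_example", &db);
  if (!s.ok()) {
    return s;
  }
  // BackupableDB owns `db` from here on.
  rocksdb::BackupableDB backupable_db(
      db, rocksdb::BackupableDBOptions("/tmp/rocksdb_example_backups"));
  s = backupable_db.CreateNewBackup(true /* flush_before_backup */);
  if (!s.ok()) {
    return s;
  }
  // Restore the latest backup into a different directory.
  rocksdb::RestoreBackupableDB restore(
      rocksdb::Env::Default(),
      rocksdb::BackupableDBOptions("/tmp/rocksdb_example_backups"));
  return restore.RestoreDBFromLatestBackup("/tmp/rocksdb_example_restored",
                                           "/tmp/rocksdb_example_restored");
}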

View File

@@ -0,0 +1,68 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#ifndef ROCKSDB_LITE
#include <string>
#include <vector>
#include "rocksdb/utilities/stackable_db.h"
#include "rocksdb/db.h"
namespace rocksdb {
// Database with TTL support.
//
// USE-CASES:
// This API should be used to open the db when key-values inserted are
// meant to be removed from the db in a non-strict 'ttl' amount of time.
// Therefore, this guarantees that key-values inserted will remain in the
// db for >= ttl amount of time and the db will make efforts to remove the
// key-values as soon as possible after ttl seconds of their insertion.
//
// BEHAVIOUR:
// TTL is accepted in seconds
// (int32_t)Timestamp(creation) is suffixed to values in Put internally
// Expired TTL values are deleted in compaction only: (Timestamp + ttl < time_now)
// Get/Iterator may return expired entries (compaction has not run on them yet)
// Different TTL may be used during different Opens
// Example: Open1 at t=0 with ttl=4 and insert k1,k2, close at t=2
// Open2 at t=3 with ttl=5. Now k1,k2 should be deleted at t>=5
// read_only=true opens in the usual read-only mode. Compactions will not be
// triggered (neither manual nor automatic), so no expired entries are removed
//
// CONSTRAINTS:
// Not specifying/passing or non-positive TTL behaves like TTL = infinity
//
// !!!WARNING!!!:
// Calling DB::Open directly to re-open a db created by this API will return
// corrupt values (timestamp suffixed) and there will be no TTL effect
// during the second Open, so use this API consistently to open the db
// Be careful when passing ttl with a small positive value because the
// whole database may be deleted in a small amount of time
class DBWithTTL : public StackableDB {
public:
virtual Status CreateColumnFamilyWithTtl(
const ColumnFamilyOptions& options, const std::string& column_family_name,
ColumnFamilyHandle** handle, int ttl) = 0;
static Status Open(const Options& options, const std::string& dbname,
DBWithTTL** dbptr, int32_t ttl = 0,
bool read_only = false);
static Status Open(const DBOptions& db_options, const std::string& dbname,
const std::vector<ColumnFamilyDescriptor>& column_families,
std::vector<ColumnFamilyHandle*>* handles,
DBWithTTL** dbptr, std::vector<int32_t> ttls,
bool read_only = false);
protected:
explicit DBWithTTL(DB* db) : StackableDB(db) {}
};
} // namespace rocksdb
#endif // ROCKSDB_LITE
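
A minimal sketch (editorial illustration, not part of the header) of opening a TTL database as described above; the path and the 300-second TTL are illustrative.

#include "rocksdb/utilities/db_ttl.h"

// Illustrative helper: open a DB whose entries expire ~300 seconds after Put.
rocksdb::Status OpenWithTtl() {
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::DBWithTTL* db = nullptr;
  rocksdb::Status s =
      rocksdb::DBWithTTL::Open(options, "/tmp/rocksdb_ttl_example", &db, 300);
  if (!s.ok()) {
    return s;
  }
  // Values are timestamp-suffixed internally; expired entries are dropped
  // during compaction, not on read.
  s = db->Put(rocksdb::WriteOptions(), "key", "value");
  delete db;
  return s;
}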

View File

@@ -0,0 +1,149 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#ifndef ROCKSDB_LITE
#include <string>
#include <vector>
#include "rocksdb/utilities/stackable_db.h"
#include "rocksdb/utilities/json_document.h"
#include "rocksdb/db.h"
namespace rocksdb {
// IMPORTANT: DocumentDB is a work in progress. It is unstable and we might
// change the API without warning. Talk to RocksDB team before using this in
// production ;)
// DocumentDB is a layer on top of RocksDB that provides a very simple JSON API.
// When creating a DB, you specify a list of indexes you want to keep on your
// data. You can insert a JSON document to the DB, which is automatically
// indexed. Every document added to the DB needs to have an "_id" field, which
// is automatically indexed and is a unique primary key. All other indexes are
// non-unique.
// NOTE: field names in the JSON are NOT allowed to start with '$' or
// contain '.'. We don't currently enforce that rule, but the DB will start
// behaving badly if it is violated.
// Cursor is what you get as a result of executing a query. To get all
// results from a query, call Next() on a Cursor while Valid() returns true
class Cursor {
public:
Cursor() = default;
virtual ~Cursor() {}
virtual bool Valid() const = 0;
virtual void Next() = 0;
// Lifecycle of the returned JSONDocument is until the next Next() call
virtual const JSONDocument& document() const = 0;
virtual Status status() const = 0;
private:
// No copying allowed
Cursor(const Cursor&);
void operator=(const Cursor&);
};
struct DocumentDBOptions {
int background_threads = 4;
uint64_t memtable_size = 128 * 1024 * 1024; // 128 MB
uint64_t cache_size = 1 * 1024 * 1024 * 1024; // 1 GB
};
// TODO(icanadi) Add `JSONDocument* info` parameter to all calls that can be
// used by the caller to get more information about the call execution (number
// of dropped records, number of updated records, etc.)
class DocumentDB : public StackableDB {
public:
struct IndexDescriptor {
// Currently, you can only define an index on a single field. To specify an
// index on a field X, set index description to JSON "{X: 1}"
// Currently the value needs to be 1, which means ascending.
// In the future, we plan to also support indexes on multiple keys, where
// you could mix ascending sorting (1) with descending sorting indexes (-1)
JSONDocument* description;
std::string name;
};
// Open DocumentDB with specified indexes. The list of indexes has to be
// complete, i.e. include all indexes present in the DB, except the primary
// key index.
// Otherwise, Open() will return an error
static Status Open(const DocumentDBOptions& options, const std::string& name,
const std::vector<IndexDescriptor>& indexes,
DocumentDB** db, bool read_only = false);
explicit DocumentDB(DB* db) : StackableDB(db) {}
// Create a new index. It will stop all writes for the duration of the call.
// All current documents in the DB are scanned and corresponding index entries
// are created
virtual Status CreateIndex(const WriteOptions& write_options,
const IndexDescriptor& index) = 0;
// Drop an index. The client is responsible for making sure that the index is
// not being used by currently executing queries
virtual Status DropIndex(const std::string& name) = 0;
// Insert a document to the DB. The document needs to have a primary key "_id"
// which can either be a string or an integer. Otherwise the write will fail
// with InvalidArgument.
virtual Status Insert(const WriteOptions& options,
const JSONDocument& document) = 0;
// Deletes all documents matching a filter atomically
virtual Status Remove(const ReadOptions& read_options,
const WriteOptions& write_options,
const JSONDocument& query) = 0;
// Does this sequence of operations:
// 1. Find all documents matching a filter
// 2. For all documents, atomically:
// 2.1. apply the update operators
// 2.2. update the secondary indexes
//
// Currently only $set update operator is supported.
// Syntax is: {$set: {key1: value1, key2: value2, etc...}}
// This operator will change a document's key1 field to value1, key2 to
// value2, etc. New values will be set even if a document didn't have an entry
// for the specified key.
//
// You cannot change the primary key of a document.
//
// Update example: Update({id: {$gt: 5}, $index: id}, {$set: {enabled: true}})
virtual Status Update(const ReadOptions& read_options,
const WriteOptions& write_options,
const JSONDocument& filter,
const JSONDocument& updates) = 0;
// query has to be an array in which every element is an operator. Currently
// only $filter operator is supported. Syntax of $filter operator is:
// {$filter: {key1: condition1, key2: condition2, etc.}} where conditions can
// be either:
// 1) a single value, in which case the condition is an equality condition, or
// 2) a defined operator, like {$gt: 4}, which will match all documents that
// have the key greater than 4.
//
// Supported operators are:
// 1) $gt -- greater than
// 2) $gte -- greater than or equal
// 3) $lt -- less than
// 4) $lte -- less than or equal
// If you want the filter to use an index, you need to specify it like this:
// {$filter: {...(conditions)..., $index: index_name}}
//
// Example query:
// * [{$filter: {name: John, age: {$gte: 18}, $index: age}}]
// will return all Johns whose age is greater than or equal to 18, and it will
// use index "age" to satisfy the query.
virtual Cursor* Query(const ReadOptions& read_options,
const JSONDocument& query) = 0;
};
} // namespace rocksdb
#endif // ROCKSDB_LITE
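
A minimal sketch (editorial illustration, not part of the header) of the insert path described above; the path is illustrative, no secondary indexes are declared, and whether Open() creates a missing database is an assumption of this sketch.

#include <memory>
#include <vector>
#include "rocksdb/utilities/document_db.h"

// Illustrative helper: open a DocumentDB and insert one JSON document.
rocksdb::Status InsertOneDocument() {
  rocksdb::DocumentDBOptions db_options;
  std::vector<rocksdb::DocumentDB::IndexDescriptor> indexes;  // none besides "_id"
  rocksdb::DocumentDB* db = nullptr;
  rocksdb::Status s = rocksdb::DocumentDB::Open(
      db_options, "/tmp/rocksdb_document_example", indexes, &db);
  if (!s.ok()) {
    return s;
  }
  std::unique_ptr<rocksdb::JSONDocument> doc(rocksdb::JSONDocument::ParseJSON(
      "{\"_id\": 1, \"name\": \"John\", \"age\": 20}"));
  if (doc != nullptr) {
    s = db->Insert(rocksdb::WriteOptions(), *doc);
  }
  delete db;
  return s;
}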

View File

@@ -0,0 +1,105 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
#ifndef ROCKSDB_LITE
#pragma once
#include <string>
#include <vector>
#include "rocksdb/utilities/stackable_db.h"
#include "rocksdb/status.h"
namespace rocksdb {
//
// Configurable options needed for setting up a Geo database
//
struct GeoDBOptions {
// Backup info and error messages will be written to info_log
// if non-nullptr.
// Default: nullptr
Logger* info_log;
explicit GeoDBOptions(Logger* _info_log = nullptr):info_log(_info_log) { }
};
//
// A position in the earth's geoid
//
class GeoPosition {
public:
double latitude;
double longitude;
explicit GeoPosition(double la = 0, double lo = 0) :
latitude(la), longitude(lo) {
}
};
//
// Description of an object on the Geoid. It is located by a GPS location,
// and is identified by the id. The value associated with this object is
// an opaque string 'value'. Different objects identified by unique id's
// can have the same gps-location associated with them.
//
class GeoObject {
public:
GeoPosition position;
std::string id;
std::string value;
GeoObject() {}
GeoObject(const GeoPosition& pos, const std::string& i,
const std::string& val) :
position(pos), id(i), value(val) {
}
};
//
// Stack your DB with GeoDB to be able to get geo-spatial support
//
class GeoDB : public StackableDB {
public:
// GeoDBOptions have to be the same as the ones used in a previous
// incarnation of the DB
//
// GeoDB owns the pointer `DB* db` now. You should not delete it or
// use it after the invocation of GeoDB
// GeoDB(DB* db, const GeoDBOptions& options) : StackableDB(db) {}
GeoDB(DB* db, const GeoDBOptions& options) : StackableDB(db) {}
virtual ~GeoDB() {}
// Insert a new object into the location database. The object is
// uniquely identified by the id. If an object with the same id already
// exists in the db, then the old one is overwritten by the new
// object being inserted here.
virtual Status Insert(const GeoObject& object) = 0;
// Retrieve the value of the object located at the specified GPS
// location and identified by the 'id'.
virtual Status GetByPosition(const GeoPosition& pos,
const Slice& id, std::string* value) = 0;
// Retrieve the value of the object identified by the 'id'. This method
// could be potentially slower than GetByPosition
virtual Status GetById(const Slice& id, GeoObject* object) = 0;
// Delete the specified object
virtual Status Remove(const Slice& id) = 0;
// Returns a list of all items within a circular radius from the
// specified gps location. If 'number_of_values' is specified,
// then this call returns at most that many objects.
// The radius is specified in 'meters'.
virtual Status SearchRadial(const GeoPosition& pos,
double radius,
std::vector<GeoObject>* values,
int number_of_values = INT_MAX) = 0;
};
} // namespace rocksdb
#endif // ROCKSDB_LITE
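
A minimal sketch (editorial illustration, not part of the header) of the insert/search calls, assuming `geo_db` points at a concrete GeoDB implementation obtained elsewhere; the coordinates, id, value and radius are illustrative.

#include <vector>
#include "rocksdb/utilities/geo_db.h"

// Illustrative helper: store one object and search around a position.
rocksdb::Status InsertAndSearch(rocksdb::GeoDB* geo_db) {
  rocksdb::GeoObject object(rocksdb::GeoPosition(37.7749, -122.4194),
                            "object-1", "opaque value blob");
  rocksdb::Status s = geo_db->Insert(object);
  if (!s.ok()) {
    return s;
  }
  std::vector<rocksdb::GeoObject> results;
  // Everything within 5000 meters of the query position, at most 10 objects.
  return geo_db->SearchRadial(rocksdb::GeoPosition(37.77, -122.42), 5000,
                              &results, 10);
}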

View File

@@ -0,0 +1,174 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#ifndef ROCKSDB_LITE
#include <string>
#include <map>
#include <unordered_map>
#include <vector>
#include "rocksdb/slice.h"
// We use JSONDocument for DocumentDB API
// Implementation inspired by folly::dynamic and rapidjson
namespace rocksdb {
// NOTE: none of this is thread-safe
class JSONDocument {
public:
// return nullptr on parse failure
static JSONDocument* ParseJSON(const char* json);
enum Type {
kNull,
kArray,
kBool,
kDouble,
kInt64,
kObject,
kString,
};
JSONDocument(); // null
/* implicit */ JSONDocument(bool b);
/* implicit */ JSONDocument(double d);
/* implicit */ JSONDocument(int64_t i);
/* implicit */ JSONDocument(const std::string& s);
/* implicit */ JSONDocument(const char* s);
// constructs JSONDocument of specific type with default value
explicit JSONDocument(Type type);
// copy constructor
JSONDocument(const JSONDocument& json_document);
~JSONDocument();
Type type() const;
// REQUIRES: IsObject()
bool Contains(const std::string& key) const;
// Returns nullptr if !Contains()
// don't delete the returned pointer
// REQUIRES: IsObject()
const JSONDocument* Get(const std::string& key) const;
// REQUIRES: IsObject()
JSONDocument& operator[](const std::string& key);
// REQUIRES: IsObject()
const JSONDocument& operator[](const std::string& key) const;
// returns `this`, so you can chain operations.
// Copies value
// REQUIRES: IsObject()
JSONDocument* Set(const std::string& key, const JSONDocument& value);
// REQUIRES: IsArray() == true || IsObject() == true
size_t Count() const;
// REQUIRES: IsArray()
const JSONDocument* GetFromArray(size_t i) const;
// REQUIRES: IsArray()
JSONDocument& operator[](size_t i);
// REQUIRES: IsArray()
const JSONDocument& operator[](size_t i) const;
// returns `this`, so you can chain operations.
// Copies the value
// REQUIRES: IsArray() && i < Count()
JSONDocument* SetInArray(size_t i, const JSONDocument& value);
// REQUIRES: IsArray()
JSONDocument* PushBack(const JSONDocument& value);
bool IsNull() const;
bool IsArray() const;
bool IsBool() const;
bool IsDouble() const;
bool IsInt64() const;
bool IsObject() const;
bool IsString() const;
// REQUIRES: IsBool() == true
bool GetBool() const;
// REQUIRES: IsDouble() == true
double GetDouble() const;
// REQUIRES: IsInt64() == true
int64_t GetInt64() const;
// REQUIRES: IsString() == true
const std::string& GetString() const;
bool operator==(const JSONDocument& rhs) const;
std::string DebugString() const;
private:
class ItemsIteratorGenerator;
public:
// REQUIRES: IsObject()
ItemsIteratorGenerator Items() const;
// appends serialized object to dst
void Serialize(std::string* dst) const;
// returns nullptr if Slice doesn't represent valid serialized JSONDocument
static JSONDocument* Deserialize(const Slice& src);
private:
void SerializeInternal(std::string* dst, bool type_prefix) const;
// returns false if Slice doesn't represent valid serialized JSONDocument.
// Otherwise, true
bool DeserializeInternal(Slice* input);
typedef std::vector<JSONDocument*> Array;
typedef std::unordered_map<std::string, JSONDocument*> Object;
// iteration on objects
class const_item_iterator {
public:
typedef Object::const_iterator It;
typedef Object::value_type value_type;
/* implicit */ const_item_iterator(It it) : it_(it) {}
It& operator++() { return ++it_; }
bool operator!=(const const_item_iterator& other) {
return it_ != other.it_;
}
value_type operator*() { return *it_; }
private:
It it_;
};
class ItemsIteratorGenerator {
public:
/* implicit */ ItemsIteratorGenerator(const Object& object)
: object_(object) {}
const_item_iterator begin() { return object_.begin(); }
const_item_iterator end() { return object_.end(); }
private:
const Object& object_;
};
union Data {
Data() : n(nullptr) {}
~Data() {}
void* n;
Array a;
bool b;
double d;
int64_t i;
std::string s;
Object o;
} data_;
const Type type_;
// Our serialization format's first byte specifies the encoding version. That
// way, we can easily change our format while providing backwards
// compatibility. This constant specifies the current version of the
// serialization format
static const char kSerializationFormatVersion;
};
} // namespace rocksdb
#endif // ROCKSDB_LITE
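
A minimal sketch (editorial illustration, not part of the header) of parsing, field access and the binary round trip described above; the JSON payload is illustrative.

#include <stdint.h>
#include <memory>
#include <string>
#include "rocksdb/utilities/json_document.h"

// Illustrative helper: parse a document, read a field, round-trip serialize it.
void JsonDocumentRoundTrip() {
  std::unique_ptr<rocksdb::JSONDocument> doc(rocksdb::JSONDocument::ParseJSON(
      "{\"name\": \"John\", \"age\": 20}"));
  if (doc == nullptr) {
    return;  // parse failure
  }
  if (doc->Contains("age") && doc->Get("age")->IsInt64()) {
    int64_t age = doc->Get("age")->GetInt64();
    (void)age;
  }
  std::string serialized;
  doc->Serialize(&serialized);
  // Deserialize returns nullptr if the input is not a valid serialized doc.
  std::unique_ptr<rocksdb::JSONDocument> copy(
      rocksdb::JSONDocument::Deserialize(rocksdb::Slice(serialized)));
}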

View File

@@ -0,0 +1,236 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#ifndef ROCKSDB_LITE
#include <string>
#include <unordered_map>
#include <vector>
#include "rocksdb/db.h"
#include "rocksdb/slice.h"
#include "rocksdb/utilities/stackable_db.h"
namespace rocksdb {
namespace spatial {
// NOTE: SpatialDB is experimental and we might change its API without warning.
// Please talk to us before developing against SpatialDB API.
//
// SpatialDB provides support for spatial indexes built on top of RocksDB.
// When creating a new SpatialDB, clients specify a list of spatial indexes to
// build on their data. Each spatial index is defined by the area and
// granularity. If you're storing map data, different spatial index
// granularities can be used for different zoom levels.
//
// Each element inserted into SpatialDB has:
// * a bounding box, which determines how the element will be indexed
// * a string blob, which will usually be the WKB representation of the polygon
// (http://en.wikipedia.org/wiki/Well-known_text)
// * feature set, which is a map of key-value pairs, where value can be null,
// int, double, bool, string
// * a list of indexes to insert the element in
//
// Each query is executed on a single spatial index. Query guarantees that it
// will return all elements intersecting the specified bounding box, but it
// might also return some extra non-intersecting elements.
// Variant is a class that can be many things: null, bool, int, double or string
// It is used to store different value types in FeatureSet (see below)
struct Variant {
// Don't change the values here, they are persisted on disk
enum Type {
kNull = 0x0,
kBool = 0x1,
kInt = 0x2,
kDouble = 0x3,
kString = 0x4,
};
Variant() : type_(kNull) {}
/* implicit */ Variant(bool b) : type_(kBool) { data_.b = b; }
/* implicit */ Variant(uint64_t i) : type_(kInt) { data_.i = i; }
/* implicit */ Variant(double d) : type_(kDouble) { data_.d = d; }
/* implicit */ Variant(const std::string& s) : type_(kString) {
new (&data_.s) std::string(s);
}
Variant(const Variant& v);
~Variant() {
if (type_ == kString) {
using std::string;
(&data_.s)->~string();
}
}
Type type() const { return type_; }
bool get_bool() const { return data_.b; }
uint64_t get_int() const { return data_.i; }
double get_double() const { return data_.d; }
const std::string& get_string() const { return data_.s; }
bool operator==(const Variant& other);
bool operator!=(const Variant& other);
private:
Type type_;
union Data {
Data() {}
~Data() {}
bool b;
uint64_t i;
double d;
std::string s;
} data_;
};
// FeatureSet is a map of key-value pairs. One feature set is associated with
// each element in SpatialDB. It can be used to add rich data about the element.
class FeatureSet {
private:
typedef std::unordered_map<std::string, Variant> map;
public:
class iterator {
public:
/* implicit */ iterator(const map::const_iterator itr) : itr_(itr) {}
iterator& operator++() {
++itr_;
return *this;
}
bool operator!=(const iterator& other) { return itr_ != other.itr_; }
bool operator==(const iterator& other) { return itr_ == other.itr_; }
map::value_type operator*() { return *itr_; }
private:
map::const_iterator itr_;
};
FeatureSet() = default;
FeatureSet* Set(const std::string& key, const Variant& value);
bool Contains(const std::string& key) const;
// REQUIRES: Contains(key)
const Variant& Get(const std::string& key) const;
iterator Find(const std::string& key) const;
iterator begin() const { return map_.begin(); }
iterator end() const { return map_.end(); }
void Clear();
size_t Size() const { return map_.size(); }
void Serialize(std::string* output) const;
// REQUIRED: empty FeatureSet
bool Deserialize(const Slice& input);
std::string DebugString() const;
private:
map map_;
};
// BoundingBox is a helper structure for defining rectangles representing
// bounding boxes of spatial elements.
template <typename T>
struct BoundingBox {
T min_x, min_y, max_x, max_y;
BoundingBox() = default;
BoundingBox(T _min_x, T _min_y, T _max_x, T _max_y)
: min_x(_min_x), min_y(_min_y), max_x(_max_x), max_y(_max_y) {}
bool Intersects(const BoundingBox<T>& a) const {
return !(min_x > a.max_x || min_y > a.max_y || a.min_x > max_x ||
a.min_y > max_y);
}
};
struct SpatialDBOptions {
uint64_t cache_size = 1 * 1024 * 1024 * 1024LL; // 1GB
int num_threads = 16;
bool bulk_load = true;
};
// Cursor is used to return data from the query to the client. To get all the
// data from the query, just call Next() while Valid() is true
class Cursor {
public:
Cursor() = default;
virtual ~Cursor() {}
virtual bool Valid() const = 0;
// REQUIRES: Valid()
virtual void Next() = 0;
// The returned slice is valid until the next call to Next()
// REQUIRES: Valid()
virtual const Slice blob() = 0;
// The returned feature set is valid until the next call to Next()
// REQUIRES: Valid()
virtual const FeatureSet& feature_set() = 0;
virtual Status status() const = 0;
private:
// No copying allowed
Cursor(const Cursor&);
void operator=(const Cursor&);
};
// SpatialIndexOptions defines a spatial index that will be built on the data
struct SpatialIndexOptions {
// Spatial indexes are referenced by names
std::string name;
// An area that is indexed. If the element is not intersecting with spatial
// index's bbox, it will not be inserted into the index
BoundingBox<double> bbox;
// tile_bits controls the granularity of the spatial index. Each dimension of
// the bbox will be split into (1 << tile_bits) tiles, so there will be a
// total of (1 << tile_bits)^2 tiles. It is recommended to configure the tile
// size to be approximately the size of a typical query on that spatial index
uint32_t tile_bits;
SpatialIndexOptions() {}
SpatialIndexOptions(const std::string& _name,
const BoundingBox<double>& _bbox, uint32_t _tile_bits)
: name(_name), bbox(_bbox), tile_bits(_tile_bits) {}
};
class SpatialDB : public StackableDB {
public:
// Creates the SpatialDB with specified list of indexes.
// REQUIRED: db doesn't exist
static Status Create(const SpatialDBOptions& options, const std::string& name,
const std::vector<SpatialIndexOptions>& spatial_indexes);
// Open the existing SpatialDB. The resulting db object will be returned
// through db parameter.
// REQUIRED: db was created using SpatialDB::Create
static Status Open(const SpatialDBOptions& options, const std::string& name,
SpatialDB** db, bool read_only = false);
explicit SpatialDB(DB* db) : StackableDB(db) {}
// Insert the element into the DB. Element will be inserted into specified
// spatial_indexes, based on specified bbox.
// REQUIRES: spatial_indexes.size() > 0
virtual Status Insert(const WriteOptions& write_options,
const BoundingBox<double>& bbox, const Slice& blob,
const FeatureSet& feature_set,
const std::vector<std::string>& spatial_indexes) = 0;
// Calling Compact() after inserting a bunch of elements should speed up
// reading. This is especially useful if you use SpatialDBOptions::bulk_load
virtual Status Compact() = 0;
// Query the specified spatial_index. Query will return all elements that
// intersect bbox, but it may also return some extra elements.
virtual Cursor* Query(const ReadOptions& read_options,
const BoundingBox<double>& bbox,
const std::string& spatial_index) = 0;
};
} // namespace spatial
} // namespace rocksdb
#endif // ROCKSDB_LITE
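
A minimal sketch (editorial illustration, not part of the header) of Create/Open/Insert under an illustrative path, with one index named "main" over a 100x100 bounding box split into 32x32 tiles; the blob contents and feature values are placeholders.

#include <string>
#include <vector>
#include "rocksdb/utilities/spatial_db.h"

// Illustrative helper: create a spatial DB with one index and insert a blob.
rocksdb::Status CreateAndInsert() {
  using namespace rocksdb::spatial;
  SpatialDBOptions options;
  std::vector<SpatialIndexOptions> indexes = {
      SpatialIndexOptions("main", BoundingBox<double>(0, 0, 100, 100), 5)};
  rocksdb::Status s =
      SpatialDB::Create(options, "/tmp/rocksdb_spatial_example", indexes);
  if (!s.ok()) {
    return s;
  }
  SpatialDB* db = nullptr;
  s = SpatialDB::Open(options, "/tmp/rocksdb_spatial_example", &db);
  if (!s.ok()) {
    return s;
  }
  FeatureSet features;
  features.Set("name", std::string("point of interest"));
  s = db->Insert(rocksdb::WriteOptions(), BoundingBox<double>(10, 10, 11, 11),
                 "wkb-or-other-opaque-blob", features, {"main"});
  delete db;
  return s;
}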

View File

@@ -0,0 +1,226 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include "rocksdb/db.h"
namespace rocksdb {
// This class contains APIs to stack RocksDB wrappers, e.g. stack TTL over a base DB
class StackableDB : public DB {
public:
// StackableDB is the owner of db now!
explicit StackableDB(DB* db) : db_(db) {}
~StackableDB() {
delete db_;
}
virtual DB* GetBaseDB() {
return db_;
}
virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
const std::string& column_family_name,
ColumnFamilyHandle** handle) {
return db_->CreateColumnFamily(options, column_family_name, handle);
}
virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) {
return db_->DropColumnFamily(column_family);
}
using DB::Put;
virtual Status Put(const WriteOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
const Slice& val) override {
return db_->Put(options, column_family, key, val);
}
using DB::Get;
virtual Status Get(const ReadOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
std::string* value) override {
return db_->Get(options, column_family, key, value);
}
using DB::MultiGet;
virtual std::vector<Status> MultiGet(
const ReadOptions& options,
const std::vector<ColumnFamilyHandle*>& column_family,
const std::vector<Slice>& keys,
std::vector<std::string>* values) override {
return db_->MultiGet(options, column_family, keys, values);
}
using DB::KeyMayExist;
virtual bool KeyMayExist(const ReadOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
std::string* value,
bool* value_found = nullptr) override {
return db_->KeyMayExist(options, column_family, key, value, value_found);
}
using DB::Delete;
virtual Status Delete(const WriteOptions& wopts,
ColumnFamilyHandle* column_family,
const Slice& key) override {
return db_->Delete(wopts, column_family, key);
}
using DB::Merge;
virtual Status Merge(const WriteOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value) override {
return db_->Merge(options, column_family, key, value);
}
virtual Status Write(const WriteOptions& opts, WriteBatch* updates)
override {
return db_->Write(opts, updates);
}
using DB::NewIterator;
virtual Iterator* NewIterator(const ReadOptions& opts,
ColumnFamilyHandle* column_family) override {
return db_->NewIterator(opts, column_family);
}
virtual Status NewIterators(
const ReadOptions& options,
const std::vector<ColumnFamilyHandle*>& column_families,
std::vector<Iterator*>* iterators) {
return db_->NewIterators(options, column_families, iterators);
}
virtual const Snapshot* GetSnapshot() override {
return db_->GetSnapshot();
}
virtual void ReleaseSnapshot(const Snapshot* snapshot) override {
return db_->ReleaseSnapshot(snapshot);
}
using DB::GetProperty;
virtual bool GetProperty(ColumnFamilyHandle* column_family,
const Slice& property, std::string* value) override {
return db_->GetProperty(column_family, property, value);
}
using DB::GetIntProperty;
virtual bool GetIntProperty(ColumnFamilyHandle* column_family,
const Slice& property, uint64_t* value) override {
return db_->GetIntProperty(column_family, property, value);
}
using DB::GetApproximateSizes;
virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
const Range* r, int n,
uint64_t* sizes) override {
return db_->GetApproximateSizes(column_family, r, n, sizes);
}
using DB::CompactRange;
virtual Status CompactRange(ColumnFamilyHandle* column_family,
const Slice* begin, const Slice* end,
bool reduce_level = false, int target_level = -1,
uint32_t target_path_id = 0) override {
return db_->CompactRange(column_family, begin, end, reduce_level,
target_level, target_path_id);
}
using DB::NumberLevels;
virtual int NumberLevels(ColumnFamilyHandle* column_family) override {
return db_->NumberLevels(column_family);
}
using DB::MaxMemCompactionLevel;
virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family)
override {
return db_->MaxMemCompactionLevel(column_family);
}
using DB::Level0StopWriteTrigger;
virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family)
override {
return db_->Level0StopWriteTrigger(column_family);
}
virtual const std::string& GetName() const override {
return db_->GetName();
}
virtual Env* GetEnv() const override {
return db_->GetEnv();
}
using DB::GetOptions;
virtual const Options& GetOptions(ColumnFamilyHandle* column_family) const
override {
return db_->GetOptions(column_family);
}
using DB::Flush;
virtual Status Flush(const FlushOptions& fopts,
ColumnFamilyHandle* column_family) override {
return db_->Flush(fopts, column_family);
}
virtual Status DisableFileDeletions() override {
return db_->DisableFileDeletions();
}
virtual Status EnableFileDeletions(bool force) override {
return db_->EnableFileDeletions(force);
}
virtual void GetLiveFilesMetaData(
std::vector<LiveFileMetaData>* metadata) override {
db_->GetLiveFilesMetaData(metadata);
}
virtual Status GetLiveFiles(std::vector<std::string>& vec, uint64_t* mfs,
bool flush_memtable = true) override {
return db_->GetLiveFiles(vec, mfs, flush_memtable);
}
virtual SequenceNumber GetLatestSequenceNumber() const override {
return db_->GetLatestSequenceNumber();
}
virtual Status GetSortedWalFiles(VectorLogPtr& files) override {
return db_->GetSortedWalFiles(files);
}
virtual Status DeleteFile(std::string name) override {
return db_->DeleteFile(name);
}
virtual Status GetDbIdentity(std::string& identity) {
return db_->GetDbIdentity(identity);
}
using DB::GetPropertiesOfAllTables;
virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
TablePropertiesCollection* props) {
return db_->GetPropertiesOfAllTables(column_family, props);
}
virtual Status GetUpdatesSince(
SequenceNumber seq_number, unique_ptr<TransactionLogIterator>* iter,
const TransactionLogIterator::ReadOptions& read_options) override {
return db_->GetUpdatesSince(seq_number, iter, read_options);
}
virtual ColumnFamilyHandle* DefaultColumnFamily() const override {
return db_->DefaultColumnFamily();
}
protected:
DB* db_;
};
} // namespace rocksdb
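
A minimal sketch (editorial illustration, not part of the header) of stacking a wrapper on top of a DB: CountingDB is a hypothetical subclass that intercepts Put() and lets StackableDB forward every other operation to the wrapped DB.

#include <atomic>
#include <cstdint>
#include "rocksdb/utilities/stackable_db.h"

// Hypothetical wrapper: counts Put() calls, forwards everything else unchanged.
class CountingDB : public rocksdb::StackableDB {
 public:
  // CountingDB (via StackableDB) takes ownership of `db`.
  explicit CountingDB(rocksdb::DB* db) : rocksdb::StackableDB(db) {}
  using rocksdb::StackableDB::Put;
  virtual rocksdb::Status Put(const rocksdb::WriteOptions& options,
                              rocksdb::ColumnFamilyHandle* column_family,
                              const rocksdb::Slice& key,
                              const rocksdb::Slice& val) override {
    puts_.fetch_add(1, std::memory_order_relaxed);
    return rocksdb::StackableDB::Put(options, column_family, key, val);
  }
  uint64_t put_count() const { return puts_.load(std::memory_order_relaxed); }
 private:
  std::atomic<uint64_t> puts_{0};
};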

View File

@@ -0,0 +1,30 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#ifndef ROCKSDB_LITE
#include <vector>
#include <string>
#include "rocksdb/utilities/stackable_db.h"
#include "rocksdb/utilities/db_ttl.h"
#include "rocksdb/db.h"
namespace rocksdb {
// Please don't use this class. It's deprecated
class UtilityDB {
public:
// This function is here only for backwards compatibility. Please use the
// functions defined in DBWithTTL (rocksdb/utilities/db_ttl.h)
// (deprecated)
__attribute__((deprecated)) static Status OpenTtlDB(const Options& options,
const std::string& name,
StackableDB** dbptr,
int32_t ttl = 0,
bool read_only = false);
};
} // namespace rocksdb
#endif // ROCKSDB_LITE

17
include/rocksdb/version.h Normal file
View File

@@ -0,0 +1,17 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
// Also update Makefile if you change these
#define ROCKSDB_MAJOR 3
#define ROCKSDB_MINOR 4
#define ROCKSDB_PATCH 0
// Do not use these. We made the mistake of declaring macros starting with
// double underscore. Now we have to live with our choice. We'll deprecate these
// at some point
#define __ROCKSDB_MAJOR__ ROCKSDB_MAJOR
#define __ROCKSDB_MINOR__ ROCKSDB_MINOR
#define __ROCKSDB_PATCH__ ROCKSDB_PATCH

View File

@@ -0,0 +1,162 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// WriteBatch holds a collection of updates to apply atomically to a DB.
//
// The updates are applied in the order in which they are added
// to the WriteBatch. For example, the value of "key" will be "v3"
// after the following batch is written:
//
// batch.Put("key", "v1");
// batch.Delete("key");
// batch.Put("key", "v2");
// batch.Put("key", "v3");
//
// Multiple threads can invoke const methods on a WriteBatch without
// external synchronization, but if any of the threads may call a
// non-const method, all threads accessing the same WriteBatch must use
// external synchronization.
#ifndef STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_
#define STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_
#include <string>
#include "rocksdb/status.h"
namespace rocksdb {
class Slice;
class ColumnFamilyHandle;
struct SliceParts;
class WriteBatch {
public:
explicit WriteBatch(size_t reserved_bytes = 0);
~WriteBatch();
// Store the mapping "key->value" in the database.
void Put(ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value);
void Put(const Slice& key, const Slice& value) {
Put(nullptr, key, value);
}
// Variant of Put() that gathers output like writev(2). The key and value
// that will be written to the database are concatenations of arrays of
// slices.
void Put(ColumnFamilyHandle* column_family, const SliceParts& key,
const SliceParts& value);
void Put(const SliceParts& key, const SliceParts& value) {
Put(nullptr, key, value);
}
// Merge "value" with the existing value of "key" in the database.
// "key->merge(existing, value)"
void Merge(ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value);
void Merge(const Slice& key, const Slice& value) {
Merge(nullptr, key, value);
}
// If the database contains a mapping for "key", erase it. Else do nothing.
void Delete(ColumnFamilyHandle* column_family, const Slice& key);
void Delete(const Slice& key) { Delete(nullptr, key); }
// variant that takes SliceParts
void Delete(ColumnFamilyHandle* column_family, const SliceParts& key);
void Delete(const SliceParts& key) { Delete(nullptr, key); }
// Append a blob of arbitrary size to the records in this batch. The blob will
// be stored in the transaction log but not in any other file. In particular,
// it will not be persisted to the SST files. When iterating over this
// WriteBatch, WriteBatch::Handler::LogData will be called with the contents
// of the blob as it is encountered. Blobs, puts, deletes, and merges will be
// encountered in the same order in which they were inserted. The blob will
// NOT consume sequence number(s) and will NOT increase the count of the batch
//
// Example application: add timestamps to the transaction log for use in
// replication.
void PutLogData(const Slice& blob);
// Clear all updates buffered in this batch.
void Clear();
// Support for iterating over the contents of a batch.
class Handler {
public:
virtual ~Handler();
// The default implementation just calls Put() without a column family, for
// backwards compatibility. If the column family is not the default one,
// it returns Status::InvalidArgument()
virtual Status PutCF(uint32_t column_family_id, const Slice& key,
const Slice& value) {
if (column_family_id == 0) {
// Put() historically doesn't return status. We didn't want to be
// backwards incompatible so we didn't change the return status
// (this is a public API). We just call Put() and return Status::OK()
Put(key, value);
return Status::OK();
}
return Status::InvalidArgument(
"non-default column family and PutCF not implemented");
}
virtual void Put(const Slice& key, const Slice& value);
// Merge and LogData are not pure virtual. Otherwise, we would break
// existing clients of Handler on a source code level. The default
// implementation of Merge simply throws a runtime exception.
virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
const Slice& value) {
if (column_family_id == 0) {
Merge(key, value);
return Status::OK();
}
return Status::InvalidArgument(
"non-default column family and MergeCF not implemented");
}
virtual void Merge(const Slice& key, const Slice& value);
// The default implementation of LogData does nothing.
virtual void LogData(const Slice& blob);
virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) {
if (column_family_id == 0) {
Delete(key);
return Status::OK();
}
return Status::InvalidArgument(
"non-default column family and DeleteCF not implemented");
}
virtual void Delete(const Slice& key);
// Continue is called by WriteBatch::Iterate. If it returns false,
// iteration is halted. Otherwise, it continues iterating. The default
// implementation always returns true.
virtual bool Continue();
};
Status Iterate(Handler* handler) const;
// Retrieve the serialized version of this batch.
const std::string& Data() const { return rep_; }
// Retrieve data size of the batch.
size_t GetDataSize() const { return rep_.size(); }
// Returns the number of updates in the batch
int Count() const;
// Constructor with a serialized string object
explicit WriteBatch(std::string rep): rep_(rep) {}
private:
friend class WriteBatchInternal;
std::string rep_; // See comment in write_batch.cc for the format of rep_
// Intentionally copyable
};
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_
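
A minimal sketch (editorial illustration, not part of the header) of the Handler-based iteration described above; KeyCollector is a hypothetical handler that records which keys a batch touches.

#include <string>
#include <vector>
#include "rocksdb/slice.h"
#include "rocksdb/write_batch.h"

// Hypothetical handler: collects the keys touched by Put and Delete records.
class KeyCollector : public rocksdb::WriteBatch::Handler {
 public:
  virtual void Put(const rocksdb::Slice& key, const rocksdb::Slice& value) {
    keys.push_back(key.ToString());
  }
  virtual void Delete(const rocksdb::Slice& key) {
    keys.push_back(key.ToString());
  }
  std::vector<std::string> keys;
};

// Illustrative usage: after batch.Put("a", "1") and batch.Delete("b"),
// CollectKeys(batch) returns {"a", "b"}.
std::vector<std::string> CollectKeys(const rocksdb::WriteBatch& batch) {
  KeyCollector collector;
  rocksdb::Status s = batch.Iterate(&collector);
  (void)s;  // status ignored in this sketch
  return collector.keys;
}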

View File

@@ -0,0 +1,12 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#warning This file was moved to rocksdb/utilities/backupable_db.h
#include "rocksdb/utilities/backupable_db.h"

View File

@@ -0,0 +1,8 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#warning This file was moved to rocksdb/utilities/db_ttl.h
#include "rocksdb/utilities/db_ttl.h"

View File

@@ -0,0 +1,8 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#warning This file was moved to rocksdb/utilities/document_db.h
#include "rocksdb/utilities/document_db.h"

View File

@@ -0,0 +1,8 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#warning This file was moved to rocksdb/utilities/geo_db.h
#include "rocksdb/utilities/geo_db.h"

View File

@@ -0,0 +1,7 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#warning This file was moved to rocksdb/utilities/json_document.h
#include "rocksdb/utilities/json_document.h"

View File

@@ -0,0 +1,7 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#warning This file was moved to rocksdb/utilities/stackable_db.h
#include "rocksdb/utilities/stackable_db.h"

View File

@@ -0,0 +1,7 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#warning This file was moved to rocksdb/utilities/utility_db.h
#include "rocksdb/utilities/utility_db.h"