/*
 * Mirror of https://github.com/Xahau/xahaud.git (synced 2025-12-06 17:27:52 +00:00).
 * Squashed 'src/rocksdb/' content from commit 224932d.
 * git-subtree-dir: src/rocksdb
 * git-subtree-split: 224932d4d0b561712107d747c662df181c39644d
 * New file: include/rocksdb/c.h (788 lines).
 */
/* Copyright (c) 2013, Facebook, Inc.  All rights reserved.
   This source code is licensed under the BSD-style license found in the
   LICENSE file in the root directory of this source tree. An additional grant
   of patent rights can be found in the PATENTS file in the same directory.
   Copyright (c) 2011 The LevelDB Authors. All rights reserved.
   Use of this source code is governed by a BSD-style license that can be
   found in the LICENSE file. See the AUTHORS file for names of contributors.

  C bindings for rocksdb.  May be useful as a stable ABI that can be
  used by programs that keep rocksdb in a shared library, or for
  a JNI api.

  Does not support:
  . getters for the option types
  . custom comparators that implement key shortening
  . capturing post-write-snapshot
  . custom iter, db, env, cache implementations using just the C bindings

  Some conventions:

  (1) We expose just opaque struct pointers and functions to clients.
      This allows us to change internal representations without having to
      recompile clients.

  (2) For simplicity, there is no equivalent to the Slice type.  Instead,
      the caller has to pass the pointer and length as separate
      arguments.

  (3) Errors are represented by a null-terminated c string.  NULL
      means no error.  All operations that can raise an error are passed
      a "char** errptr" as the last argument.  One of the following must
      be true on entry:
         *errptr == NULL
         *errptr points to a malloc()ed null-terminated error message
      On success, a leveldb routine leaves *errptr unchanged.
      On failure, leveldb frees the old value of *errptr and
      sets *errptr to a malloc()ed error message.

  (4) Bools have the type unsigned char (0 == false; rest == true)

  (5) All of the pointer arguments must be non-NULL.
*/
#ifndef STORAGE_ROCKSDB_INCLUDE_C_H_
#define STORAGE_ROCKSDB_INCLUDE_C_H_

#ifdef __cplusplus
extern "C" {
#endif

#include <stdarg.h>
#include <stddef.h>
#include <stdint.h>
/* Exported types */

/* All of the types below are opaque handles: clients only ever hold
   pointers to them and interact with them through the functions declared
   in this header, so internal representations can change without
   recompiling clients. */
typedef struct rocksdb_t rocksdb_t;
typedef struct rocksdb_cache_t rocksdb_cache_t;
typedef struct rocksdb_compactionfilter_t rocksdb_compactionfilter_t;
typedef struct rocksdb_compactionfiltercontext_t
    rocksdb_compactionfiltercontext_t;
typedef struct rocksdb_compactionfilterfactory_t
    rocksdb_compactionfilterfactory_t;
typedef struct rocksdb_compactionfilterv2_t
    rocksdb_compactionfilterv2_t;
typedef struct rocksdb_compactionfilterfactoryv2_t
    rocksdb_compactionfilterfactoryv2_t;
typedef struct rocksdb_comparator_t rocksdb_comparator_t;
typedef struct rocksdb_env_t rocksdb_env_t;
typedef struct rocksdb_fifo_compaction_options_t rocksdb_fifo_compaction_options_t;
typedef struct rocksdb_filelock_t rocksdb_filelock_t;
typedef struct rocksdb_filterpolicy_t rocksdb_filterpolicy_t;
typedef struct rocksdb_flushoptions_t rocksdb_flushoptions_t;
typedef struct rocksdb_iterator_t rocksdb_iterator_t;
typedef struct rocksdb_logger_t rocksdb_logger_t;
typedef struct rocksdb_mergeoperator_t rocksdb_mergeoperator_t;
typedef struct rocksdb_options_t rocksdb_options_t;
typedef struct rocksdb_randomfile_t rocksdb_randomfile_t;
typedef struct rocksdb_readoptions_t rocksdb_readoptions_t;
typedef struct rocksdb_seqfile_t rocksdb_seqfile_t;
typedef struct rocksdb_slicetransform_t rocksdb_slicetransform_t;
typedef struct rocksdb_snapshot_t rocksdb_snapshot_t;
typedef struct rocksdb_writablefile_t rocksdb_writablefile_t;
typedef struct rocksdb_writebatch_t rocksdb_writebatch_t;
typedef struct rocksdb_writeoptions_t rocksdb_writeoptions_t;
typedef struct rocksdb_universal_compaction_options_t rocksdb_universal_compaction_options_t;
typedef struct rocksdb_livefiles_t rocksdb_livefiles_t;
typedef struct rocksdb_column_family_handle_t rocksdb_column_family_handle_t;
/* DB operations */
|
||||
|
||||
extern rocksdb_t* rocksdb_open(
|
||||
const rocksdb_options_t* options,
|
||||
const char* name,
|
||||
char** errptr);
|
||||
|
||||
extern rocksdb_t* rocksdb_open_for_read_only(
|
||||
const rocksdb_options_t* options,
|
||||
const char* name,
|
||||
unsigned char error_if_log_file_exist,
|
||||
char** errptr);
|
||||
|
||||
extern rocksdb_t* rocksdb_open_column_families(
|
||||
const rocksdb_options_t* options,
|
||||
const char* name,
|
||||
int num_column_families,
|
||||
const char** column_family_names,
|
||||
const rocksdb_options_t** column_family_options,
|
||||
rocksdb_column_family_handle_t** column_family_handles,
|
||||
char** errptr);
|
||||
|
||||
extern rocksdb_t* rocksdb_open_for_read_only_column_families(
|
||||
const rocksdb_options_t* options,
|
||||
const char* name,
|
||||
int num_column_families,
|
||||
const char** column_family_names,
|
||||
const rocksdb_options_t** column_family_options,
|
||||
rocksdb_column_family_handle_t** column_family_handles,
|
||||
unsigned char error_if_log_file_exist,
|
||||
char** errptr);
|
||||
|
||||
char** rocksdb_list_column_families(
|
||||
const rocksdb_options_t* options,
|
||||
const char* name,
|
||||
size_t* lencf,
|
||||
char** errptr);
|
||||
void rocksdb_list_column_families_destroy(char** list, size_t len);
|
||||
|
||||
extern rocksdb_column_family_handle_t* rocksdb_create_column_family(
|
||||
rocksdb_t* db,
|
||||
const rocksdb_options_t* column_family_options,
|
||||
const char* column_family_name,
|
||||
char** errptr);
|
||||
|
||||
extern void rocksdb_drop_column_family(
|
||||
rocksdb_t* db,
|
||||
rocksdb_column_family_handle_t* handle,
|
||||
char** errptr);
|
||||
|
||||
extern void rocksdb_column_family_handle_destroy(rocksdb_column_family_handle_t*);
|
||||
|
||||
extern void rocksdb_close(rocksdb_t* db);
|
||||
|
||||
extern void rocksdb_put(
|
||||
rocksdb_t* db,
|
||||
const rocksdb_writeoptions_t* options,
|
||||
const char* key, size_t keylen,
|
||||
const char* val, size_t vallen,
|
||||
char** errptr);
|
||||
|
||||
extern void rocksdb_put_cf(
|
||||
rocksdb_t* db,
|
||||
const rocksdb_writeoptions_t* options,
|
||||
rocksdb_column_family_handle_t* column_family,
|
||||
const char* key, size_t keylen,
|
||||
const char* val, size_t vallen,
|
||||
char** errptr);
|
||||
|
||||
extern void rocksdb_delete(
|
||||
rocksdb_t* db,
|
||||
const rocksdb_writeoptions_t* options,
|
||||
const char* key, size_t keylen,
|
||||
char** errptr);
|
||||
|
||||
void rocksdb_delete_cf(
|
||||
rocksdb_t* db,
|
||||
const rocksdb_writeoptions_t* options,
|
||||
rocksdb_column_family_handle_t* column_family,
|
||||
const char* key, size_t keylen,
|
||||
char** errptr);
|
||||
|
||||
extern void rocksdb_merge(
|
||||
rocksdb_t* db,
|
||||
const rocksdb_writeoptions_t* options,
|
||||
const char* key, size_t keylen,
|
||||
const char* val, size_t vallen,
|
||||
char** errptr);
|
||||
|
||||
extern void rocksdb_merge_cf(
|
||||
rocksdb_t* db,
|
||||
const rocksdb_writeoptions_t* options,
|
||||
rocksdb_column_family_handle_t* column_family,
|
||||
const char* key, size_t keylen,
|
||||
const char* val, size_t vallen,
|
||||
char** errptr);
|
||||
|
||||
extern void rocksdb_write(
|
||||
rocksdb_t* db,
|
||||
const rocksdb_writeoptions_t* options,
|
||||
rocksdb_writebatch_t* batch,
|
||||
char** errptr);
|
||||
|
||||
/* Returns NULL if not found. A malloc()ed array otherwise.
|
||||
Stores the length of the array in *vallen. */
|
||||
extern char* rocksdb_get(
|
||||
rocksdb_t* db,
|
||||
const rocksdb_readoptions_t* options,
|
||||
const char* key, size_t keylen,
|
||||
size_t* vallen,
|
||||
char** errptr);
|
||||
|
||||
extern char* rocksdb_get_cf(
|
||||
rocksdb_t* db,
|
||||
const rocksdb_readoptions_t* options,
|
||||
rocksdb_column_family_handle_t* column_family,
|
||||
const char* key, size_t keylen,
|
||||
size_t* vallen,
|
||||
char** errptr);
|
||||
|
||||
extern rocksdb_iterator_t* rocksdb_create_iterator(
|
||||
rocksdb_t* db,
|
||||
const rocksdb_readoptions_t* options);
|
||||
|
||||
extern rocksdb_iterator_t* rocksdb_create_iterator_cf(
|
||||
rocksdb_t* db,
|
||||
const rocksdb_readoptions_t* options,
|
||||
rocksdb_column_family_handle_t* column_family);
|
||||
|
||||
extern const rocksdb_snapshot_t* rocksdb_create_snapshot(
|
||||
rocksdb_t* db);
|
||||
|
||||
extern void rocksdb_release_snapshot(
|
||||
rocksdb_t* db,
|
||||
const rocksdb_snapshot_t* snapshot);
|
||||
|
||||
/* Returns NULL if property name is unknown.
|
||||
Else returns a pointer to a malloc()-ed null-terminated value. */
|
||||
extern char* rocksdb_property_value(
|
||||
rocksdb_t* db,
|
||||
const char* propname);
|
||||
|
||||
extern char* rocksdb_property_value_cf(
|
||||
rocksdb_t* db,
|
||||
rocksdb_column_family_handle_t* column_family,
|
||||
const char* propname);
|
||||
|
||||
extern void rocksdb_approximate_sizes(
|
||||
rocksdb_t* db,
|
||||
int num_ranges,
|
||||
const char* const* range_start_key, const size_t* range_start_key_len,
|
||||
const char* const* range_limit_key, const size_t* range_limit_key_len,
|
||||
uint64_t* sizes);
|
||||
|
||||
extern void rocksdb_approximate_sizes_cf(
|
||||
rocksdb_t* db,
|
||||
rocksdb_column_family_handle_t* column_family,
|
||||
int num_ranges,
|
||||
const char* const* range_start_key, const size_t* range_start_key_len,
|
||||
const char* const* range_limit_key, const size_t* range_limit_key_len,
|
||||
uint64_t* sizes);
|
||||
|
||||
extern void rocksdb_compact_range(
|
||||
rocksdb_t* db,
|
||||
const char* start_key, size_t start_key_len,
|
||||
const char* limit_key, size_t limit_key_len);
|
||||
|
||||
extern void rocksdb_compact_range_cf(
|
||||
rocksdb_t* db,
|
||||
rocksdb_column_family_handle_t* column_family,
|
||||
const char* start_key, size_t start_key_len,
|
||||
const char* limit_key, size_t limit_key_len);
|
||||
|
||||
extern void rocksdb_delete_file(
|
||||
rocksdb_t* db,
|
||||
const char* name);
|
||||
|
||||
extern const rocksdb_livefiles_t* rocksdb_livefiles(
|
||||
rocksdb_t* db);
|
||||
|
||||
extern void rocksdb_flush(
|
||||
rocksdb_t* db,
|
||||
const rocksdb_flushoptions_t* options,
|
||||
char** errptr);
|
||||
|
||||
extern void rocksdb_disable_file_deletions(
|
||||
rocksdb_t* db,
|
||||
char** errptr);
|
||||
|
||||
extern void rocksdb_enable_file_deletions(
|
||||
rocksdb_t* db,
|
||||
unsigned char force,
|
||||
char** errptr);
|
||||
|
||||
/* Management operations */
|
||||
|
||||
extern void rocksdb_destroy_db(
|
||||
const rocksdb_options_t* options,
|
||||
const char* name,
|
||||
char** errptr);
|
||||
|
||||
extern void rocksdb_repair_db(
|
||||
const rocksdb_options_t* options,
|
||||
const char* name,
|
||||
char** errptr);
|
||||
|
||||
/* Iterator */
|
||||
|
||||
extern void rocksdb_iter_destroy(rocksdb_iterator_t*);
|
||||
extern unsigned char rocksdb_iter_valid(const rocksdb_iterator_t*);
|
||||
extern void rocksdb_iter_seek_to_first(rocksdb_iterator_t*);
|
||||
extern void rocksdb_iter_seek_to_last(rocksdb_iterator_t*);
|
||||
extern void rocksdb_iter_seek(rocksdb_iterator_t*, const char* k, size_t klen);
|
||||
extern void rocksdb_iter_next(rocksdb_iterator_t*);
|
||||
extern void rocksdb_iter_prev(rocksdb_iterator_t*);
|
||||
extern const char* rocksdb_iter_key(const rocksdb_iterator_t*, size_t* klen);
|
||||
extern const char* rocksdb_iter_value(const rocksdb_iterator_t*, size_t* vlen);
|
||||
extern void rocksdb_iter_get_error(const rocksdb_iterator_t*, char** errptr);
|
||||
|
||||
/* Write batch */
|
||||
|
||||
extern rocksdb_writebatch_t* rocksdb_writebatch_create();
|
||||
extern rocksdb_writebatch_t* rocksdb_writebatch_create_from(const char* rep,
|
||||
size_t size);
|
||||
extern void rocksdb_writebatch_destroy(rocksdb_writebatch_t*);
|
||||
extern void rocksdb_writebatch_clear(rocksdb_writebatch_t*);
|
||||
extern int rocksdb_writebatch_count(rocksdb_writebatch_t*);
|
||||
extern void rocksdb_writebatch_put(
|
||||
rocksdb_writebatch_t*,
|
||||
const char* key, size_t klen,
|
||||
const char* val, size_t vlen);
|
||||
extern void rocksdb_writebatch_put_cf(
|
||||
rocksdb_writebatch_t*,
|
||||
rocksdb_column_family_handle_t* column_family,
|
||||
const char* key, size_t klen,
|
||||
const char* val, size_t vlen);
|
||||
extern void rocksdb_writebatch_merge(
|
||||
rocksdb_writebatch_t*,
|
||||
const char* key, size_t klen,
|
||||
const char* val, size_t vlen);
|
||||
extern void rocksdb_writebatch_merge_cf(
|
||||
rocksdb_writebatch_t*,
|
||||
rocksdb_column_family_handle_t* column_family,
|
||||
const char* key, size_t klen,
|
||||
const char* val, size_t vlen);
|
||||
extern void rocksdb_writebatch_delete(
|
||||
rocksdb_writebatch_t*,
|
||||
const char* key, size_t klen);
|
||||
extern void rocksdb_writebatch_delete_cf(
|
||||
rocksdb_writebatch_t*,
|
||||
rocksdb_column_family_handle_t* column_family,
|
||||
const char* key, size_t klen);
|
||||
extern void rocksdb_writebatch_iterate(
|
||||
rocksdb_writebatch_t*,
|
||||
void* state,
|
||||
void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
|
||||
void (*deleted)(void*, const char* k, size_t klen));
|
||||
extern const char* rocksdb_writebatch_data(rocksdb_writebatch_t*, size_t *size);
|
||||
|
||||
/* Options */
|
||||
|
||||
extern rocksdb_options_t* rocksdb_options_create();
|
||||
extern void rocksdb_options_destroy(rocksdb_options_t*);
|
||||
extern void rocksdb_options_increase_parallelism(
|
||||
rocksdb_options_t* opt, int total_threads);
|
||||
extern void rocksdb_options_optimize_for_point_lookup(
|
||||
rocksdb_options_t* opt);
|
||||
extern void rocksdb_options_optimize_level_style_compaction(
|
||||
rocksdb_options_t* opt, uint64_t memtable_memory_budget);
|
||||
extern void rocksdb_options_optimize_universal_style_compaction(
|
||||
rocksdb_options_t* opt, uint64_t memtable_memory_budget);
|
||||
extern void rocksdb_options_set_compaction_filter(
|
||||
rocksdb_options_t*,
|
||||
rocksdb_compactionfilter_t*);
|
||||
extern void rocksdb_options_set_compaction_filter_factory(
|
||||
rocksdb_options_t*, rocksdb_compactionfilterfactory_t*);
|
||||
extern void rocksdb_options_set_compaction_filter_factory_v2(
|
||||
rocksdb_options_t*,
|
||||
rocksdb_compactionfilterfactoryv2_t*);
|
||||
extern void rocksdb_options_set_comparator(
|
||||
rocksdb_options_t*,
|
||||
rocksdb_comparator_t*);
|
||||
extern void rocksdb_options_set_merge_operator(
|
||||
rocksdb_options_t*,
|
||||
rocksdb_mergeoperator_t*);
|
||||
extern void rocksdb_options_set_compression_per_level(
|
||||
rocksdb_options_t* opt,
|
||||
int* level_values,
|
||||
size_t num_levels);
|
||||
extern void rocksdb_options_set_filter_policy(
|
||||
rocksdb_options_t*,
|
||||
rocksdb_filterpolicy_t*);
|
||||
extern void rocksdb_options_set_create_if_missing(
|
||||
rocksdb_options_t*, unsigned char);
|
||||
extern void rocksdb_options_set_create_missing_column_families(
|
||||
rocksdb_options_t*, unsigned char);
|
||||
extern void rocksdb_options_set_error_if_exists(
|
||||
rocksdb_options_t*, unsigned char);
|
||||
extern void rocksdb_options_set_paranoid_checks(
|
||||
rocksdb_options_t*, unsigned char);
|
||||
extern void rocksdb_options_set_env(rocksdb_options_t*, rocksdb_env_t*);
|
||||
extern void rocksdb_options_set_info_log(rocksdb_options_t*, rocksdb_logger_t*);
|
||||
extern void rocksdb_options_set_info_log_level(rocksdb_options_t*, int);
|
||||
extern void rocksdb_options_set_write_buffer_size(rocksdb_options_t*, size_t);
|
||||
extern void rocksdb_options_set_max_open_files(rocksdb_options_t*, int);
|
||||
extern void rocksdb_options_set_cache(rocksdb_options_t*, rocksdb_cache_t*);
|
||||
extern void rocksdb_options_set_cache_compressed(rocksdb_options_t*, rocksdb_cache_t*);
|
||||
extern void rocksdb_options_set_block_size(rocksdb_options_t*, size_t);
|
||||
extern void rocksdb_options_set_block_restart_interval(rocksdb_options_t*, int);
|
||||
extern void rocksdb_options_set_compression_options(
|
||||
rocksdb_options_t*, int, int, int);
|
||||
extern void rocksdb_options_set_whole_key_filtering(rocksdb_options_t*, unsigned char);
|
||||
extern void rocksdb_options_set_prefix_extractor(
|
||||
rocksdb_options_t*, rocksdb_slicetransform_t*);
|
||||
extern void rocksdb_options_set_num_levels(rocksdb_options_t*, int);
|
||||
extern void rocksdb_options_set_level0_file_num_compaction_trigger(
|
||||
rocksdb_options_t*, int);
|
||||
extern void rocksdb_options_set_level0_slowdown_writes_trigger(
|
||||
rocksdb_options_t*, int);
|
||||
extern void rocksdb_options_set_level0_stop_writes_trigger(
|
||||
rocksdb_options_t*, int);
|
||||
extern void rocksdb_options_set_max_mem_compaction_level(
|
||||
rocksdb_options_t*, int);
|
||||
extern void rocksdb_options_set_target_file_size_base(
|
||||
rocksdb_options_t*, uint64_t);
|
||||
extern void rocksdb_options_set_target_file_size_multiplier(
|
||||
rocksdb_options_t*, int);
|
||||
extern void rocksdb_options_set_max_bytes_for_level_base(
|
||||
rocksdb_options_t*, uint64_t);
|
||||
extern void rocksdb_options_set_max_bytes_for_level_multiplier(
|
||||
rocksdb_options_t*, int);
|
||||
extern void rocksdb_options_set_expanded_compaction_factor(
|
||||
rocksdb_options_t*, int);
|
||||
extern void rocksdb_options_set_max_grandparent_overlap_factor(
|
||||
rocksdb_options_t*, int);
|
||||
extern void rocksdb_options_set_max_bytes_for_level_multiplier_additional(
|
||||
rocksdb_options_t*, int* level_values, size_t num_levels);
|
||||
extern void rocksdb_options_enable_statistics(rocksdb_options_t*);
|
||||
|
||||
extern void rocksdb_options_set_max_write_buffer_number(rocksdb_options_t*, int);
|
||||
extern void rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t*, int);
|
||||
extern void rocksdb_options_set_max_background_compactions(rocksdb_options_t*, int);
|
||||
extern void rocksdb_options_set_max_background_flushes(rocksdb_options_t*, int);
|
||||
extern void rocksdb_options_set_max_log_file_size(rocksdb_options_t*, size_t);
|
||||
extern void rocksdb_options_set_log_file_time_to_roll(rocksdb_options_t*, size_t);
|
||||
extern void rocksdb_options_set_keep_log_file_num(rocksdb_options_t*, size_t);
|
||||
extern void rocksdb_options_set_soft_rate_limit(rocksdb_options_t*, double);
|
||||
extern void rocksdb_options_set_hard_rate_limit(rocksdb_options_t*, double);
|
||||
extern void rocksdb_options_set_rate_limit_delay_max_milliseconds(
|
||||
rocksdb_options_t*, unsigned int);
|
||||
extern void rocksdb_options_set_max_manifest_file_size(
|
||||
rocksdb_options_t*, size_t);
|
||||
extern void rocksdb_options_set_no_block_cache(
|
||||
rocksdb_options_t*, unsigned char);
|
||||
extern void rocksdb_options_set_table_cache_numshardbits(
|
||||
rocksdb_options_t*, int);
|
||||
extern void rocksdb_options_set_table_cache_remove_scan_count_limit(
|
||||
rocksdb_options_t*, int);
|
||||
extern void rocksdb_options_set_arena_block_size(
|
||||
rocksdb_options_t*, size_t);
|
||||
extern void rocksdb_options_set_use_fsync(
|
||||
rocksdb_options_t*, int);
|
||||
extern void rocksdb_options_set_db_stats_log_interval(
|
||||
rocksdb_options_t*, int);
|
||||
extern void rocksdb_options_set_db_log_dir(
|
||||
rocksdb_options_t*, const char*);
|
||||
extern void rocksdb_options_set_wal_dir(
|
||||
rocksdb_options_t*, const char*);
|
||||
extern void rocksdb_options_set_WAL_ttl_seconds(
|
||||
rocksdb_options_t*, uint64_t);
|
||||
extern void rocksdb_options_set_WAL_size_limit_MB(
|
||||
rocksdb_options_t*, uint64_t);
|
||||
extern void rocksdb_options_set_manifest_preallocation_size(
|
||||
rocksdb_options_t*, size_t);
|
||||
extern void rocksdb_options_set_purge_redundant_kvs_while_flush(
|
||||
rocksdb_options_t*, unsigned char);
|
||||
extern void rocksdb_options_set_allow_os_buffer(
|
||||
rocksdb_options_t*, unsigned char);
|
||||
extern void rocksdb_options_set_allow_mmap_reads(
|
||||
rocksdb_options_t*, unsigned char);
|
||||
extern void rocksdb_options_set_allow_mmap_writes(
|
||||
rocksdb_options_t*, unsigned char);
|
||||
extern void rocksdb_options_set_is_fd_close_on_exec(
|
||||
rocksdb_options_t*, unsigned char);
|
||||
extern void rocksdb_options_set_skip_log_error_on_recovery(
|
||||
rocksdb_options_t*, unsigned char);
|
||||
extern void rocksdb_options_set_stats_dump_period_sec(
|
||||
rocksdb_options_t*, unsigned int);
|
||||
extern void rocksdb_options_set_block_size_deviation(
|
||||
rocksdb_options_t*, int);
|
||||
extern void rocksdb_options_set_advise_random_on_open(
|
||||
rocksdb_options_t*, unsigned char);
|
||||
extern void rocksdb_options_set_access_hint_on_compaction_start(
|
||||
rocksdb_options_t*, int);
|
||||
extern void rocksdb_options_set_use_adaptive_mutex(
|
||||
rocksdb_options_t*, unsigned char);
|
||||
extern void rocksdb_options_set_bytes_per_sync(
|
||||
rocksdb_options_t*, uint64_t);
|
||||
extern void rocksdb_options_set_verify_checksums_in_compaction(
|
||||
rocksdb_options_t*, unsigned char);
|
||||
extern void rocksdb_options_set_filter_deletes(
|
||||
rocksdb_options_t*, unsigned char);
|
||||
extern void rocksdb_options_set_max_sequential_skip_in_iterations(
|
||||
rocksdb_options_t*, uint64_t);
|
||||
extern void rocksdb_options_set_disable_data_sync(rocksdb_options_t*, int);
|
||||
extern void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t*, int);
|
||||
extern void rocksdb_options_set_disable_seek_compaction(rocksdb_options_t*, int);
|
||||
extern void rocksdb_options_set_delete_obsolete_files_period_micros(
|
||||
rocksdb_options_t*, uint64_t);
|
||||
extern void rocksdb_options_set_source_compaction_factor(rocksdb_options_t*, int);
|
||||
extern void rocksdb_options_prepare_for_bulk_load(rocksdb_options_t*);
|
||||
extern void rocksdb_options_set_memtable_vector_rep(rocksdb_options_t*);
|
||||
extern void rocksdb_options_set_hash_skip_list_rep(rocksdb_options_t*, size_t, int32_t, int32_t);
|
||||
extern void rocksdb_options_set_hash_link_list_rep(rocksdb_options_t*, size_t);
|
||||
extern void rocksdb_options_set_plain_table_factory(rocksdb_options_t*, uint32_t, int, double, size_t);
|
||||
|
||||
extern void rocksdb_options_set_max_bytes_for_level_base(rocksdb_options_t* opt, uint64_t n);
|
||||
extern void rocksdb_options_set_stats_dump_period_sec(rocksdb_options_t* opt, unsigned int sec);
|
||||
|
||||
extern void rocksdb_options_set_min_level_to_compress(rocksdb_options_t* opt, int level);
|
||||
|
||||
extern void rocksdb_options_set_memtable_prefix_bloom_bits(
|
||||
rocksdb_options_t*, uint32_t);
|
||||
extern void rocksdb_options_set_memtable_prefix_bloom_probes(
|
||||
rocksdb_options_t*, uint32_t);
|
||||
extern void rocksdb_options_set_max_successive_merges(
|
||||
rocksdb_options_t*, size_t);
|
||||
extern void rocksdb_options_set_min_partial_merge_operands(
|
||||
rocksdb_options_t*, uint32_t);
|
||||
extern void rocksdb_options_set_bloom_locality(
|
||||
rocksdb_options_t*, uint32_t);
|
||||
extern void rocksdb_options_set_allow_thread_local(
|
||||
rocksdb_options_t*, unsigned char);
|
||||
extern void rocksdb_options_set_inplace_update_support(
|
||||
rocksdb_options_t*, unsigned char);
|
||||
extern void rocksdb_options_set_inplace_update_num_locks(
|
||||
rocksdb_options_t*, size_t);
|
||||
|
||||
enum {
|
||||
rocksdb_no_compression = 0,
|
||||
rocksdb_snappy_compression = 1,
|
||||
rocksdb_zlib_compression = 2,
|
||||
rocksdb_bz2_compression = 3,
|
||||
rocksdb_lz4_compression = 4,
|
||||
rocksdb_lz4hc_compression = 5
|
||||
};
|
||||
extern void rocksdb_options_set_compression(rocksdb_options_t*, int);
|
||||
|
||||
enum {
|
||||
rocksdb_level_compaction = 0,
|
||||
rocksdb_universal_compaction = 1,
|
||||
rocksdb_fifo_compaction = 2
|
||||
};
|
||||
extern void rocksdb_options_set_compaction_style(rocksdb_options_t*, int);
|
||||
extern void rocksdb_options_set_universal_compaction_options(rocksdb_options_t*, rocksdb_universal_compaction_options_t*);
|
||||
extern void rocksdb_options_set_fifo_compaction_options(rocksdb_options_t* opt,
|
||||
rocksdb_fifo_compaction_options_t* fifo);
|
||||
|
||||
/* Compaction Filter */
|
||||
|
||||
extern rocksdb_compactionfilter_t* rocksdb_compactionfilter_create(
|
||||
void* state,
|
||||
void (*destructor)(void*),
|
||||
unsigned char (*filter)(
|
||||
void*,
|
||||
int level,
|
||||
const char* key, size_t key_length,
|
||||
const char* existing_value, size_t value_length,
|
||||
char** new_value, size_t *new_value_length,
|
||||
unsigned char* value_changed),
|
||||
const char* (*name)(void*));
|
||||
extern void rocksdb_compactionfilter_destroy(rocksdb_compactionfilter_t*);
|
||||
|
||||
/* Compaction Filter Context */
|
||||
|
||||
extern unsigned char rocksdb_compactionfiltercontext_is_full_compaction(
|
||||
rocksdb_compactionfiltercontext_t* context);
|
||||
|
||||
extern unsigned char rocksdb_compactionfiltercontext_is_manual_compaction(
|
||||
rocksdb_compactionfiltercontext_t* context);
|
||||
|
||||
/* Compaction Filter Factory */
|
||||
|
||||
extern rocksdb_compactionfilterfactory_t*
|
||||
rocksdb_compactionfilterfactory_create(
|
||||
void* state, void (*destructor)(void*),
|
||||
rocksdb_compactionfilter_t* (*create_compaction_filter)(
|
||||
void*, rocksdb_compactionfiltercontext_t* context),
|
||||
const char* (*name)(void*));
|
||||
extern void rocksdb_compactionfilterfactory_destroy(
|
||||
rocksdb_compactionfilterfactory_t*);
|
||||
|
||||
/* Compaction Filter V2 */
|
||||
|
||||
extern rocksdb_compactionfilterv2_t* rocksdb_compactionfilterv2_create(
|
||||
void* state,
|
||||
void (*destructor)(void*),
|
||||
// num_keys specifies the number of array entries in every *list parameter.
|
||||
// New values added to the new_values_list should be malloc'd and will be
|
||||
// freed by the caller. Specify true in the to_delete_list to remove an
|
||||
// entry during compaction; false to keep it.
|
||||
void (*filter)(
|
||||
void*, int level, size_t num_keys,
|
||||
const char* const* keys_list, const size_t* keys_list_sizes,
|
||||
const char* const* existing_values_list, const size_t* existing_values_list_sizes,
|
||||
char** new_values_list, size_t* new_values_list_sizes,
|
||||
unsigned char* to_delete_list),
|
||||
const char* (*name)(void*));
|
||||
extern void rocksdb_compactionfilterv2_destroy(rocksdb_compactionfilterv2_t*);
|
||||
|
||||
/* Compaction Filter Factory V2 */
|
||||
|
||||
extern rocksdb_compactionfilterfactoryv2_t* rocksdb_compactionfilterfactoryv2_create(
|
||||
void* state,
|
||||
rocksdb_slicetransform_t* prefix_extractor,
|
||||
void (*destructor)(void*),
|
||||
rocksdb_compactionfilterv2_t* (*create_compaction_filter_v2)(
|
||||
void*, const rocksdb_compactionfiltercontext_t* context),
|
||||
const char* (*name)(void*));
|
||||
extern void rocksdb_compactionfilterfactoryv2_destroy(rocksdb_compactionfilterfactoryv2_t*);
|
||||
|
||||
/* Comparator */
|
||||
|
||||
extern rocksdb_comparator_t* rocksdb_comparator_create(
|
||||
void* state,
|
||||
void (*destructor)(void*),
|
||||
int (*compare)(
|
||||
void*,
|
||||
const char* a, size_t alen,
|
||||
const char* b, size_t blen),
|
||||
const char* (*name)(void*));
|
||||
extern void rocksdb_comparator_destroy(rocksdb_comparator_t*);
|
||||
|
||||
/* Filter policy */
|
||||
|
||||
extern rocksdb_filterpolicy_t* rocksdb_filterpolicy_create(
|
||||
void* state,
|
||||
void (*destructor)(void*),
|
||||
char* (*create_filter)(
|
||||
void*,
|
||||
const char* const* key_array, const size_t* key_length_array,
|
||||
int num_keys,
|
||||
size_t* filter_length),
|
||||
unsigned char (*key_may_match)(
|
||||
void*,
|
||||
const char* key, size_t length,
|
||||
const char* filter, size_t filter_length),
|
||||
void (*delete_filter)(
|
||||
void*,
|
||||
const char* filter, size_t filter_length),
|
||||
const char* (*name)(void*));
|
||||
extern void rocksdb_filterpolicy_destroy(rocksdb_filterpolicy_t*);
|
||||
|
||||
extern rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom(
|
||||
int bits_per_key);
|
||||
|
||||
/* Merge Operator */
|
||||
|
||||
extern rocksdb_mergeoperator_t* rocksdb_mergeoperator_create(
|
||||
void* state,
|
||||
void (*destructor)(void*),
|
||||
char* (*full_merge)(
|
||||
void*,
|
||||
const char* key, size_t key_length,
|
||||
const char* existing_value, size_t existing_value_length,
|
||||
const char* const* operands_list, const size_t* operands_list_length,
|
||||
int num_operands,
|
||||
unsigned char* success, size_t* new_value_length),
|
||||
char* (*partial_merge)(
|
||||
void*,
|
||||
const char* key, size_t key_length,
|
||||
const char* const* operands_list, const size_t* operands_list_length,
|
||||
int num_operands,
|
||||
unsigned char* success, size_t* new_value_length),
|
||||
void (*delete_value)(
|
||||
void*,
|
||||
const char* value, size_t value_length),
|
||||
const char* (*name)(void*));
|
||||
extern void rocksdb_mergeoperator_destroy(rocksdb_mergeoperator_t*);
|
||||
|
||||
/* Read options */
|
||||
|
||||
extern rocksdb_readoptions_t* rocksdb_readoptions_create();
|
||||
extern void rocksdb_readoptions_destroy(rocksdb_readoptions_t*);
|
||||
extern void rocksdb_readoptions_set_verify_checksums(
|
||||
rocksdb_readoptions_t*,
|
||||
unsigned char);
|
||||
extern void rocksdb_readoptions_set_fill_cache(
|
||||
rocksdb_readoptions_t*, unsigned char);
|
||||
extern void rocksdb_readoptions_set_snapshot(
|
||||
rocksdb_readoptions_t*,
|
||||
const rocksdb_snapshot_t*);
|
||||
extern void rocksdb_readoptions_set_read_tier(
|
||||
rocksdb_readoptions_t*, int);
|
||||
extern void rocksdb_readoptions_set_tailing(
|
||||
rocksdb_readoptions_t*, unsigned char);
|
||||
|
||||
/* Write options */
|
||||
|
||||
extern rocksdb_writeoptions_t* rocksdb_writeoptions_create();
|
||||
extern void rocksdb_writeoptions_destroy(rocksdb_writeoptions_t*);
|
||||
extern void rocksdb_writeoptions_set_sync(
|
||||
rocksdb_writeoptions_t*, unsigned char);
|
||||
extern void rocksdb_writeoptions_disable_WAL(rocksdb_writeoptions_t* opt, int disable);
|
||||
|
||||
/* Flush options */
|
||||
|
||||
extern rocksdb_flushoptions_t* rocksdb_flushoptions_create();
|
||||
extern void rocksdb_flushoptions_destroy(rocksdb_flushoptions_t*);
|
||||
extern void rocksdb_flushoptions_set_wait(
|
||||
rocksdb_flushoptions_t*, unsigned char);
|
||||
|
||||
/* Cache */
|
||||
|
||||
extern rocksdb_cache_t* rocksdb_cache_create_lru(size_t capacity);
|
||||
extern void rocksdb_cache_destroy(rocksdb_cache_t* cache);
|
||||
|
||||
/* Env */
|
||||
|
||||
extern rocksdb_env_t* rocksdb_create_default_env();
|
||||
extern void rocksdb_env_set_background_threads(rocksdb_env_t* env, int n);
|
||||
extern void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n);
|
||||
extern void rocksdb_env_destroy(rocksdb_env_t*);
|
||||
|
||||
/* SliceTransform */
|
||||
|
||||
extern rocksdb_slicetransform_t* rocksdb_slicetransform_create(
|
||||
void* state,
|
||||
void (*destructor)(void*),
|
||||
char* (*transform)(
|
||||
void*,
|
||||
const char* key, size_t length,
|
||||
size_t* dst_length),
|
||||
unsigned char (*in_domain)(
|
||||
void*,
|
||||
const char* key, size_t length),
|
||||
unsigned char (*in_range)(
|
||||
void*,
|
||||
const char* key, size_t length),
|
||||
const char* (*name)(void*));
|
||||
extern rocksdb_slicetransform_t* rocksdb_slicetransform_create_fixed_prefix(size_t);
|
||||
extern void rocksdb_slicetransform_destroy(rocksdb_slicetransform_t*);
|
||||
|
||||
/* Universal Compaction options */
|
||||
|
||||
enum {
|
||||
rocksdb_similar_size_compaction_stop_style = 0,
|
||||
rocksdb_total_size_compaction_stop_style = 1
|
||||
};
|
||||
|
||||
extern rocksdb_universal_compaction_options_t* rocksdb_universal_compaction_options_create() ;
|
||||
extern void rocksdb_universal_compaction_options_set_size_ratio(
|
||||
rocksdb_universal_compaction_options_t*, int);
|
||||
extern void rocksdb_universal_compaction_options_set_min_merge_width(
|
||||
rocksdb_universal_compaction_options_t*, int);
|
||||
extern void rocksdb_universal_compaction_options_set_max_merge_width(
|
||||
rocksdb_universal_compaction_options_t*, int);
|
||||
extern void rocksdb_universal_compaction_options_set_max_size_amplification_percent(
|
||||
rocksdb_universal_compaction_options_t*, int);
|
||||
extern void rocksdb_universal_compaction_options_set_compression_size_percent(
|
||||
rocksdb_universal_compaction_options_t*, int);
|
||||
extern void rocksdb_universal_compaction_options_set_stop_style(
|
||||
rocksdb_universal_compaction_options_t*, int);
|
||||
extern void rocksdb_universal_compaction_options_destroy(
|
||||
rocksdb_universal_compaction_options_t*);
|
||||
|
||||
extern rocksdb_fifo_compaction_options_t* rocksdb_fifo_compaction_options_create();
|
||||
extern void rocksdb_fifo_compaction_options_set_max_table_files_size(
|
||||
rocksdb_fifo_compaction_options_t* fifo_opts, uint64_t size);
|
||||
extern void rocksdb_fifo_compaction_options_destroy(
|
||||
rocksdb_fifo_compaction_options_t* fifo_opts);
|
||||
|
||||
extern int rocksdb_livefiles_count(
|
||||
const rocksdb_livefiles_t*);
|
||||
extern const char* rocksdb_livefiles_name(
|
||||
const rocksdb_livefiles_t*,
|
||||
int index);
|
||||
extern int rocksdb_livefiles_level(
|
||||
const rocksdb_livefiles_t*,
|
||||
int index);
|
||||
extern size_t rocksdb_livefiles_size(
|
||||
const rocksdb_livefiles_t*,
|
||||
int index);
|
||||
extern const char* rocksdb_livefiles_smallestkey(
|
||||
const rocksdb_livefiles_t*,
|
||||
int index,
|
||||
size_t* size);
|
||||
extern const char* rocksdb_livefiles_largestkey(
|
||||
const rocksdb_livefiles_t*,
|
||||
int index,
|
||||
size_t* size);
|
||||
extern void rocksdb_livefiles_destroy(
|
||||
const rocksdb_livefiles_t*);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* end extern "C" */
|
||||
#endif
|
||||
|
||||
#endif /* STORAGE_ROCKSDB_INCLUDE_C_H_ */
|
||||
140
include/rocksdb/cache.h
Normal file
140
include/rocksdb/cache.h
Normal file
@@ -0,0 +1,140 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// A Cache is an interface that maps keys to values. It has internal
|
||||
// synchronization and may be safely accessed concurrently from
|
||||
// multiple threads. It may automatically evict entries to make room
|
||||
// for new entries. Values have a specified charge against the cache
|
||||
// capacity. For example, a cache where the values are variable
|
||||
// length strings, may use the length of the string as the charge for
|
||||
// the string.
|
||||
//
|
||||
// A builtin cache implementation with a least-recently-used eviction
|
||||
// policy is provided. Clients may use their own implementations if
|
||||
// they want something more sophisticated (like scan-resistance, a
|
||||
// custom eviction policy, variable cache sizing, etc.)
|
||||
|
||||
#ifndef STORAGE_ROCKSDB_INCLUDE_CACHE_H_
|
||||
#define STORAGE_ROCKSDB_INCLUDE_CACHE_H_
|
||||
|
||||
#include <memory>
|
||||
#include <stdint.h>
|
||||
#include "rocksdb/slice.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
using std::shared_ptr;
|
||||
|
||||
class Cache;
|
||||
|
||||
// Create a new cache with a fixed size capacity. The cache is sharded
|
||||
// to 2^numShardBits shards, by hash of the key. The total capacity
|
||||
// is divided and evenly assigned to each shard. Inside each shard,
|
||||
// the eviction is done in two passes: first try to free spaces by
|
||||
// evicting entries that are among the most least used removeScanCountLimit
|
||||
// entries and do not have reference other than by the cache itself, in
|
||||
// the least-used order. If not enough space is freed, further free the
|
||||
// entries in least used order.
|
||||
//
|
||||
// The functions without parameter numShardBits and/or removeScanCountLimit
|
||||
// use default values. removeScanCountLimit's default value is 0, which
|
||||
// means a strict LRU order inside each shard.
|
||||
extern shared_ptr<Cache> NewLRUCache(size_t capacity);
|
||||
extern shared_ptr<Cache> NewLRUCache(size_t capacity, int numShardBits);
|
||||
extern shared_ptr<Cache> NewLRUCache(size_t capacity, int numShardBits,
|
||||
int removeScanCountLimit);
|
||||
|
||||
class Cache {
|
||||
public:
|
||||
Cache() { }
|
||||
|
||||
// Destroys all existing entries by calling the "deleter"
|
||||
// function that was passed to the constructor.
|
||||
virtual ~Cache();
|
||||
|
||||
// Opaque handle to an entry stored in the cache.
|
||||
struct Handle { };
|
||||
|
||||
// Insert a mapping from key->value into the cache and assign it
|
||||
// the specified charge against the total cache capacity.
|
||||
//
|
||||
// Returns a handle that corresponds to the mapping. The caller
|
||||
// must call this->Release(handle) when the returned mapping is no
|
||||
// longer needed.
|
||||
//
|
||||
// When the inserted entry is no longer needed, the key and
|
||||
// value will be passed to "deleter".
|
||||
virtual Handle* Insert(const Slice& key, void* value, size_t charge,
|
||||
void (*deleter)(const Slice& key, void* value)) = 0;
|
||||
|
||||
// If the cache has no mapping for "key", returns nullptr.
|
||||
//
|
||||
// Else return a handle that corresponds to the mapping. The caller
|
||||
// must call this->Release(handle) when the returned mapping is no
|
||||
// longer needed.
|
||||
virtual Handle* Lookup(const Slice& key) = 0;
|
||||
|
||||
// Release a mapping returned by a previous Lookup().
|
||||
// REQUIRES: handle must not have been released yet.
|
||||
// REQUIRES: handle must have been returned by a method on *this.
|
||||
virtual void Release(Handle* handle) = 0;
|
||||
|
||||
// Return the value encapsulated in a handle returned by a
|
||||
// successful Lookup().
|
||||
// REQUIRES: handle must not have been released yet.
|
||||
// REQUIRES: handle must have been returned by a method on *this.
|
||||
virtual void* Value(Handle* handle) = 0;
|
||||
|
||||
// If the cache contains entry for key, erase it. Note that the
|
||||
// underlying entry will be kept around until all existing handles
|
||||
// to it have been released.
|
||||
virtual void Erase(const Slice& key) = 0;
|
||||
|
||||
// Return a new numeric id. May be used by multiple clients who are
|
||||
// sharing the same cache to partition the key space. Typically the
|
||||
// client will allocate a new id at startup and prepend the id to
|
||||
// its cache keys.
|
||||
virtual uint64_t NewId() = 0;
|
||||
|
||||
// returns the maximum configured capacity of the cache
|
||||
virtual size_t GetCapacity() const = 0;
|
||||
|
||||
// returns the memory size for the entries residing in the cache.
|
||||
virtual size_t GetUsage() const = 0;
|
||||
|
||||
// Call this on shutdown if you want to speed it up. Cache will disown
|
||||
// any underlying data and will not free it on delete. This call will leak
|
||||
// memory - call this only if you're shutting down the process.
|
||||
// Any attempts of using cache after this call will fail terribly.
|
||||
// Always delete the DB object before calling this method!
|
||||
virtual void DisownData() {
|
||||
// default implementation is noop
|
||||
};
|
||||
|
||||
// Apply callback to all entries in the cache
|
||||
// If thread_safe is true, it will also lock the accesses. Otherwise, it will
|
||||
// access the cache without the lock held
|
||||
virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t),
|
||||
bool thread_safe) = 0;
|
||||
|
||||
private:
|
||||
void LRU_Remove(Handle* e);
|
||||
void LRU_Append(Handle* e);
|
||||
void Unref(Handle* e);
|
||||
|
||||
struct Rep;
|
||||
Rep* rep_;
|
||||
|
||||
// No copying allowed
|
||||
Cache(const Cache&);
|
||||
void operator=(const Cache&);
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif  // STORAGE_ROCKSDB_INCLUDE_CACHE_H_
|
||||
198
include/rocksdb/compaction_filter.h
Normal file
198
include/rocksdb/compaction_filter.h
Normal file
@@ -0,0 +1,198 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifndef STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
|
||||
#define STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class Slice;
|
||||
class SliceTransform;
|
||||
|
||||
// Context information describing one compaction run; handed to
// CompactionFilterFactoryV2 when a filter is created.
struct CompactionFilterContext {
  // True when this compaction run covers all data files.
  bool is_full_compaction;
  // True when the compaction was explicitly requested by the client;
  // false when it is part of the automatic compaction process.
  bool is_manual_compaction;
};
|
||||
|
||||
// CompactionFilter allows an application to modify/delete a key-value at
|
||||
// the time of compaction.
|
||||
|
||||
class CompactionFilter {
|
||||
public:
|
||||
// Context information of a compaction run
|
||||
struct Context {
|
||||
// Does this compaction run include all data files
|
||||
bool is_full_compaction;
|
||||
// Is this compaction requested by the client (true),
|
||||
// or is it occurring as an automatic compaction process
|
||||
bool is_manual_compaction;
|
||||
};
|
||||
|
||||
virtual ~CompactionFilter() {}
|
||||
|
||||
// The compaction process invokes this
|
||||
// method for kv that is being compacted. A return value
|
||||
// of false indicates that the kv should be preserved in the
|
||||
// output of this compaction run and a return value of true
|
||||
// indicates that this key-value should be removed from the
|
||||
// output of the compaction. The application can inspect
|
||||
// the existing value of the key and make decision based on it.
|
||||
//
|
||||
// When the value is to be preserved, the application has the option
|
||||
// to modify the existing_value and pass it back through new_value.
|
||||
// value_changed needs to be set to true in this case.
|
||||
//
|
||||
// If multithreaded compaction is being used *and* a single CompactionFilter
|
||||
// instance was supplied via Options::compaction_filter, this method may be
|
||||
// called from different threads concurrently. The application must ensure
|
||||
// that the call is thread-safe.
|
||||
//
|
||||
// If the CompactionFilter was created by a factory, then it will only ever
|
||||
// be used by a single thread that is doing the compaction run, and this
|
||||
// call does not need to be thread-safe. However, multiple filters may be
|
||||
// in existence and operating concurrently.
|
||||
virtual bool Filter(int level,
|
||||
const Slice& key,
|
||||
const Slice& existing_value,
|
||||
std::string* new_value,
|
||||
bool* value_changed) const = 0;
|
||||
|
||||
// Returns a name that identifies this compaction filter.
|
||||
// The name will be printed to LOG file on start up for diagnosis.
|
||||
virtual const char* Name() const = 0;
|
||||
};
|
||||
|
||||
// CompactionFilterV2 that buffers kv pairs sharing the same prefix and let
|
||||
// application layer to make individual decisions for all the kv pairs in the
|
||||
// buffer.
|
||||
class CompactionFilterV2 {
|
||||
public:
|
||||
virtual ~CompactionFilterV2() {}
|
||||
|
||||
// The compaction process invokes this method for all the kv pairs
|
||||
// sharing the same prefix. It is a "roll-up" version of CompactionFilter.
|
||||
//
|
||||
// Each entry in the return vector indicates if the corresponding kv should
|
||||
// be preserved in the output of this compaction run. The application can
|
||||
// inspect the existing values of the keys and make decision based on it.
|
||||
//
|
||||
// When a value is to be preserved, the application has the option
|
||||
// to modify the entry in existing_values and pass it back through an entry
|
||||
// in new_values. A corresponding values_changed entry needs to be set to
|
||||
// true in this case. Note that the new_values vector contains only changed
|
||||
// values, i.e. new_values.size() <= values_changed.size().
|
||||
//
|
||||
typedef std::vector<Slice> SliceVector;
|
||||
virtual std::vector<bool> Filter(int level,
|
||||
const SliceVector& keys,
|
||||
const SliceVector& existing_values,
|
||||
std::vector<std::string>* new_values,
|
||||
std::vector<bool>* values_changed)
|
||||
const = 0;
|
||||
|
||||
// Returns a name that identifies this compaction filter.
|
||||
// The name will be printed to LOG file on start up for diagnosis.
|
||||
virtual const char* Name() const = 0;
|
||||
};
|
||||
|
||||
// Each compaction will create a new CompactionFilter allowing the
|
||||
// application to know about different compactions
|
||||
class CompactionFilterFactory {
|
||||
public:
|
||||
virtual ~CompactionFilterFactory() { }
|
||||
|
||||
virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
|
||||
const CompactionFilter::Context& context) = 0;
|
||||
|
||||
// Returns a name that identifies this compaction filter factory.
|
||||
virtual const char* Name() const = 0;
|
||||
};
|
||||
|
||||
// Default implementation of CompactionFilterFactory which does not
|
||||
// return any filter
|
||||
class DefaultCompactionFilterFactory : public CompactionFilterFactory {
|
||||
public:
|
||||
virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
|
||||
const CompactionFilter::Context& context) override {
|
||||
return std::unique_ptr<CompactionFilter>(nullptr);
|
||||
}
|
||||
|
||||
virtual const char* Name() const override {
|
||||
return "DefaultCompactionFilterFactory";
|
||||
}
|
||||
};
|
||||
|
||||
// Each compaction will create a new CompactionFilterV2
|
||||
//
|
||||
// CompactionFilterFactoryV2 enables application to specify a prefix and use
|
||||
// CompactionFilterV2 to filter kv-pairs in batches. Each batch contains all
|
||||
// the kv-pairs sharing the same prefix.
|
||||
//
|
||||
// This is useful for applications that require grouping kv-pairs in
|
||||
// compaction filter to make a purge/no-purge decision. For example, if the
|
||||
// key prefix is user id and the rest of key represents the type of value.
|
||||
// This batching filter will come in handy if the application's compaction
|
||||
// filter requires knowledge of all types of values for any user id.
|
||||
//
|
||||
class CompactionFilterFactoryV2 {
|
||||
public:
|
||||
// NOTE: CompactionFilterFactoryV2 will not delete prefix_extractor
|
||||
explicit CompactionFilterFactoryV2(const SliceTransform* prefix_extractor)
|
||||
: prefix_extractor_(prefix_extractor) { }
|
||||
|
||||
virtual ~CompactionFilterFactoryV2() { }
|
||||
|
||||
virtual std::unique_ptr<CompactionFilterV2> CreateCompactionFilterV2(
|
||||
const CompactionFilterContext& context) = 0;
|
||||
|
||||
// Returns a name that identifies this compaction filter factory.
|
||||
virtual const char* Name() const = 0;
|
||||
|
||||
const SliceTransform* GetPrefixExtractor() const {
|
||||
return prefix_extractor_;
|
||||
}
|
||||
|
||||
void SetPrefixExtractor(const SliceTransform* prefix_extractor) {
|
||||
prefix_extractor_ = prefix_extractor;
|
||||
}
|
||||
|
||||
private:
|
||||
// Prefix extractor for compaction filter v2
|
||||
// Keys sharing the same prefix will be buffered internally.
|
||||
// Client can implement a Filter callback function to operate on the buffer
|
||||
const SliceTransform* prefix_extractor_;
|
||||
};
|
||||
|
||||
// Default implementation of CompactionFilterFactoryV2 which does not
|
||||
// return any filter
|
||||
class DefaultCompactionFilterFactoryV2 : public CompactionFilterFactoryV2 {
|
||||
public:
|
||||
explicit DefaultCompactionFilterFactoryV2()
|
||||
: CompactionFilterFactoryV2(nullptr) { }
|
||||
|
||||
virtual std::unique_ptr<CompactionFilterV2>
|
||||
CreateCompactionFilterV2(
|
||||
const CompactionFilterContext& context) override {
|
||||
return std::unique_ptr<CompactionFilterV2>(nullptr);
|
||||
}
|
||||
|
||||
virtual const char* Name() const override {
|
||||
return "DefaultCompactionFilterFactoryV2";
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
|
||||
67
include/rocksdb/comparator.h
Normal file
67
include/rocksdb/comparator.h
Normal file
@@ -0,0 +1,67 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifndef STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_
|
||||
#define STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_
|
||||
|
||||
#include <string>
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class Slice;
|
||||
|
||||
// A Comparator object provides a total order across slices that are
|
||||
// used as keys in an sstable or a database. A Comparator implementation
|
||||
// must be thread-safe since rocksdb may invoke its methods concurrently
|
||||
// from multiple threads.
|
||||
class Comparator {
|
||||
public:
|
||||
virtual ~Comparator();
|
||||
|
||||
// Three-way comparison. Returns value:
|
||||
// < 0 iff "a" < "b",
|
||||
// == 0 iff "a" == "b",
|
||||
// > 0 iff "a" > "b"
|
||||
virtual int Compare(const Slice& a, const Slice& b) const = 0;
|
||||
|
||||
// The name of the comparator. Used to check for comparator
|
||||
// mismatches (i.e., a DB created with one comparator is
|
||||
// accessed using a different comparator.
|
||||
//
|
||||
// The client of this package should switch to a new name whenever
|
||||
// the comparator implementation changes in a way that will cause
|
||||
// the relative ordering of any two keys to change.
|
||||
//
|
||||
// Names starting with "rocksdb." are reserved and should not be used
|
||||
// by any clients of this package.
|
||||
virtual const char* Name() const = 0;
|
||||
|
||||
// Advanced functions: these are used to reduce the space requirements
|
||||
// for internal data structures like index blocks.
|
||||
|
||||
// If *start < limit, changes *start to a short string in [start,limit).
|
||||
// Simple comparator implementations may return with *start unchanged,
|
||||
// i.e., an implementation of this method that does nothing is correct.
|
||||
virtual void FindShortestSeparator(
|
||||
std::string* start,
|
||||
const Slice& limit) const = 0;
|
||||
|
||||
// Changes *key to a short string >= *key.
|
||||
// Simple comparator implementations may return with *key unchanged,
|
||||
// i.e., an implementation of this method that does nothing is correct.
|
||||
virtual void FindShortSuccessor(std::string* key) const = 0;
|
||||
};
|
||||
|
||||
// Return a builtin comparator that uses lexicographic byte-wise
|
||||
// ordering. The result remains the property of this module and
|
||||
// must not be deleted.
|
||||
extern const Comparator* BytewiseComparator();
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_
|
||||
507
include/rocksdb/db.h
Normal file
507
include/rocksdb/db.h
Normal file
@@ -0,0 +1,507 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifndef STORAGE_ROCKSDB_INCLUDE_DB_H_
|
||||
#define STORAGE_ROCKSDB_INCLUDE_DB_H_
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include "rocksdb/version.h"
|
||||
#include "rocksdb/iterator.h"
|
||||
#include "rocksdb/options.h"
|
||||
#include "rocksdb/types.h"
|
||||
#include "rocksdb/transaction_log.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
using std::unique_ptr;
|
||||
|
||||
// Opaque handle through which a client refers to one column family of a
// DB; concrete implementations live inside the database layer.
class ColumnFamilyHandle {
 public:
  virtual ~ColumnFamilyHandle() {}
};
|
||||
extern const std::string kDefaultColumnFamilyName;
|
||||
|
||||
struct ColumnFamilyDescriptor {
|
||||
std::string name;
|
||||
ColumnFamilyOptions options;
|
||||
ColumnFamilyDescriptor()
|
||||
: name(kDefaultColumnFamilyName), options(ColumnFamilyOptions()) {}
|
||||
ColumnFamilyDescriptor(const std::string& _name,
|
||||
const ColumnFamilyOptions& _options)
|
||||
: name(_name), options(_options) {}
|
||||
};
|
||||
|
||||
static const int kMajorVersion = __ROCKSDB_MAJOR__;
|
||||
static const int kMinorVersion = __ROCKSDB_MINOR__;
|
||||
|
||||
struct Options;
|
||||
struct ReadOptions;
|
||||
struct WriteOptions;
|
||||
struct FlushOptions;
|
||||
struct TableProperties;
|
||||
class WriteBatch;
|
||||
class Env;
|
||||
|
||||
// Metadata associated with each SST file.
|
||||
struct LiveFileMetaData {
|
||||
std::string column_family_name; // Name of the column family
|
||||
std::string db_path;
|
||||
std::string name; // Name of the file
|
||||
int level; // Level at which this file resides.
|
||||
size_t size; // File size in bytes.
|
||||
std::string smallestkey; // Smallest user defined key in the file.
|
||||
std::string largestkey; // Largest user defined key in the file.
|
||||
SequenceNumber smallest_seqno; // smallest seqno in file
|
||||
SequenceNumber largest_seqno; // largest seqno in file
|
||||
};
|
||||
|
||||
// Abstract handle to a particular state of a DB.  A Snapshot is an
// immutable object, so it can be read from multiple threads without any
// external synchronization.
class Snapshot {
 protected:
  // Destructor is protected: clients cannot delete a Snapshot directly.
  virtual ~Snapshot();
};
|
||||
|
||||
// A range of keys
|
||||
struct Range {
|
||||
Slice start; // Included in the range
|
||||
Slice limit; // Not included in the range
|
||||
|
||||
Range() { }
|
||||
Range(const Slice& s, const Slice& l) : start(s), limit(l) { }
|
||||
};
|
||||
|
||||
// A collections of table properties objects, where
|
||||
// key: is the table's file name.
|
||||
// value: the table properties object of the given table.
|
||||
typedef std::unordered_map<std::string, std::shared_ptr<const TableProperties>>
|
||||
TablePropertiesCollection;
|
||||
|
||||
// A DB is a persistent ordered map from keys to values.
|
||||
// A DB is safe for concurrent access from multiple threads without
|
||||
// any external synchronization.
|
||||
class DB {
|
||||
public:
|
||||
// Open the database with the specified "name".
|
||||
// Stores a pointer to a heap-allocated database in *dbptr and returns
|
||||
// OK on success.
|
||||
// Stores nullptr in *dbptr and returns a non-OK status on error.
|
||||
// Caller should delete *dbptr when it is no longer needed.
|
||||
static Status Open(const Options& options,
|
||||
const std::string& name,
|
||||
DB** dbptr);
|
||||
|
||||
// Open the database for read only. All DB interfaces
|
||||
// that modify data, like put/delete, will return error.
|
||||
// If the db is opened in read only mode, then no compactions
|
||||
// will happen.
|
||||
static Status OpenForReadOnly(const Options& options,
|
||||
const std::string& name, DB** dbptr,
|
||||
bool error_if_log_file_exist = false);
|
||||
|
||||
// Open the database for read only with column families. When opening DB with
|
||||
// read only, you can specify only a subset of column families in the
|
||||
// database that should be opened. However, you always need to specify default
|
||||
// column family. The default column family name is 'default' and it's stored
|
||||
// in rocksdb::kDefaultColumnFamilyName
|
||||
static Status OpenForReadOnly(
|
||||
const DBOptions& db_options, const std::string& name,
|
||||
const std::vector<ColumnFamilyDescriptor>& column_families,
|
||||
std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
|
||||
bool error_if_log_file_exist = false);
|
||||
|
||||
// Open DB with column families.
|
||||
// db_options specify database specific options
|
||||
// column_families is the vector of all column families in the database,
|
||||
// containing column family name and options. You need to open ALL column
|
||||
// families in the database. To get the list of column families, you can use
|
||||
// ListColumnFamilies(). Also, you can open only a subset of column families
|
||||
// for read-only access.
|
||||
// The default column family name is 'default' and it's stored
|
||||
// in rocksdb::kDefaultColumnFamilyName.
|
||||
// If everything is OK, handles will on return be the same size
|
||||
// as column_families --- handles[i] will be a handle that you
|
||||
// will use to operate on column family column_family[i]
|
||||
static Status Open(const DBOptions& db_options, const std::string& name,
|
||||
const std::vector<ColumnFamilyDescriptor>& column_families,
|
||||
std::vector<ColumnFamilyHandle*>* handles, DB** dbptr);
|
||||
|
||||
// ListColumnFamilies will open the DB specified by argument name
|
||||
// and return the list of all column families in that DB
|
||||
// through column_families argument. The ordering of
|
||||
// column families in column_families is unspecified.
|
||||
static Status ListColumnFamilies(const DBOptions& db_options,
|
||||
const std::string& name,
|
||||
std::vector<std::string>* column_families);
|
||||
|
||||
DB() { }
|
||||
virtual ~DB();
|
||||
|
||||
// Create a column_family and return the handle of column family
|
||||
// through the argument handle.
|
||||
virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
|
||||
const std::string& column_family_name,
|
||||
ColumnFamilyHandle** handle);
|
||||
|
||||
// Drop a column family specified by column_family handle. This call
|
||||
// only records a drop record in the manifest and prevents the column
|
||||
// family from flushing and compacting.
|
||||
virtual Status DropColumnFamily(ColumnFamilyHandle* column_family);
|
||||
|
||||
// Set the database entry for "key" to "value".
|
||||
// If "key" already exists, it will be overwritten.
|
||||
// Returns OK on success, and a non-OK status on error.
|
||||
// Note: consider setting options.sync = true.
|
||||
virtual Status Put(const WriteOptions& options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
const Slice& value) = 0;
|
||||
virtual Status Put(const WriteOptions& options, const Slice& key,
|
||||
const Slice& value) {
|
||||
return Put(options, DefaultColumnFamily(), key, value);
|
||||
}
|
||||
|
||||
// Remove the database entry (if any) for "key". Returns OK on
|
||||
// success, and a non-OK status on error. It is not an error if "key"
|
||||
// did not exist in the database.
|
||||
// Note: consider setting options.sync = true.
|
||||
virtual Status Delete(const WriteOptions& options,
|
||||
ColumnFamilyHandle* column_family,
|
||||
const Slice& key) = 0;
|
||||
virtual Status Delete(const WriteOptions& options, const Slice& key) {
|
||||
return Delete(options, DefaultColumnFamily(), key);
|
||||
}
|
||||
|
||||
// Merge the database entry for "key" with "value". Returns OK on success,
|
||||
// and a non-OK status on error. The semantics of this operation is
|
||||
// determined by the user provided merge_operator when opening DB.
|
||||
// Note: consider setting options.sync = true.
|
||||
virtual Status Merge(const WriteOptions& options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
const Slice& value) = 0;
|
||||
virtual Status Merge(const WriteOptions& options, const Slice& key,
|
||||
const Slice& value) {
|
||||
return Merge(options, DefaultColumnFamily(), key, value);
|
||||
}
|
||||
|
||||
// Apply the specified updates to the database.
|
||||
// Returns OK on success, non-OK on failure.
|
||||
// Note: consider setting options.sync = true.
|
||||
virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0;
|
||||
|
||||
// If the database contains an entry for "key" store the
|
||||
// corresponding value in *value and return OK.
|
||||
//
|
||||
// If there is no entry for "key" leave *value unchanged and return
|
||||
// a status for which Status::IsNotFound() returns true.
|
||||
//
|
||||
// May return some other Status on an error.
|
||||
virtual Status Get(const ReadOptions& options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
std::string* value) = 0;
|
||||
virtual Status Get(const ReadOptions& options, const Slice& key, std::string* value) {
|
||||
return Get(options, DefaultColumnFamily(), key, value);
|
||||
}
|
||||
|
||||
// If keys[i] does not exist in the database, then the i'th returned
|
||||
// status will be one for which Status::IsNotFound() is true, and
|
||||
// (*values)[i] will be set to some arbitrary value (often ""). Otherwise,
|
||||
// the i'th returned status will have Status::ok() true, and (*values)[i]
|
||||
// will store the value associated with keys[i].
|
||||
//
|
||||
// (*values) will always be resized to be the same size as (keys).
|
||||
// Similarly, the number of returned statuses will be the number of keys.
|
||||
// Note: keys will not be "de-duplicated". Duplicate keys will return
|
||||
// duplicate values in order.
|
||||
virtual std::vector<Status> MultiGet(
|
||||
const ReadOptions& options,
|
||||
const std::vector<ColumnFamilyHandle*>& column_family,
|
||||
const std::vector<Slice>& keys, std::vector<std::string>* values) = 0;
|
||||
virtual std::vector<Status> MultiGet(const ReadOptions& options,
|
||||
const std::vector<Slice>& keys,
|
||||
std::vector<std::string>* values) {
|
||||
return MultiGet(options, std::vector<ColumnFamilyHandle*>(
|
||||
keys.size(), DefaultColumnFamily()),
|
||||
keys, values);
|
||||
}
|
||||
|
||||
// If the key definitely does not exist in the database, then this method
|
||||
// returns false, else true. If the caller wants to obtain value when the key
|
||||
// is found in memory, a bool for 'value_found' must be passed. 'value_found'
|
||||
// will be true on return if value has been set properly.
|
||||
// This check is potentially lighter-weight than invoking DB::Get(). One way
|
||||
// to make this lighter weight is to avoid doing any IOs.
|
||||
// Default implementation here returns true and sets 'value_found' to false
|
||||
virtual bool KeyMayExist(const ReadOptions& options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
std::string* value, bool* value_found = nullptr) {
|
||||
if (value_found != nullptr) {
|
||||
*value_found = false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
virtual bool KeyMayExist(const ReadOptions& options, const Slice& key,
|
||||
std::string* value, bool* value_found = nullptr) {
|
||||
return KeyMayExist(options, DefaultColumnFamily(), key, value, value_found);
|
||||
}
|
||||
|
||||
// Return a heap-allocated iterator over the contents of the database.
|
||||
// The result of NewIterator() is initially invalid (caller must
|
||||
// call one of the Seek methods on the iterator before using it).
|
||||
//
|
||||
// Caller should delete the iterator when it is no longer needed.
|
||||
// The returned iterator should be deleted before this db is deleted.
|
||||
virtual Iterator* NewIterator(const ReadOptions& options,
|
||||
ColumnFamilyHandle* column_family) = 0;
|
||||
virtual Iterator* NewIterator(const ReadOptions& options) {
|
||||
return NewIterator(options, DefaultColumnFamily());
|
||||
}
|
||||
// Returns iterators from a consistent database state across multiple
|
||||
// column families. Iterators are heap allocated and need to be deleted
|
||||
// before the db is deleted
|
||||
virtual Status NewIterators(
|
||||
const ReadOptions& options,
|
||||
const std::vector<ColumnFamilyHandle*>& column_families,
|
||||
std::vector<Iterator*>* iterators) = 0;
|
||||
|
||||
// Return a handle to the current DB state. Iterators created with
|
||||
// this handle will all observe a stable snapshot of the current DB
|
||||
// state. The caller must call ReleaseSnapshot(result) when the
|
||||
// snapshot is no longer needed.
|
||||
//
|
||||
// nullptr will be returned if the DB fails to take a snapshot or does
|
||||
// not support snapshot.
|
||||
virtual const Snapshot* GetSnapshot() = 0;
|
||||
|
||||
// Release a previously acquired snapshot. The caller must not
|
||||
// use "snapshot" after this call.
|
||||
virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0;
|
||||
|
||||
// DB implementations can export properties about their state
|
||||
// via this method. If "property" is a valid property understood by this
|
||||
// DB implementation, fills "*value" with its current value and returns
|
||||
// true. Otherwise returns false.
|
||||
//
|
||||
//
|
||||
// Valid property names include:
|
||||
//
|
||||
// "rocksdb.num-files-at-level<N>" - return the number of files at level <N>,
|
||||
// where <N> is an ASCII representation of a level number (e.g. "0").
|
||||
// "rocksdb.stats" - returns a multi-line string that describes statistics
|
||||
// about the internal operation of the DB.
|
||||
// "rocksdb.sstables" - returns a multi-line string that describes all
|
||||
// of the sstables that make up the db contents.
|
||||
virtual bool GetProperty(ColumnFamilyHandle* column_family,
|
||||
const Slice& property, std::string* value) = 0;
|
||||
virtual bool GetProperty(const Slice& property, std::string* value) {
|
||||
return GetProperty(DefaultColumnFamily(), property, value);
|
||||
}
|
||||
|
||||
// Similar to GetProperty(), but only works for a subset of properties whose
|
||||
// return value is an integer. Return the value by integer.
|
||||
virtual bool GetIntProperty(ColumnFamilyHandle* column_family,
|
||||
const Slice& property, uint64_t* value) = 0;
|
||||
virtual bool GetIntProperty(const Slice& property, uint64_t* value) {
|
||||
return GetIntProperty(DefaultColumnFamily(), property, value);
|
||||
}
|
||||
|
||||
// For each i in [0,n-1], store in "sizes[i]", the approximate
|
||||
// file system space used by keys in "[range[i].start .. range[i].limit)".
|
||||
//
|
||||
// Note that the returned sizes measure file system space usage, so
|
||||
// if the user data compresses by a factor of ten, the returned
|
||||
// sizes will be one-tenth the size of the corresponding user data size.
|
||||
//
|
||||
// The results may not include the sizes of recently written data.
|
||||
virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
|
||||
const Range* range, int n,
|
||||
uint64_t* sizes) = 0;
|
||||
virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes) {
|
||||
GetApproximateSizes(DefaultColumnFamily(), range, n, sizes);
|
||||
}
|
||||
|
||||
// Compact the underlying storage for the key range [*begin,*end].
|
||||
// The actual compaction interval might be superset of [*begin, *end].
|
||||
// In particular, deleted and overwritten versions are discarded,
|
||||
// and the data is rearranged to reduce the cost of operations
|
||||
// needed to access the data. This operation should typically only
|
||||
// be invoked by users who understand the underlying implementation.
|
||||
//
|
||||
// begin==nullptr is treated as a key before all keys in the database.
|
||||
// end==nullptr is treated as a key after all keys in the database.
|
||||
// Therefore the following call will compact the entire database:
|
||||
// db->CompactRange(nullptr, nullptr);
|
||||
// Note that after the entire database is compacted, all data are pushed
|
||||
// down to the last level containing any data. If the total data size
|
||||
// after compaction is reduced, that level might not be appropriate for
|
||||
// hosting all the files. In this case, client could set reduce_level
|
||||
// to true, to move the files back to the minimum level capable of holding
|
||||
// the data set or a given level (specified by non-negative target_level).
|
||||
// Compaction outputs should be placed in options.db_paths[target_path_id].
|
||||
// Behavior is undefined if target_path_id is out of range.
|
||||
virtual Status CompactRange(ColumnFamilyHandle* column_family,
|
||||
const Slice* begin, const Slice* end,
|
||||
bool reduce_level = false, int target_level = -1,
|
||||
uint32_t target_path_id = 0) = 0;
|
||||
virtual Status CompactRange(const Slice* begin, const Slice* end,
|
||||
bool reduce_level = false, int target_level = -1,
|
||||
uint32_t target_path_id = 0) {
|
||||
return CompactRange(DefaultColumnFamily(), begin, end, reduce_level,
|
||||
target_level, target_path_id);
|
||||
}
|
||||
|
||||
// Number of levels used for this DB.
|
||||
virtual int NumberLevels(ColumnFamilyHandle* column_family) = 0;
|
||||
virtual int NumberLevels() { return NumberLevels(DefaultColumnFamily()); }
|
||||
|
||||
// Maximum level to which a new compacted memtable is pushed if it
|
||||
// does not create overlap.
|
||||
virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) = 0;
|
||||
virtual int MaxMemCompactionLevel() {
|
||||
return MaxMemCompactionLevel(DefaultColumnFamily());
|
||||
}
|
||||
|
||||
// Number of files in level-0 that would stop writes.
|
||||
virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) = 0;
|
||||
virtual int Level0StopWriteTrigger() {
|
||||
return Level0StopWriteTrigger(DefaultColumnFamily());
|
||||
}
|
||||
|
||||
// Get DB name -- the exact same name that was provided as an argument to
|
||||
// DB::Open()
|
||||
virtual const std::string& GetName() const = 0;
|
||||
|
||||
// Get Env object from the DB
|
||||
virtual Env* GetEnv() const = 0;
|
||||
|
||||
// Get DB Options that we use
|
||||
virtual const Options& GetOptions(ColumnFamilyHandle* column_family)
|
||||
const = 0;
|
||||
virtual const Options& GetOptions() const {
|
||||
return GetOptions(DefaultColumnFamily());
|
||||
}
|
||||
|
||||
// Flush all mem-table data.
|
||||
virtual Status Flush(const FlushOptions& options,
|
||||
ColumnFamilyHandle* column_family) = 0;
|
||||
virtual Status Flush(const FlushOptions& options) {
|
||||
return Flush(options, DefaultColumnFamily());
|
||||
}
|
||||
|
||||
// The sequence number of the most recent transaction.
|
||||
virtual SequenceNumber GetLatestSequenceNumber() const = 0;
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
|
||||
// Prevent file deletions. Compactions will continue to occur,
|
||||
// but no obsolete files will be deleted. Calling this multiple
|
||||
// times have the same effect as calling it once.
|
||||
virtual Status DisableFileDeletions() = 0;
|
||||
|
||||
// Allow compactions to delete obsolete files.
|
||||
// If force == true, the call to EnableFileDeletions() will guarantee that
|
||||
// file deletions are enabled after the call, even if DisableFileDeletions()
|
||||
// was called multiple times before.
|
||||
// If force == false, EnableFileDeletions will only enable file deletion
|
||||
// after it's been called at least as many times as DisableFileDeletions(),
|
||||
// enabling the two methods to be called by two threads concurrently without
|
||||
// synchronization -- i.e., file deletions will be enabled only after both
|
||||
// threads call EnableFileDeletions()
|
||||
virtual Status EnableFileDeletions(bool force = true) = 0;
|
||||
|
||||
// GetLiveFiles followed by GetSortedWalFiles can generate a lossless backup
|
||||
|
||||
// THIS METHOD IS DEPRECATED. Use the GetLiveFilesMetaData to get more
|
||||
// detailed information on the live files.
|
||||
// Retrieve the list of all files in the database. The files are
|
||||
// relative to the dbname and are not absolute paths. The valid size of the
|
||||
// manifest file is returned in manifest_file_size. The manifest file is an
|
||||
// ever growing file, but only the portion specified by manifest_file_size is
|
||||
// valid for this snapshot.
|
||||
// Setting flush_memtable to true does Flush before recording the live files.
|
||||
// Setting flush_memtable to false is useful when we don't want to wait for
|
||||
// flush which may have to wait for compaction to complete taking an
|
||||
// indeterminate time.
|
||||
//
|
||||
// In case you have multiple column families, even if flush_memtable is true,
|
||||
// you still need to call GetSortedWalFiles after GetLiveFiles to compensate
|
||||
// for new data that arrived to already-flushed column families while other
|
||||
// column families were flushing
|
||||
virtual Status GetLiveFiles(std::vector<std::string>&,
|
||||
uint64_t* manifest_file_size,
|
||||
bool flush_memtable = true) = 0;
|
||||
|
||||
// Retrieve the sorted list of all wal files with earliest file first
|
||||
virtual Status GetSortedWalFiles(VectorLogPtr& files) = 0;
|
||||
|
||||
// Sets iter to an iterator that is positioned at a write-batch containing
|
||||
// seq_number. If the sequence number is non existent, it returns an iterator
|
||||
// at the first available seq_no after the requested seq_no
|
||||
// Returns Status::OK if iterator is valid
|
||||
// Must set WAL_ttl_seconds or WAL_size_limit_MB to large values to
|
||||
// use this api, else the WAL files will get
|
||||
// cleared aggressively and the iterator might keep getting invalid before
|
||||
// an update is read.
|
||||
virtual Status GetUpdatesSince(
|
||||
SequenceNumber seq_number, unique_ptr<TransactionLogIterator>* iter,
|
||||
const TransactionLogIterator::ReadOptions&
|
||||
read_options = TransactionLogIterator::ReadOptions()) = 0;
|
||||
|
||||
// Delete the file name from the db directory and update the internal state to
|
||||
// reflect that. Supports deletion of sst and log files only. 'name' must be
|
||||
// path relative to the db directory. eg. 000001.sst, /archive/000003.log
|
||||
virtual Status DeleteFile(std::string name) = 0;
|
||||
|
||||
// Returns a list of all table files with their level, start key
|
||||
// and end key
|
||||
  // Default implementation is a no-op: *metadata is left untouched by DB
  // implementations that do not override this.
  virtual void GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {}
|
||||
|
||||
#endif // ROCKSDB_LITE
|
||||
|
||||
// Sets the globally unique ID created at database creation time by invoking
|
||||
// Env::GenerateUniqueId(), in identity. Returns Status::OK if identity could
|
||||
// be set properly
|
||||
virtual Status GetDbIdentity(std::string& identity) = 0;
|
||||
|
||||
// Returns default column family handle
|
||||
virtual ColumnFamilyHandle* DefaultColumnFamily() const = 0;
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
|
||||
TablePropertiesCollection* props) = 0;
|
||||
virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) {
|
||||
return GetPropertiesOfAllTables(DefaultColumnFamily(), props);
|
||||
}
|
||||
#endif // ROCKSDB_LITE
|
||||
|
||||
private:
|
||||
// No copying allowed
|
||||
DB(const DB&);
|
||||
void operator=(const DB&);
|
||||
};
|
||||
|
||||
// Destroy the contents of the specified database.
|
||||
// Be very careful using this method.
|
||||
Status DestroyDB(const std::string& name, const Options& options);
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
// If a DB cannot be opened, you may attempt to call this method to
|
||||
// resurrect as much of the contents of the database as possible.
|
||||
// Some data may be lost, so be careful when calling this function
|
||||
// on a database that contains important information.
|
||||
Status RepairDB(const std::string& dbname, const Options& options);
|
||||
#endif
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // STORAGE_ROCKSDB_INCLUDE_DB_H_
|
||||
798
include/rocksdb/env.h
Normal file
798
include/rocksdb/env.h
Normal file
@@ -0,0 +1,798 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// An Env is an interface used by the rocksdb implementation to access
|
||||
// operating system functionality like the filesystem etc. Callers
|
||||
// may wish to provide a custom Env object when opening a database to
|
||||
// get fine-grained control; e.g., to rate limit file system operations.
|
||||
//
|
||||
// All Env implementations are safe for concurrent access from
|
||||
// multiple threads without any external synchronization.
|
||||
|
||||
#ifndef STORAGE_ROCKSDB_INCLUDE_ENV_H_
|
||||
#define STORAGE_ROCKSDB_INCLUDE_ENV_H_
|
||||
|
||||
#include <cstdarg>
|
||||
#include <string>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <stdint.h>
|
||||
#include "rocksdb/status.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class FileLock;
|
||||
class Logger;
|
||||
class RandomAccessFile;
|
||||
class SequentialFile;
|
||||
class Slice;
|
||||
class WritableFile;
|
||||
class RandomRWFile;
|
||||
class Directory;
|
||||
struct DBOptions;
|
||||
class RateLimiter;
|
||||
|
||||
using std::unique_ptr;
|
||||
using std::shared_ptr;
|
||||
|
||||
|
||||
// Options controlling how an Env opens a file for reading or writing.
struct EnvOptions {

  // Construct with default option values.
  EnvOptions();

  // Construct from a DBOptions instance.
  explicit EnvOptions(const DBOptions& options);

  // If true, then allow caching of data in environment buffers.
  bool use_os_buffer = true;

  // If true, then use mmap to read data.
  bool use_mmap_reads = false;

  // If true, then use mmap to write data.
  bool use_mmap_writes = true;

  // If true, set the FD_CLOEXEC flag on the open fd (so the descriptor is
  // not inherited across exec).
  bool set_fd_cloexec = true;

  // Allows OS to incrementally sync files to disk while they are being
  // written, in the background. Issue one request for every bytes_per_sync
  // bytes written. 0 turns it off.
  // Default: 0
  uint64_t bytes_per_sync = 0;

  // If true, we will preallocate the file with FALLOC_FL_KEEP_SIZE flag, which
  // means that file size won't change as part of preallocation.
  // If false, preallocation will also change the file size. This option will
  // improve the performance in workloads where you sync the data on every
  // write. By default, we set it to true for MANIFEST writes and false for
  // WAL writes.
  bool fallocate_with_keep_size = true;

  // If not nullptr, write rate limiting is enabled for flush and compaction.
  RateLimiter* rate_limiter = nullptr;
};
|
||||
|
||||
// Abstract interface through which RocksDB reaches operating-system
// services: file and directory manipulation, file locking, background
// threads, time, and logging.
class Env {
 public:
  Env() { }
  virtual ~Env();

  // Return a default environment suitable for the current operating
  // system.  Sophisticated users may wish to provide their own Env
  // implementation instead of relying on this default environment.
  //
  // The result of Default() belongs to rocksdb and must never be deleted.
  static Env* Default();

  // Create a brand new sequentially-readable file with the specified name.
  // On success, stores a pointer to the new file in *result and returns OK.
  // On failure stores nullptr in *result and returns non-OK.  If the file does
  // not exist, returns a non-OK status.
  //
  // The returned file will only be accessed by one thread at a time.
  virtual Status NewSequentialFile(const std::string& fname,
                                   unique_ptr<SequentialFile>* result,
                                   const EnvOptions& options)
                                   = 0;

  // Create a brand new random access read-only file with the
  // specified name.  On success, stores a pointer to the new file in
  // *result and returns OK.  On failure stores nullptr in *result and
  // returns non-OK.  If the file does not exist, returns a non-OK
  // status.
  //
  // The returned file may be concurrently accessed by multiple threads.
  virtual Status NewRandomAccessFile(const std::string& fname,
                                     unique_ptr<RandomAccessFile>* result,
                                     const EnvOptions& options)
                                     = 0;

  // Create an object that writes to a new file with the specified
  // name.  Deletes any existing file with the same name and creates a
  // new file.  On success, stores a pointer to the new file in
  // *result and returns OK.  On failure stores nullptr in *result and
  // returns non-OK.
  //
  // The returned file will only be accessed by one thread at a time.
  virtual Status NewWritableFile(const std::string& fname,
                                 unique_ptr<WritableFile>* result,
                                 const EnvOptions& options) = 0;

  // Create an object that both reads and writes to a file on
  // specified offsets (random access). If file already exists,
  // does not overwrite it. On success, stores a pointer to the
  // new file in *result and returns OK. On failure stores nullptr
  // in *result and returns non-OK.
  virtual Status NewRandomRWFile(const std::string& fname,
                                 unique_ptr<RandomRWFile>* result,
                                 const EnvOptions& options) = 0;

  // Create an object that represents a directory. Will fail if directory
  // doesn't exist. If the directory exists, it will open the directory
  // and create a new Directory object.
  //
  // On success, stores a pointer to the new Directory in
  // *result and returns OK. On failure stores nullptr in *result and
  // returns non-OK.
  virtual Status NewDirectory(const std::string& name,
                              unique_ptr<Directory>* result) = 0;

  // Returns true iff the named file exists.
  virtual bool FileExists(const std::string& fname) = 0;

  // Store in *result the names of the children of the specified directory.
  // The names are relative to "dir".
  // Original contents of *result are dropped.
  virtual Status GetChildren(const std::string& dir,
                             std::vector<std::string>* result) = 0;

  // Delete the named file.
  virtual Status DeleteFile(const std::string& fname) = 0;

  // Create the specified directory. Returns an error if the directory
  // already exists.
  virtual Status CreateDir(const std::string& dirname) = 0;

  // Create the directory if it is missing. Returns OK if the directory
  // already exists or was successfully created.
  virtual Status CreateDirIfMissing(const std::string& dirname) = 0;

  // Delete the specified directory.
  virtual Status DeleteDir(const std::string& dirname) = 0;

  // Store the size of fname in *file_size.
  virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) = 0;

  // Store the last modification time of fname in *file_mtime.
  virtual Status GetFileModificationTime(const std::string& fname,
                                         uint64_t* file_mtime) = 0;
  // Rename file src to target.
  virtual Status RenameFile(const std::string& src,
                            const std::string& target) = 0;

  // Lock the specified file.  Used to prevent concurrent access to
  // the same db by multiple processes.  On failure, stores nullptr in
  // *lock and returns non-OK.
  //
  // On success, stores a pointer to the object that represents the
  // acquired lock in *lock and returns OK.  The caller should call
  // UnlockFile(*lock) to release the lock.  If the process exits,
  // the lock will be automatically released.
  //
  // If somebody else already holds the lock, finishes immediately
  // with a failure.  I.e., this call does not wait for existing locks
  // to go away.
  //
  // May create the named file if it does not already exist.
  virtual Status LockFile(const std::string& fname, FileLock** lock) = 0;

  // Release the lock acquired by a previous successful call to LockFile.
  // REQUIRES: lock was returned by a successful LockFile() call
  // REQUIRES: lock has not already been unlocked.
  virtual Status UnlockFile(FileLock* lock) = 0;

  // Priority for scheduling a job in the thread pool.
  enum Priority { LOW, HIGH, TOTAL };

  // Priority of an I/O request; used by rate limiting
  // (see WritableFile::SetIOPriority).
  enum IOPriority {
    IO_LOW = 0,
    IO_HIGH = 1,
    IO_TOTAL = 2
  };

  // Arrange to run "(*function)(arg)" once in a background thread, in
  // the thread pool specified by pri. By default, jobs go to the 'LOW'
  // priority thread pool.

  // "function" may run in an unspecified thread.  Multiple functions
  // added to the same Env may run concurrently in different threads.
  // I.e., the caller may not assume that background work items are
  // serialized.
  virtual void Schedule(
      void (*function)(void* arg),
      void* arg,
      Priority pri = LOW) = 0;

  // Start a new thread, invoking "function(arg)" within the new thread.
  // When "function(arg)" returns, the thread will be destroyed.
  virtual void StartThread(void (*function)(void* arg), void* arg) = 0;

  // Wait for all threads started by StartThread to terminate.
  // Default implementation does nothing.
  virtual void WaitForJoin() {}

  // Get thread pool queue length for the specific thread pool.
  // Default implementation always reports an empty queue.
  virtual unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const {
    return 0;
  }

  // *path is set to a temporary directory that can be used for testing. It may
  // or may not have just been created. The directory may or may not differ
  // between runs of the same process, but subsequent calls will return the
  // same directory.
  virtual Status GetTestDirectory(std::string* path) = 0;

  // Create and return a log file for storing informational messages.
  virtual Status NewLogger(const std::string& fname,
                           shared_ptr<Logger>* result) = 0;

  // Returns the number of micro-seconds since some fixed point in time. Only
  // useful for computing deltas of time.
  virtual uint64_t NowMicros() = 0;

  // Returns the number of nano-seconds since some fixed point in time. Only
  // useful for computing deltas of time in one run.
  // Default implementation simply relies on NowMicros, so its resolution is
  // no better than one microsecond.
  virtual uint64_t NowNanos() {
    return NowMicros() * 1000;
  }

  // Sleep/delay the thread for the prescribed number of micro-seconds.
  virtual void SleepForMicroseconds(int micros) = 0;

  // Get the current host name, writing at most len bytes into name.
  virtual Status GetHostName(char* name, uint64_t len) = 0;

  // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC).
  virtual Status GetCurrentTime(int64_t* unix_time) = 0;

  // Get full directory name for this db.
  virtual Status GetAbsolutePath(const std::string& db_path,
                                 std::string* output_path) = 0;

  // The number of background worker threads of a specific thread pool
  // for this environment. 'LOW' is the default pool.
  // default number: 1
  virtual void SetBackgroundThreads(int number, Priority pri = LOW) = 0;

  // Converts seconds-since-Jan-01-1970 to a printable string.
  virtual std::string TimeToString(uint64_t time) = 0;

  // Generates a unique id that can be used to identify a db.
  virtual std::string GenerateUniqueId();

  // OptimizeForLogWrite will create a new EnvOptions object that is a copy of
  // the EnvOptions in the parameters, but is optimized for writing log files.
  // Default implementation returns the copy of the same object.
  virtual EnvOptions OptimizeForLogWrite(const EnvOptions& env_options) const;
  // OptimizeForManifestWrite will create a new EnvOptions object that is a copy
  // of the EnvOptions in the parameters, but is optimized for writing manifest
  // files. Default implementation returns the copy of the same object.
  virtual EnvOptions OptimizeForManifestWrite(const EnvOptions& env_options)
      const;

 private:
  // No copying allowed
  Env(const Env&);
  void operator=(const Env&);
};
|
||||
|
||||
// A file abstraction for reading sequentially through a file.
class SequentialFile {
 public:
  SequentialFile() { }
  virtual ~SequentialFile();

  // Read up to "n" bytes from the file.  "scratch[0..n-1]" may be
  // written by this routine.  Sets "*result" to the data that was
  // read (including if fewer than "n" bytes were successfully read).
  // May set "*result" to point at data in "scratch[0..n-1]", so
  // "scratch[0..n-1]" must be live when "*result" is used.
  // If an error was encountered, returns a non-OK status.
  //
  // REQUIRES: External synchronization
  virtual Status Read(size_t n, Slice* result, char* scratch) = 0;

  // Skip "n" bytes from the file. This is guaranteed to be no
  // slower than reading the same data, but may be faster.
  //
  // If end of file is reached, skipping will stop at the end of the
  // file, and Skip will return OK.
  //
  // REQUIRES: External synchronization
  virtual Status Skip(uint64_t n) = 0;

  // Remove any kind of caching of data from the offset to offset+length
  // of this file. If the length is 0, then it refers to the end of file.
  // If the system is not caching the file contents, then this is a noop.
  // Default implementation reports NotSupported.
  virtual Status InvalidateCache(size_t offset, size_t length) {
    return Status::NotSupported("InvalidateCache not supported.");
  }
};
|
||||
|
||||
// A file abstraction for randomly reading the contents of a file.
class RandomAccessFile {
 public:
  RandomAccessFile() { }
  virtual ~RandomAccessFile();

  // Read up to "n" bytes from the file starting at "offset".
  // "scratch[0..n-1]" may be written by this routine.  Sets "*result"
  // to the data that was read (including if fewer than "n" bytes were
  // successfully read).  May set "*result" to point at data in
  // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
  // "*result" is used.  If an error was encountered, returns a non-OK
  // status.
  //
  // Safe for concurrent use by multiple threads.
  virtual Status Read(uint64_t offset, size_t n, Slice* result,
                      char* scratch) const = 0;

  // Tries to get an unique ID for this file that will be the same each time
  // the file is opened (and will stay the same while the file is open).
  // Furthermore, it tries to make this ID at most "max_size" bytes. If such an
  // ID can be created this function returns the length of the ID and places it
  // in "id"; otherwise, this function returns 0, in which case "id"
  // may not have been modified.
  //
  // This function guarantees, for IDs from a given environment, two unique ids
  // cannot be made equal to each other by adding arbitrary bytes to one of
  // them. That is, no unique ID is the prefix of another.
  //
  // This function guarantees that the returned ID will not be interpretable as
  // a single varint.
  //
  // Note: these IDs are only valid for the duration of the process.
  virtual size_t GetUniqueId(char* id, size_t max_size) const {
    return 0; // Default implementation to prevent issues with backwards
              // compatibility.
  };


  // Access-pattern hints accepted by Hint().
  enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED };

  // Hint to the implementation about the expected access pattern for this
  // file.  Default implementation ignores the hint.
  virtual void Hint(AccessPattern pattern) {}

  // Remove any kind of caching of data from the offset to offset+length
  // of this file. If the length is 0, then it refers to the end of file.
  // If the system is not caching the file contents, then this is a noop.
  // Default implementation reports NotSupported.
  virtual Status InvalidateCache(size_t offset, size_t length) {
    return Status::NotSupported("InvalidateCache not supported.");
  }
};
|
||||
|
||||
// A file abstraction for sequential writing. The implementation
// must provide buffering since callers may append small fragments
// at a time to the file.
class WritableFile {
 public:
  WritableFile()
    : last_preallocated_block_(0),
      preallocation_block_size_(0),
      io_priority_(Env::IO_TOTAL) {
  }
  virtual ~WritableFile();

  // Append data to the end of the file. Implementations are expected to
  // buffer small appends (see class comment).
  virtual Status Append(const Slice& data) = 0;
  virtual Status Close() = 0;
  virtual Status Flush() = 0;
  virtual Status Sync() = 0; // sync data

  /*
   * Sync data and/or metadata as well.
   * By default, sync only data.
   * Override this method for environments where we need to sync
   * metadata as well.
   */
  virtual Status Fsync() {
    return Sync();
  }

  /*
   * Change the priority in rate limiter if rate limiting is enabled.
   * If rate limiting is not enabled, this call has no effect.
   */
  virtual void SetIOPriority(Env::IOPriority pri) {
    io_priority_ = pri;
  }

  /*
   * Get the size of valid data in the file.
   * Default implementation returns 0 (i.e. "unknown / not supported").
   */
  virtual uint64_t GetFileSize() {
    return 0;
  }

  /*
   * Get and set the default pre-allocation block size for writes to
   * this file.  If non-zero, then Allocate will be used to extend the
   * underlying storage of a file (generally via fallocate) if the Env
   * instance supports it.
   */
  void SetPreallocationBlockSize(size_t size) {
    preallocation_block_size_ = size;
  }

  // Report the current preallocation state through the two out-parameters.
  virtual void GetPreallocationStatus(size_t* block_size,
                                      size_t* last_allocated_block) {
    *last_allocated_block = last_preallocated_block_;
    *block_size = preallocation_block_size_;
  }

  // For documentation, refer to RandomAccessFile::GetUniqueId()
  virtual size_t GetUniqueId(char* id, size_t max_size) const {
    return 0; // Default implementation to prevent issues with backwards
              // compatibility.
  }

  // Remove any kind of caching of data from the offset to offset+length
  // of this file. If the length is 0, then it refers to the end of file.
  // If the system is not caching the file contents, then this is a noop.
  // This call has no effect on dirty pages in the cache.
  virtual Status InvalidateCache(size_t offset, size_t length) {
    return Status::NotSupported("InvalidateCache not supported.");
  }

 protected:
  // PrepareWrite performs any necessary preparation for a write
  // before the write actually occurs.  This allows for pre-allocation
  // of space on devices where it can result in less file
  // fragmentation and/or less waste from over-zealous filesystem
  // pre-allocation.
  void PrepareWrite(size_t offset, size_t len) {
    if (preallocation_block_size_ == 0) {
      return;
    }
    // If this write would cross one or more preallocation blocks,
    // determine what the last preallocation block necesessary to
    // cover this write would be and Allocate to that point.
    const auto block_size = preallocation_block_size_;
    size_t new_last_preallocated_block =
      (offset + len + block_size - 1) / block_size;
    if (new_last_preallocated_block > last_preallocated_block_) {
      size_t num_spanned_blocks =
        new_last_preallocated_block - last_preallocated_block_;
      // Return status is intentionally ignored here: preallocation is a
      // best-effort optimization and failure is not fatal to the write.
      Allocate(block_size * last_preallocated_block_,
               block_size * num_spanned_blocks);
      last_preallocated_block_ = new_last_preallocated_block;
    }
  }

  /*
   * Pre-allocate space for a file.
   * Default implementation is a no-op that reports success.
   */
  virtual Status Allocate(off_t offset, off_t len) {
    return Status::OK();
  }

  // Sync a file range with disk.
  // offset is the starting byte of the file range to be synchronized.
  // nbytes specifies the length of the range to be synchronized.
  // This asks the OS to initiate flushing the cached data to disk,
  // without waiting for completion.
  // Default implementation does nothing.
  virtual Status RangeSync(off_t offset, off_t nbytes) {
    return Status::OK();
  }

 private:
  // Index (in units of preallocation_block_size_) of the last block that
  // has been preallocated via Allocate(); see PrepareWrite().
  size_t last_preallocated_block_;
  size_t preallocation_block_size_;
  // No copying allowed
  WritableFile(const WritableFile&);
  void operator=(const WritableFile&);

 protected:
  Env::IOPriority io_priority_;
};
|
||||
|
||||
// A file abstraction for random reading and writing.
class RandomRWFile {
 public:
  RandomRWFile() {}
  virtual ~RandomRWFile() {}

  // Write data from Slice data to file starting from offset
  // Returns IOError on failure, but does not guarantee
  // atomicity of a write.  Returns OK status on success.
  //
  // Safe for concurrent use.
  virtual Status Write(uint64_t offset, const Slice& data) = 0;

  // Read up to "n" bytes from the file starting at "offset".
  // "scratch[0..n-1]" may be written by this routine.  Sets "*result"
  // to the data that was read (including if fewer than "n" bytes were
  // successfully read).  May set "*result" to point at data in
  // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
  // "*result" is used.  If an error was encountered, returns a non-OK
  // status.
  //
  // Safe for concurrent use by multiple threads.
  virtual Status Read(uint64_t offset, size_t n, Slice* result,
                      char* scratch) const = 0;
  virtual Status Close() = 0; // closes the file
  virtual Status Sync() = 0; // sync data

  /*
   * Sync data and/or metadata as well.
   * By default, sync only data.
   * Override this method for environments where we need to sync
   * metadata as well.
   */
  virtual Status Fsync() {
    return Sync();
  }

  /*
   * Pre-allocate space for a file.
   * Default implementation is a no-op that reports success.
   */
  virtual Status Allocate(off_t offset, off_t len) {
    return Status::OK();
  }

 private:
  // No copying allowed
  RandomRWFile(const RandomRWFile&);
  void operator=(const RandomRWFile&);
};
|
||||
|
||||
// Directory object represents collection of files and implements
// filesystem operations that can be executed on directories.
class Directory {
 public:
  virtual ~Directory() {}
  // Fsync directory. Used e.g. to persist a file creation/rename within
  // the directory entry itself.
  virtual Status Fsync() = 0;
};
|
||||
|
||||
// Severity levels for info-log messages. A Logger suppresses any message
// whose level is below its configured level (see Logger::Logv below).
// NUM_INFO_LOG_LEVELS is a sentinel count, not a real level.
enum InfoLogLevel : unsigned char {
  DEBUG_LEVEL = 0,
  INFO_LEVEL,
  WARN_LEVEL,
  ERROR_LEVEL,
  FATAL_LEVEL,
  NUM_INFO_LOG_LEVELS,
};
|
||||
|
||||
// An interface for writing log messages.
class Logger {
 public:
  enum { DO_NOT_SUPPORT_GET_LOG_FILE_SIZE = -1 };
  explicit Logger(const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL)
      : log_level_(log_level) {}
  virtual ~Logger();

  // Write an entry to the log file with the specified format.
  virtual void Logv(const char* format, va_list ap) = 0;

  // Write an entry to the log file with the specified log level
  // and format.  Any log with level under the internal log level
  // of *this (see @SetInfoLogLevel and @GetInfoLogLevel) will not be
  // printed.
  void Logv(const InfoLogLevel log_level, const char* format, va_list ap) {
    // Names indexed by InfoLogLevel value; must stay in sync with the enum.
    static const char* kInfoLogLevelNames[5] = {"DEBUG", "INFO", "WARN",
                                                "ERROR", "FATAL"};
    if (log_level < log_level_) {
      return;
    }

    if (log_level == InfoLogLevel::INFO_LEVEL) {
      // Doesn't print log level if it is INFO level.
      // This is to avoid unexpected performance regression after we add
      // the feature of log level. All the logs before we add the feature
      // are INFO level. We don't want to add extra costs to those existing
      // logging.
      Logv(format, ap);
    } else {
      // Prefix the format string with "[LEVEL] "; a format longer than
      // the local buffer is silently truncated by snprintf.
      char new_format[500];
      snprintf(new_format, sizeof(new_format) - 1, "[%s] %s",
               kInfoLogLevelNames[log_level], format);
      Logv(new_format, ap);
    }
  }

  // Returns the size of the log file, or
  // DO_NOT_SUPPORT_GET_LOG_FILE_SIZE if the implementation does not
  // track it (the default).
  virtual size_t GetLogFileSize() const {
    return DO_NOT_SUPPORT_GET_LOG_FILE_SIZE;
  }
  // Flush to the OS buffers
  virtual void Flush() {}
  virtual InfoLogLevel GetInfoLogLevel() const { return log_level_; }
  virtual void SetInfoLogLevel(const InfoLogLevel log_level) {
    log_level_ = log_level;
  }

 private:
  // No copying allowed
  Logger(const Logger&);
  void operator=(const Logger&);
  InfoLogLevel log_level_;
};
|
||||
|
||||
|
||||
// Identifies a locked file. Opaque handle returned by Env::LockFile and
// released via Env::UnlockFile.
class FileLock {
 public:
  FileLock() { }
  virtual ~FileLock();
 private:
  // No copying allowed
  FileLock(const FileLock&);
  void operator=(const FileLock&);
};
|
||||
|
||||
extern void LogFlush(const shared_ptr<Logger>& info_log);
|
||||
|
||||
extern void Log(const InfoLogLevel log_level,
|
||||
const shared_ptr<Logger>& info_log, const char* format, ...);
|
||||
|
||||
// a set of log functions with different log levels.
|
||||
extern void Debug(const shared_ptr<Logger>& info_log, const char* format, ...);
|
||||
extern void Info(const shared_ptr<Logger>& info_log, const char* format, ...);
|
||||
extern void Warn(const shared_ptr<Logger>& info_log, const char* format, ...);
|
||||
extern void Error(const shared_ptr<Logger>& info_log, const char* format, ...);
|
||||
extern void Fatal(const shared_ptr<Logger>& info_log, const char* format, ...);
|
||||
|
||||
// Log the specified data to *info_log if info_log is non-nullptr.
|
||||
// The default info log level is InfoLogLevel::ERROR.
|
||||
extern void Log(const shared_ptr<Logger>& info_log, const char* format, ...)
|
||||
# if defined(__GNUC__) || defined(__clang__)
|
||||
__attribute__((__format__ (__printf__, 2, 3)))
|
||||
# endif
|
||||
;
|
||||
|
||||
extern void LogFlush(Logger *info_log);
|
||||
|
||||
extern void Log(const InfoLogLevel log_level, Logger* info_log,
|
||||
const char* format, ...);
|
||||
|
||||
// The default info log level is InfoLogLevel::ERROR.
|
||||
extern void Log(Logger* info_log, const char* format, ...)
|
||||
# if defined(__GNUC__) || defined(__clang__)
|
||||
__attribute__((__format__ (__printf__, 2, 3)))
|
||||
# endif
|
||||
;
|
||||
|
||||
// a set of log functions with different log levels.
|
||||
extern void Debug(Logger* info_log, const char* format, ...);
|
||||
extern void Info(Logger* info_log, const char* format, ...);
|
||||
extern void Warn(Logger* info_log, const char* format, ...);
|
||||
extern void Error(Logger* info_log, const char* format, ...);
|
||||
extern void Fatal(Logger* info_log, const char* format, ...);
|
||||
|
||||
// A utility routine: write "data" to the named file.
|
||||
extern Status WriteStringToFile(Env* env, const Slice& data,
|
||||
const std::string& fname,
|
||||
bool should_sync = false);
|
||||
|
||||
// A utility routine: read contents of named file into *data
|
||||
extern Status ReadFileToString(Env* env, const std::string& fname,
|
||||
std::string* data);
|
||||
|
||||
// An implementation of Env that forwards all calls to another Env.
// May be useful to clients who wish to override just part of the
// functionality of another Env.
class EnvWrapper : public Env {
 public:
  // Initialize an EnvWrapper that delegates all calls to *t.
  // NOTE(review): the wrapper does not appear to take ownership of *t;
  // *t presumably must outlive the wrapper — confirm with callers.
  explicit EnvWrapper(Env* t) : target_(t) { }
  virtual ~EnvWrapper();

  // Return the target to which this Env forwards all calls
  Env* target() const { return target_; }

  // The following text is boilerplate that forwards all methods to target()
  Status NewSequentialFile(const std::string& f,
                           unique_ptr<SequentialFile>* r,
                           const EnvOptions& options) {
    return target_->NewSequentialFile(f, r, options);
  }
  Status NewRandomAccessFile(const std::string& f,
                             unique_ptr<RandomAccessFile>* r,
                             const EnvOptions& options) {
    return target_->NewRandomAccessFile(f, r, options);
  }
  Status NewWritableFile(const std::string& f, unique_ptr<WritableFile>* r,
                         const EnvOptions& options) {
    return target_->NewWritableFile(f, r, options);
  }
  Status NewRandomRWFile(const std::string& f, unique_ptr<RandomRWFile>* r,
                         const EnvOptions& options) {
    return target_->NewRandomRWFile(f, r, options);
  }
  virtual Status NewDirectory(const std::string& name,
                              unique_ptr<Directory>* result) {
    return target_->NewDirectory(name, result);
  }
  bool FileExists(const std::string& f) { return target_->FileExists(f); }
  Status GetChildren(const std::string& dir, std::vector<std::string>* r) {
    return target_->GetChildren(dir, r);
  }
  Status DeleteFile(const std::string& f) { return target_->DeleteFile(f); }
  Status CreateDir(const std::string& d) { return target_->CreateDir(d); }
  Status CreateDirIfMissing(const std::string& d) {
    return target_->CreateDirIfMissing(d);
  }
  Status DeleteDir(const std::string& d) { return target_->DeleteDir(d); }
  Status GetFileSize(const std::string& f, uint64_t* s) {
    return target_->GetFileSize(f, s);
  }

  Status GetFileModificationTime(const std::string& fname,
                                 uint64_t* file_mtime) {
    return target_->GetFileModificationTime(fname, file_mtime);
  }

  Status RenameFile(const std::string& s, const std::string& t) {
    return target_->RenameFile(s, t);
  }
  Status LockFile(const std::string& f, FileLock** l) {
    return target_->LockFile(f, l);
  }
  Status UnlockFile(FileLock* l) { return target_->UnlockFile(l); }
  void Schedule(void (*f)(void*), void* a, Priority pri) {
    return target_->Schedule(f, a, pri);
  }
  void StartThread(void (*f)(void*), void* a) {
    return target_->StartThread(f, a);
  }
  void WaitForJoin() { return target_->WaitForJoin(); }
  virtual unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const {
    return target_->GetThreadPoolQueueLen(pri);
  }
  virtual Status GetTestDirectory(std::string* path) {
    return target_->GetTestDirectory(path);
  }
  virtual Status NewLogger(const std::string& fname,
                           shared_ptr<Logger>* result) {
    return target_->NewLogger(fname, result);
  }
  uint64_t NowMicros() {
    return target_->NowMicros();
  }
  void SleepForMicroseconds(int micros) {
    target_->SleepForMicroseconds(micros);
  }
  Status GetHostName(char* name, uint64_t len) {
    return target_->GetHostName(name, len);
  }
  Status GetCurrentTime(int64_t* unix_time) {
    return target_->GetCurrentTime(unix_time);
  }
  Status GetAbsolutePath(const std::string& db_path,
                         std::string* output_path) {
    return target_->GetAbsolutePath(db_path, output_path);
  }
  void SetBackgroundThreads(int num, Priority pri) {
    return target_->SetBackgroundThreads(num, pri);
  }
  std::string TimeToString(uint64_t time) {
    return target_->TimeToString(time);
  }

 private:
  Env* target_;
};
|
||||
|
||||
// Returns a new environment that stores its data in memory and delegates
|
||||
// all non-file-storage tasks to base_env. The caller must delete the result
|
||||
// when it is no longer needed.
|
||||
// *base_env must remain live while the result is in use.
|
||||
Env* NewMemEnv(Env* base_env);
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // STORAGE_ROCKSDB_INCLUDE_ENV_H_
|
||||
74
include/rocksdb/filter_policy.h
Normal file
74
include/rocksdb/filter_policy.h
Normal file
@@ -0,0 +1,74 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// A database can be configured with a custom FilterPolicy object.
|
||||
// This object is responsible for creating a small filter from a set
|
||||
// of keys. These filters are stored in rocksdb and are consulted
|
||||
// automatically by rocksdb to decide whether or not to read some
|
||||
// information from disk. In many cases, a filter can cut down the
|
||||
// number of disk seeks form a handful to a single disk seek per
|
||||
// DB::Get() call.
|
||||
//
|
||||
// Most people will want to use the builtin bloom filter support (see
|
||||
// NewBloomFilterPolicy() below).
|
||||
|
||||
#ifndef STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_
|
||||
#define STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_
|
||||
|
||||
#include <string>
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class Slice;
|
||||
|
||||
// Policy interface for building and querying per-table key filters
// (e.g. bloom filters); see NewBloomFilterPolicy() for the builtin.
class FilterPolicy {
 public:
  virtual ~FilterPolicy();

  // Return the name of this policy.  Note that if the filter encoding
  // changes in an incompatible way, the name returned by this method
  // must be changed.  Otherwise, old incompatible filters may be
  // passed to methods of this type.
  virtual const char* Name() const = 0;

  // keys[0,n-1] contains a list of keys (potentially with duplicates)
  // that are ordered according to the user supplied comparator.
  // Append a filter that summarizes keys[0,n-1] to *dst.
  //
  // Warning: do not change the initial contents of *dst.  Instead,
  // append the newly constructed filter to *dst.
  virtual void CreateFilter(const Slice* keys, int n, std::string* dst)
      const = 0;

  // "filter" contains the data appended by a preceding call to
  // CreateFilter() on this class.  This method must return true if
  // the key was in the list of keys passed to CreateFilter().
  // This method may return true or false if the key was not on the
  // list, but it should aim to return false with a high probability.
  virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0;
};
|
||||
|
||||
// Return a new filter policy that uses a bloom filter with approximately
|
||||
// the specified number of bits per key. A good value for bits_per_key
|
||||
// is 10, which yields a filter with ~ 1% false positive rate.
|
||||
//
|
||||
// Callers must delete the result after any database that is using the
|
||||
// result has been closed.
|
||||
//
|
||||
// Note: if you are using a custom comparator that ignores some parts
|
||||
// of the keys being compared, you must not use NewBloomFilterPolicy()
|
||||
// and must provide your own FilterPolicy that also ignores the
|
||||
// corresponding parts of the keys. For example, if the comparator
|
||||
// ignores trailing spaces, it would be incorrect to use a
|
||||
// FilterPolicy (like NewBloomFilterPolicy) that does not ignore
|
||||
// trailing spaces in keys.
|
||||
extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key);
|
||||
|
||||
}
|
||||
|
||||
#endif // STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_
|
||||
58
include/rocksdb/flush_block_policy.h
Normal file
58
include/rocksdb/flush_block_policy.h
Normal file
@@ -0,0 +1,58 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class Slice;
|
||||
class BlockBuilder;
|
||||
struct Options;
|
||||
|
||||
// FlushBlockPolicy provides a configurable way to determine when to flush a
|
||||
// block in the block based tables,
|
||||
// FlushBlockPolicy provides a configurable way to determine when to flush a
// block in the block based tables.
class FlushBlockPolicy {
 public:
  // Keep track of the key/value sequences and return the boolean value to
  // determine if table builder should flush current data block.
  // Returns true when the builder should flush before adding (key, value).
  virtual bool Update(const Slice& key,
                      const Slice& value) = 0;

  virtual ~FlushBlockPolicy() { }
};
|
||||
|
||||
// Factory for FlushBlockPolicy instances; one policy is created per table
// builder via NewFlushBlockPolicy().
class FlushBlockPolicyFactory {
 public:
  // Return the name of the flush block policy.
  virtual const char* Name() const = 0;

  // Return a new block flush policy that flushes data blocks by data size.
  // FlushBlockPolicy may need to access the metadata of the data block
  // builder to determine when to flush the blocks.
  //
  // Callers must delete the result after any database that is using the
  // result has been closed.
  virtual FlushBlockPolicy* NewFlushBlockPolicy(
      const Options& options, const BlockBuilder& data_block_builder) const = 0;

  virtual ~FlushBlockPolicyFactory() { }
};
|
||||
|
||||
// Default factory: produces policies that flush a data block once it
// reaches a configured size (implementation defined elsewhere).
class FlushBlockBySizePolicyFactory : public FlushBlockPolicyFactory {
 public:
  FlushBlockBySizePolicyFactory() {}

  virtual const char* Name() const override {
    return "FlushBlockBySizePolicyFactory";
  }

  virtual FlushBlockPolicy* NewFlushBlockPolicy(
      const Options& options,
      const BlockBuilder& data_block_builder) const override;
};
|
||||
|
||||
} // rocksdb
|
||||
34
include/rocksdb/iostats_context.h
Normal file
34
include/rocksdb/iostats_context.h
Normal file
@@ -0,0 +1,34 @@
|
||||
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#ifndef INCLUDE_ROCKSDB_IOSTATS_CONTEXT_H_
|
||||
#define INCLUDE_ROCKSDB_IOSTATS_CONTEXT_H_
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string>
|
||||
|
||||
// A thread local context for gathering io-stats efficiently and transparently.
|
||||
namespace rocksdb {
|
||||
|
||||
// Per-thread container of io-stats counters; see the thread-local
// `iostats_context` instance declared below this struct.
struct IOStatsContext {
  // reset all io-stats counter to zero
  void Reset();

  // Human-readable dump of all counters.
  std::string ToString() const;

  // the thread pool id
  uint64_t thread_pool_id;

  // number of bytes that has been written.
  uint64_t bytes_written;
  // number of bytes that has been read.
  uint64_t bytes_read;
};
|
||||
|
||||
extern __thread IOStatsContext iostats_context;
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // INCLUDE_ROCKSDB_IOSTATS_CONTEXT_H_
|
||||
106
include/rocksdb/iterator.h
Normal file
106
include/rocksdb/iterator.h
Normal file
@@ -0,0 +1,106 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// An iterator yields a sequence of key/value pairs from a source.
|
||||
// The following class defines the interface. Multiple implementations
|
||||
// are provided by this library. In particular, iterators are provided
|
||||
// to access the contents of a Table or a DB.
|
||||
//
|
||||
// Multiple threads can invoke const methods on an Iterator without
|
||||
// external synchronization, but if any of the threads may call a
|
||||
// non-const method, all threads accessing the same Iterator must use
|
||||
// external synchronization.
|
||||
|
||||
#ifndef STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_
|
||||
#define STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_
|
||||
|
||||
#include "rocksdb/slice.h"
|
||||
#include "rocksdb/status.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// Iterator over a sorted sequence of key/value pairs; see the file header
// comment for the threading contract.
class Iterator {
 public:
  Iterator();
  virtual ~Iterator();

  // An iterator is either positioned at a key/value pair, or
  // not valid.  This method returns true iff the iterator is valid.
  virtual bool Valid() const = 0;

  // Position at the first key in the source.  The iterator is Valid()
  // after this call iff the source is not empty.
  virtual void SeekToFirst() = 0;

  // Position at the last key in the source.  The iterator is
  // Valid() after this call iff the source is not empty.
  virtual void SeekToLast() = 0;

  // Position at the first key in the source that at or past target
  // The iterator is Valid() after this call iff the source contains
  // an entry that comes at or past target.
  virtual void Seek(const Slice& target) = 0;

  // Moves to the next entry in the source.  After this call, Valid() is
  // true iff the iterator was not positioned at the last entry in the source.
  // REQUIRES: Valid()
  virtual void Next() = 0;

  // Moves to the previous entry in the source.  After this call, Valid() is
  // true iff the iterator was not positioned at the first entry in source.
  // REQUIRES: Valid()
  virtual void Prev() = 0;

  // Return the key for the current entry.  The underlying storage for
  // the returned slice is valid only until the next modification of
  // the iterator.
  // REQUIRES: Valid()
  virtual Slice key() const = 0;

  // Return the value for the current entry.  The underlying storage for
  // the returned slice is valid only until the next modification of
  // the iterator.
  // REQUIRES: !AtEnd() && !AtStart()
  virtual Slice value() const = 0;

  // If an error has occurred, return it.  Else return an ok status.
  // If non-blocking IO is requested and this operation cannot be
  // satisfied without doing some IO, then this returns Status::Incomplete().
  virtual Status status() const = 0;

  // Clients are allowed to register function/arg1/arg2 triples that
  // will be invoked when this iterator is destroyed.
  //
  // Note that unlike all of the preceding methods, this method is
  // not abstract and therefore clients should not override it.
  typedef void (*CleanupFunction)(void* arg1, void* arg2);
  void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2);

 private:
  // Intrusive singly-linked list of registered cleanup callbacks; the
  // first node is stored inline in the iterator.
  struct Cleanup {
    CleanupFunction function;
    void* arg1;
    void* arg2;
    Cleanup* next;
  };
  Cleanup cleanup_;

  // No copying allowed
  Iterator(const Iterator&);
  void operator=(const Iterator&);
};
|
||||
|
||||
// Return an empty iterator (yields nothing).
|
||||
extern Iterator* NewEmptyIterator();
|
||||
|
||||
// Return an empty iterator with the specified status.
|
||||
extern Iterator* NewErrorIterator(const Status& status);
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_
|
||||
37
include/rocksdb/ldb_tool.h
Normal file
37
include/rocksdb/ldb_tool.h
Normal file
@@ -0,0 +1,37 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
#ifndef ROCKSDB_LITE
|
||||
#pragma once
|
||||
#include <string>
|
||||
#include "rocksdb/options.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// An interface for converting a slice to a readable string
class SliceFormatter {
 public:
  virtual ~SliceFormatter() {}
  // Render the slice as a human-readable string for ldb output.
  virtual std::string Format(const Slice& s) const = 0;
};
|
||||
|
||||
// Options for customizing ldb tool (beyond the DB Options)
struct LDBOptions {
  // Create LDBOptions with default values for all fields
  LDBOptions();

  // Key formatter that converts a slice to a readable string.
  // Default: Slice::ToString()
  std::shared_ptr<SliceFormatter> key_formatter;
};
|
||||
|
||||
// Command-line entry point for the ldb administration tool.
class LDBTool {
 public:
  // Parse argv and execute the requested ldb subcommand, using db_options
  // as DB defaults and ldb_options for tool-specific behavior.
  void Run(int argc, char** argv, Options db_options= Options(),
           const LDBOptions& ldb_options = LDBOptions());
};
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // ROCKSDB_LITE
|
||||
287
include/rocksdb/memtablerep.h
Normal file
287
include/rocksdb/memtablerep.h
Normal file
@@ -0,0 +1,287 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// This file contains the interface that must be implemented by any collection
|
||||
// to be used as the backing store for a MemTable. Such a collection must
|
||||
// satisfy the following properties:
|
||||
// (1) It does not store duplicate items.
|
||||
// (2) It uses MemTableRep::KeyComparator to compare items for iteration and
|
||||
// equality.
|
||||
// (3) It can be accessed concurrently by multiple readers and can support
|
||||
// during reads. However, it needn't support multiple concurrent writes.
|
||||
// (4) Items are never deleted.
|
||||
// The liberal use of assertions is encouraged to enforce (1).
|
||||
//
|
||||
// The factory will be passed an Arena object when a new MemTableRep is
|
||||
// requested. The API for this object is in rocksdb/arena.h.
|
||||
//
|
||||
// Users can implement their own memtable representations. We include three
|
||||
// types built in:
|
||||
// - SkipListRep: This is the default; it is backed by a skip list.
|
||||
// - HashSkipListRep: The memtable rep that is best used for keys that are
|
||||
// structured like "prefix:suffix" where iteration within a prefix is
|
||||
// common and iteration across different prefixes is rare. It is backed by
|
||||
// a hash map where each bucket is a skip list.
|
||||
// - VectorRep: This is backed by an unordered std::vector. On iteration, the
|
||||
// vector is sorted. It is intelligent about sorting; once the MarkReadOnly()
|
||||
// has been called, the vector will only be sorted once. It is optimized for
|
||||
// random-write-heavy workloads.
|
||||
//
|
||||
// The last four implementations are designed for situations in which
|
||||
// iteration over the entire collection is rare since doing so requires all the
|
||||
// keys to be copied into a sorted data structure.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <stdint.h>
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class Arena;
|
||||
class LookupKey;
|
||||
class Slice;
|
||||
class SliceTransform;
|
||||
class Logger;
|
||||
|
||||
typedef void* KeyHandle;
|
||||
|
||||
class MemTableRep {
|
||||
public:
|
||||
// KeyComparator provides a means to compare keys, which are internal keys
|
||||
// concatenated with values.
|
||||
class KeyComparator {
|
||||
public:
|
||||
// Compare a and b. Return a negative value if a is less than b, 0 if they
|
||||
// are equal, and a positive value if a is greater than b
|
||||
virtual int operator()(const char* prefix_len_key1,
|
||||
const char* prefix_len_key2) const = 0;
|
||||
|
||||
virtual int operator()(const char* prefix_len_key,
|
||||
const Slice& key) const = 0;
|
||||
|
||||
virtual ~KeyComparator() { }
|
||||
};
|
||||
|
||||
explicit MemTableRep(Arena* arena) : arena_(arena) {}
|
||||
|
||||
// Allocate a buf of len size for storing key. The idea is that a specific
|
||||
// memtable representation knows its underlying data structure better. By
|
||||
// allowing it to allocate memory, it can possibly put correlated stuff
|
||||
// in consecutive memory area to make processor prefetching more efficient.
|
||||
virtual KeyHandle Allocate(const size_t len, char** buf);
|
||||
|
||||
// Insert key into the collection. (The caller will pack key and value into a
|
||||
// single buffer and pass that in as the parameter to Insert).
|
||||
// REQUIRES: nothing that compares equal to key is currently in the
|
||||
// collection.
|
||||
virtual void Insert(KeyHandle handle) = 0;
|
||||
|
||||
// Returns true iff an entry that compares equal to key is in the collection.
|
||||
virtual bool Contains(const char* key) const = 0;
|
||||
|
||||
// Notify this table rep that it will no longer be added to. By default, does
|
||||
// nothing.
|
||||
virtual void MarkReadOnly() { }
|
||||
|
||||
// Look up key from the mem table, since the first key in the mem table whose
|
||||
// user_key matches the one given k, call the function callback_func(), with
|
||||
// callback_args directly forwarded as the first parameter, and the mem table
|
||||
// key as the second parameter. If the return value is false, then terminates.
|
||||
// Otherwise, go through the next key.
|
||||
// It's safe for Get() to terminate after having finished all the potential
|
||||
// key for the k.user_key(), or not.
|
||||
//
|
||||
// Default:
|
||||
// Get() function with a default value of dynamically construct an iterator,
|
||||
// seek and call the call back function.
|
||||
virtual void Get(const LookupKey& k, void* callback_args,
|
||||
bool (*callback_func)(void* arg, const char* entry));
|
||||
|
||||
// Report an approximation of how much memory has been used other than memory
|
||||
// that was allocated through the arena.
|
||||
virtual size_t ApproximateMemoryUsage() = 0;
|
||||
|
||||
virtual ~MemTableRep() { }
|
||||
|
||||
// Iteration over the contents of a skip collection
|
||||
class Iterator {
|
||||
public:
|
||||
// Initialize an iterator over the specified collection.
|
||||
// The returned iterator is not valid.
|
||||
// explicit Iterator(const MemTableRep* collection);
|
||||
virtual ~Iterator() {}
|
||||
|
||||
// Returns true iff the iterator is positioned at a valid node.
|
||||
virtual bool Valid() const = 0;
|
||||
|
||||
// Returns the key at the current position.
|
||||
// REQUIRES: Valid()
|
||||
virtual const char* key() const = 0;
|
||||
|
||||
// Advances to the next position.
|
||||
// REQUIRES: Valid()
|
||||
virtual void Next() = 0;
|
||||
|
||||
// Advances to the previous position.
|
||||
// REQUIRES: Valid()
|
||||
virtual void Prev() = 0;
|
||||
|
||||
// Advance to the first entry with a key >= target
|
||||
virtual void Seek(const Slice& internal_key, const char* memtable_key) = 0;
|
||||
|
||||
// Position at the first entry in collection.
|
||||
// Final state of iterator is Valid() iff collection is not empty.
|
||||
virtual void SeekToFirst() = 0;
|
||||
|
||||
// Position at the last entry in collection.
|
||||
// Final state of iterator is Valid() iff collection is not empty.
|
||||
virtual void SeekToLast() = 0;
|
||||
};
|
||||
|
||||
// Return an iterator over the keys in this representation.
|
||||
// arena: If not null, the arena needs to be used to allocate the Iterator.
|
||||
// When destroying the iterator, the caller will not call "delete"
|
||||
// but Iterator::~Iterator() directly. The destructor needs to destroy
|
||||
// all the states but those allocated in arena.
|
||||
virtual Iterator* GetIterator(Arena* arena = nullptr) = 0;
|
||||
|
||||
// Return an iterator that has a special Seek semantics. The result of
|
||||
// a Seek might only include keys with the same prefix as the target key.
|
||||
// arena: If not null, the arena needs to be used to allocate the Iterator.
|
||||
// When destroying the iterator, the caller will not call "delete"
|
||||
// but Iterator::~Iterator() directly. The destructor needs to destroy
|
||||
// all the states but those allocated in arena.
|
||||
virtual Iterator* GetDynamicPrefixIterator(Arena* arena = nullptr) {
|
||||
return GetIterator(arena);
|
||||
}
|
||||
|
||||
// Return true if the current MemTableRep supports merge operator.
|
||||
// Default: true
|
||||
virtual bool IsMergeOperatorSupported() const { return true; }
|
||||
|
||||
// Return true if the current MemTableRep supports snapshot
|
||||
// Default: true
|
||||
virtual bool IsSnapshotSupported() const { return true; }
|
||||
|
||||
protected:
|
||||
// When *key is an internal key concatenated with the value, returns the
|
||||
// user key.
|
||||
virtual Slice UserKey(const char* key) const;
|
||||
|
||||
Arena* arena_;
|
||||
};
|
||||
|
||||
// This is the base class for all factories that are used by RocksDB to create
|
||||
// new MemTableRep objects
|
||||
class MemTableRepFactory {
|
||||
public:
|
||||
virtual ~MemTableRepFactory() {}
|
||||
virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
|
||||
Arena*, const SliceTransform*,
|
||||
Logger* logger) = 0;
|
||||
virtual const char* Name() const = 0;
|
||||
};
|
||||
|
||||
// This uses a skip list to store keys. It is the default.
|
||||
class SkipListFactory : public MemTableRepFactory {
|
||||
public:
|
||||
virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
|
||||
Arena*, const SliceTransform*,
|
||||
Logger* logger) override;
|
||||
virtual const char* Name() const override { return "SkipListFactory"; }
|
||||
};
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
// This creates MemTableReps that are backed by an std::vector. On iteration,
|
||||
// the vector is sorted. This is useful for workloads where iteration is very
|
||||
// rare and writes are generally not issued after reads begin.
|
||||
//
|
||||
// Parameters:
|
||||
// count: Passed to the constructor of the underlying std::vector of each
|
||||
// VectorRep. On initialization, the underlying array will be at least count
|
||||
// bytes reserved for usage.
|
||||
class VectorRepFactory : public MemTableRepFactory {
|
||||
const size_t count_;
|
||||
|
||||
public:
|
||||
explicit VectorRepFactory(size_t count = 0) : count_(count) { }
|
||||
virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
|
||||
Arena*, const SliceTransform*,
|
||||
Logger* logger) override;
|
||||
virtual const char* Name() const override {
|
||||
return "VectorRepFactory";
|
||||
}
|
||||
};
|
||||
|
||||
// This class contains a fixed array of buckets, each
|
||||
// pointing to a skiplist (null if the bucket is empty).
|
||||
// bucket_count: number of fixed array buckets
|
||||
// skiplist_height: the max height of the skiplist
|
||||
// skiplist_branching_factor: probabilistic size ratio between adjacent
|
||||
// link lists in the skiplist
|
||||
extern MemTableRepFactory* NewHashSkipListRepFactory(
|
||||
size_t bucket_count = 1000000, int32_t skiplist_height = 4,
|
||||
int32_t skiplist_branching_factor = 4
|
||||
);
|
||||
|
||||
// The factory is to create memtables based on a hash table:
|
||||
// it contains a fixed array of buckets, each pointing to either a linked list
|
||||
// or a skip list if number of entries inside the bucket exceeds
|
||||
// threshold_use_skiplist.
|
||||
// @bucket_count: number of fixed array buckets
|
||||
// @huge_page_tlb_size: if <=0, allocate the hash table bytes from malloc.
|
||||
// Otherwise from huge page TLB. The user needs to reserve
|
||||
// huge pages for it to be allocated, like:
|
||||
// sysctl -w vm.nr_hugepages=20
|
||||
// See linux doc Documentation/vm/hugetlbpage.txt
|
||||
// @bucket_entries_logging_threshold: if number of entries in one bucket
|
||||
// exceeds this number, log about it.
|
||||
// @if_log_bucket_dist_when_flash: if true, log distribution of number of
|
||||
// entries when flushing.
|
||||
// @threshold_use_skiplist: a bucket switches to skip list if number of
|
||||
// entries exceed this parameter.
|
||||
extern MemTableRepFactory* NewHashLinkListRepFactory(
|
||||
size_t bucket_count = 50000, size_t huge_page_tlb_size = 0,
|
||||
int bucket_entries_logging_threshold = 4096,
|
||||
bool if_log_bucket_dist_when_flash = true,
|
||||
uint32_t threshold_use_skiplist = 256);
|
||||
|
||||
// This factory creates a cuckoo-hashing based mem-table representation.
|
||||
// Cuckoo-hash is a closed-hash strategy, in which all key/value pairs
|
||||
// are stored in the bucket array itself intead of in some data structures
|
||||
// external to the bucket array. In addition, each key in cuckoo hash
|
||||
// has a constant number of possible buckets in the bucket array. These
|
||||
// two properties together makes cuckoo hash more memory efficient and
|
||||
// a constant worst-case read time. Cuckoo hash is best suitable for
|
||||
// point-lookup workload.
|
||||
//
|
||||
// When inserting a key / value, it first checks whether one of its possible
|
||||
// buckets is empty. If so, the key / value will be inserted to that vacant
|
||||
// bucket. Otherwise, one of the keys originally stored in one of these
|
||||
// possible buckets will be "kicked out" and move to one of its possible
|
||||
// buckets (and possibly kicks out another victim.) In the current
|
||||
// implementation, such "kick-out" path is bounded. If it cannot find a
|
||||
// "kick-out" path for a specific key, this key will be stored in a backup
|
||||
// structure, and the current memtable to be forced to immutable.
|
||||
//
|
||||
// Note that currently this mem-table representation does not support
|
||||
// snapshot (i.e., it only queries latest state) and iterators. In addition,
|
||||
// MultiGet operation might also lose its atomicity due to the lack of
|
||||
// snapshot support.
|
||||
//
|
||||
// Parameters:
|
||||
// write_buffer_size: the write buffer size in bytes.
|
||||
// average_data_size: the average size of key + value in bytes. This value
|
||||
// together with write_buffer_size will be used to compute the number
|
||||
// of buckets.
|
||||
// hash_function_count: the number of hash functions that will be used by
|
||||
// the cuckoo-hash. The number also equals to the number of possible
|
||||
// buckets each key will have.
|
||||
extern MemTableRepFactory* NewHashCuckooRepFactory(
|
||||
size_t write_buffer_size, size_t average_data_size = 64,
|
||||
unsigned int hash_function_count = 4);
|
||||
#endif // ROCKSDB_LITE
|
||||
} // namespace rocksdb
|
||||
182
include/rocksdb/merge_operator.h
Normal file
182
include/rocksdb/merge_operator.h
Normal file
@@ -0,0 +1,182 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#ifndef STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_
|
||||
#define STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <deque>
|
||||
#include "rocksdb/slice.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class Slice;
|
||||
class Logger;
|
||||
|
||||
// The Merge Operator
|
||||
//
|
||||
// Essentially, a MergeOperator specifies the SEMANTICS of a merge, which only
|
||||
// client knows. It could be numeric addition, list append, string
|
||||
// concatenation, edit data structure, ... , anything.
|
||||
// The library, on the other hand, is concerned with the exercise of this
|
||||
// interface, at the right time (during get, iteration, compaction...)
|
||||
//
|
||||
// To use merge, the client needs to provide an object implementing one of
|
||||
// the following interfaces:
|
||||
// a) AssociativeMergeOperator - for most simple semantics (always take
|
||||
// two values, and merge them into one value, which is then put back
|
||||
// into rocksdb); numeric addition and string concatenation are examples;
|
||||
//
|
||||
// b) MergeOperator - the generic class for all the more abstract / complex
|
||||
// operations; one method (FullMerge) to merge a Put/Delete value with a
|
||||
// merge operand; and another method (PartialMerge) that merges multiple
|
||||
// operands together. this is especially useful if your key values have
|
||||
// complex structures but you would still like to support client-specific
|
||||
// incremental updates.
|
||||
//
|
||||
// AssociativeMergeOperator is simpler to implement. MergeOperator is simply
|
||||
// more powerful.
|
||||
//
|
||||
// Refer to rocksdb-merge wiki for more details and example implementations.
|
||||
//
|
||||
class MergeOperator {
|
||||
public:
|
||||
virtual ~MergeOperator() {}
|
||||
|
||||
// Gives the client a way to express the read -> modify -> write semantics
|
||||
// key: (IN) The key that's associated with this merge operation.
|
||||
// Client could multiplex the merge operator based on it
|
||||
// if the key space is partitioned and different subspaces
|
||||
// refer to different types of data which have different
|
||||
// merge operation semantics
|
||||
// existing: (IN) null indicates that the key does not exist before this op
|
||||
// operand_list:(IN) the sequence of merge operations to apply, front() first.
|
||||
// new_value:(OUT) Client is responsible for filling the merge result here
|
||||
// logger: (IN) Client could use this to log errors during merge.
|
||||
//
|
||||
// Return true on success.
|
||||
// All values passed in will be client-specific values. So if this method
|
||||
// returns false, it is because client specified bad data or there was
|
||||
// internal corruption. This will be treated as an error by the library.
|
||||
//
|
||||
// Also make use of the *logger for error messages.
|
||||
virtual bool FullMerge(const Slice& key,
|
||||
const Slice* existing_value,
|
||||
const std::deque<std::string>& operand_list,
|
||||
std::string* new_value,
|
||||
Logger* logger) const = 0;
|
||||
|
||||
// This function performs merge(left_op, right_op)
|
||||
// when both the operands are themselves merge operation types
|
||||
// that you would have passed to a DB::Merge() call in the same order
|
||||
// (i.e.: DB::Merge(key,left_op), followed by DB::Merge(key,right_op)).
|
||||
//
|
||||
// PartialMerge should combine them into a single merge operation that is
|
||||
// saved into *new_value, and then it should return true.
|
||||
// *new_value should be constructed such that a call to
|
||||
// DB::Merge(key, *new_value) would yield the same result as a call
|
||||
// to DB::Merge(key, left_op) followed by DB::Merge(key, right_op).
|
||||
//
|
||||
// The default implementation of PartialMergeMulti will use this function
|
||||
// as a helper, for backward compatibility. Any successor class of
|
||||
// MergeOperator should either implement PartialMerge or PartialMergeMulti,
|
||||
// although implementing PartialMergeMulti is suggested as it is in general
|
||||
// more effective to merge multiple operands at a time instead of two
|
||||
// operands at a time.
|
||||
//
|
||||
// If it is impossible or infeasible to combine the two operations,
|
||||
// leave new_value unchanged and return false. The library will
|
||||
// internally keep track of the operations, and apply them in the
|
||||
// correct order once a base-value (a Put/Delete/End-of-Database) is seen.
|
||||
//
|
||||
// TODO: Presently there is no way to differentiate between error/corruption
|
||||
// and simply "return false". For now, the client should simply return
|
||||
// false in any case it cannot perform partial-merge, regardless of reason.
|
||||
// If there is corruption in the data, handle it in the FullMerge() function,
|
||||
// and return false there. The default implementation of PartialMerge will
|
||||
// always return false.
|
||||
virtual bool PartialMerge(const Slice& key, const Slice& left_operand,
|
||||
const Slice& right_operand, std::string* new_value,
|
||||
Logger* logger) const {
|
||||
return false;
|
||||
}
|
||||
|
||||
// This function performs merge when all the operands are themselves merge
|
||||
// operation types that you would have passed to a DB::Merge() call in the
|
||||
// same order (front() first)
|
||||
// (i.e. DB::Merge(key, operand_list[0]), followed by
|
||||
// DB::Merge(key, operand_list[1]), ...)
|
||||
//
|
||||
// PartialMergeMulti should combine them into a single merge operation that is
|
||||
// saved into *new_value, and then it should return true. *new_value should
|
||||
// be constructed such that a call to DB::Merge(key, *new_value) would yield
|
||||
// the same result as subquential individual calls to DB::Merge(key, operand)
|
||||
// for each operand in operand_list from front() to back().
|
||||
//
|
||||
// The PartialMergeMulti function will be called only when the list of
|
||||
// operands are long enough. The minimum amount of operands that will be
|
||||
// passed to the function are specified by the "min_partial_merge_operands"
|
||||
// option.
|
||||
//
|
||||
// In the default implementation, PartialMergeMulti will invoke PartialMerge
|
||||
// multiple times, where each time it only merges two operands. Developers
|
||||
// should either implement PartialMergeMulti, or implement PartialMerge which
|
||||
// is served as the helper function of the default PartialMergeMulti.
|
||||
virtual bool PartialMergeMulti(const Slice& key,
|
||||
const std::deque<Slice>& operand_list,
|
||||
std::string* new_value, Logger* logger) const;
|
||||
|
||||
// The name of the MergeOperator. Used to check for MergeOperator
|
||||
// mismatches (i.e., a DB created with one MergeOperator is
|
||||
// accessed using a different MergeOperator)
|
||||
// TODO: the name is currently not stored persistently and thus
|
||||
// no checking is enforced. Client is responsible for providing
|
||||
// consistent MergeOperator between DB opens.
|
||||
virtual const char* Name() const = 0;
|
||||
};
|
||||
|
||||
// The simpler, associative merge operator.
|
||||
class AssociativeMergeOperator : public MergeOperator {
|
||||
public:
|
||||
virtual ~AssociativeMergeOperator() {}
|
||||
|
||||
// Gives the client a way to express the read -> modify -> write semantics
|
||||
// key: (IN) The key that's associated with this merge operation.
|
||||
// existing_value:(IN) null indicates the key does not exist before this op
|
||||
// value: (IN) the value to update/merge the existing_value with
|
||||
// new_value: (OUT) Client is responsible for filling the merge result here
|
||||
// logger: (IN) Client could use this to log errors during merge.
|
||||
//
|
||||
// Return true on success.
|
||||
// All values passed in will be client-specific values. So if this method
|
||||
// returns false, it is because client specified bad data or there was
|
||||
// internal corruption. The client should assume that this will be treated
|
||||
// as an error by the library.
|
||||
virtual bool Merge(const Slice& key,
|
||||
const Slice* existing_value,
|
||||
const Slice& value,
|
||||
std::string* new_value,
|
||||
Logger* logger) const = 0;
|
||||
|
||||
|
||||
private:
|
||||
// Default implementations of the MergeOperator functions
|
||||
virtual bool FullMerge(const Slice& key,
|
||||
const Slice* existing_value,
|
||||
const std::deque<std::string>& operand_list,
|
||||
std::string* new_value,
|
||||
Logger* logger) const override;
|
||||
|
||||
virtual bool PartialMerge(const Slice& key,
|
||||
const Slice& left_operand,
|
||||
const Slice& right_operand,
|
||||
std::string* new_value,
|
||||
Logger* logger) const override;
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_
|
||||
1043
include/rocksdb/options.h
Normal file
1043
include/rocksdb/options.h
Normal file
File diff suppressed because it is too large
Load Diff
78
include/rocksdb/perf_context.h
Normal file
78
include/rocksdb/perf_context.h
Normal file
@@ -0,0 +1,78 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#ifndef STORAGE_ROCKSDB_INCLUDE_PERF_CONTEXT_H
|
||||
#define STORAGE_ROCKSDB_INCLUDE_PERF_CONTEXT_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string>
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
enum PerfLevel {
|
||||
kDisable = 0, // disable perf stats
|
||||
kEnableCount = 1, // enable only count stats
|
||||
kEnableTime = 2 // enable time stats too
|
||||
};
|
||||
|
||||
// set the perf stats level
|
||||
void SetPerfLevel(PerfLevel level);
|
||||
|
||||
// get current perf stats level
|
||||
PerfLevel GetPerfLevel();
|
||||
|
||||
// A thread local context for gathering performance counter efficiently
|
||||
// and transparently.
|
||||
|
||||
struct PerfContext {
|
||||
|
||||
void Reset(); // reset all performance counters to zero
|
||||
|
||||
std::string ToString() const;
|
||||
|
||||
uint64_t user_key_comparison_count; // total number of user key comparisons
|
||||
uint64_t block_cache_hit_count; // total number of block cache hits
|
||||
uint64_t block_read_count; // total number of block reads (with IO)
|
||||
uint64_t block_read_byte; // total number of bytes from block reads
|
||||
uint64_t block_read_time; // total time spent on block reads
|
||||
uint64_t block_checksum_time; // total time spent on block checksum
|
||||
uint64_t block_decompress_time; // total time spent on block decompression
|
||||
// total number of internal keys skipped over during iteration (overwritten or
|
||||
// deleted, to be more specific, hidden by a put or delete of the same key)
|
||||
uint64_t internal_key_skipped_count;
|
||||
// total number of deletes skipped over during iteration
|
||||
uint64_t internal_delete_skipped_count;
|
||||
|
||||
uint64_t get_snapshot_time; // total time spent on getting snapshot
|
||||
uint64_t get_from_memtable_time; // total time spent on querying memtables
|
||||
uint64_t get_from_memtable_count; // number of mem tables queried
|
||||
// total time spent after Get() finds a key
|
||||
uint64_t get_post_process_time;
|
||||
uint64_t get_from_output_files_time; // total time reading from output files
|
||||
// total time spent on seeking child iters
|
||||
uint64_t seek_child_seek_time;
|
||||
// number of seek issued in child iterators
|
||||
uint64_t seek_child_seek_count;
|
||||
uint64_t seek_min_heap_time; // total time spent on the merge heap
|
||||
// total time spent on seeking the internal entries
|
||||
uint64_t seek_internal_seek_time;
|
||||
// total time spent on iterating internal entries to find the next user entry
|
||||
uint64_t find_next_user_entry_time;
|
||||
// total time spent on pre or post processing when writing a record
|
||||
uint64_t write_pre_and_post_process_time;
|
||||
uint64_t write_wal_time; // total time spent on writing to WAL
|
||||
// total time spent on writing to mem tables
|
||||
uint64_t write_memtable_time;
|
||||
};
|
||||
|
||||
#if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE)
|
||||
extern PerfContext perf_context;
|
||||
#else
|
||||
extern __thread PerfContext perf_context;
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
60
include/rocksdb/rate_limiter.h
Normal file
60
include/rocksdb/rate_limiter.h
Normal file
@@ -0,0 +1,60 @@
|
||||
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "rocksdb/env.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class RateLimiter {
|
||||
public:
|
||||
virtual ~RateLimiter() {}
|
||||
|
||||
// Request for token to write bytes. If this request can not be satisfied,
|
||||
// the call is blocked. Caller is responsible to make sure
|
||||
// bytes < GetSingleBurstBytes()
|
||||
virtual void Request(const int64_t bytes, const Env::IOPriority pri) = 0;
|
||||
|
||||
// Max bytes can be granted in a single burst
|
||||
virtual int64_t GetSingleBurstBytes() const = 0;
|
||||
|
||||
// Total bytes that go though rate limiter
|
||||
virtual int64_t GetTotalBytesThrough(
|
||||
const Env::IOPriority pri = Env::IO_TOTAL) const = 0;
|
||||
|
||||
// Total # of requests that go though rate limiter
|
||||
virtual int64_t GetTotalRequests(
|
||||
const Env::IOPriority pri = Env::IO_TOTAL) const = 0;
|
||||
};
|
||||
|
||||
// Create a RateLimiter object, which can be shared among RocksDB instances to
|
||||
// control write rate of flush and compaction.
|
||||
// @rate_bytes_per_sec: this is the only parameter you want to set most of the
|
||||
// time. It controls the total write rate of compaction and flush in bytes per
|
||||
// second. Currently, RocksDB does not enforce rate limit for anything other
|
||||
// than flush and compaction, e.g. write to WAL.
|
||||
// @refill_period_us: this controls how often tokens are refilled. For example,
|
||||
// when rate_bytes_per_sec is set to 10MB/s and refill_period_us is set to
|
||||
// 100ms, then 1MB is refilled every 100ms internally. Larger value can lead to
|
||||
// burstier writes while smaller value introduces more CPU overhead.
|
||||
// The default should work for most cases.
|
||||
// @fairness: RateLimiter accepts high-pri requests and low-pri requests.
|
||||
// A low-pri request is usually blocked in favor of hi-pri request. Currently,
|
||||
// RocksDB assigns low-pri to request from compaciton and high-pri to request
|
||||
// from flush. Low-pri requests can get blocked if flush requests come in
|
||||
// continuouly. This fairness parameter grants low-pri requests permission by
|
||||
// 1/fairness chance even though high-pri requests exist to avoid starvation.
|
||||
// You should be good by leaving it at default 10.
|
||||
extern RateLimiter* NewGenericRateLimiter(
|
||||
int64_t rate_bytes_per_sec,
|
||||
int64_t refill_period_us = 100 * 1000,
|
||||
int32_t fairness = 10);
|
||||
|
||||
} // namespace rocksdb
|
||||
137
include/rocksdb/slice.h
Normal file
137
include/rocksdb/slice.h
Normal file
@@ -0,0 +1,137 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// Slice is a simple structure containing a pointer into some external
|
||||
// storage and a size. The user of a Slice must ensure that the slice
|
||||
// is not used after the corresponding external storage has been
|
||||
// deallocated.
|
||||
//
|
||||
// Multiple threads can invoke const methods on a Slice without
|
||||
// external synchronization, but if any of the threads may call a
|
||||
// non-const method, all threads accessing the same Slice must use
|
||||
// external synchronization.
|
||||
|
||||
#ifndef STORAGE_ROCKSDB_INCLUDE_SLICE_H_
|
||||
#define STORAGE_ROCKSDB_INCLUDE_SLICE_H_
|
||||
|
||||
#include <assert.h>
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
#include <string>
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class Slice {
|
||||
public:
|
||||
// Create an empty slice.
|
||||
Slice() : data_(""), size_(0) { }
|
||||
|
||||
// Create a slice that refers to d[0,n-1].
|
||||
Slice(const char* d, size_t n) : data_(d), size_(n) { }
|
||||
|
||||
// Create a slice that refers to the contents of "s"
|
||||
/* implicit */
|
||||
Slice(const std::string& s) : data_(s.data()), size_(s.size()) { }
|
||||
|
||||
// Create a slice that refers to s[0,strlen(s)-1]
|
||||
/* implicit */
|
||||
Slice(const char* s) : data_(s), size_(strlen(s)) { }
|
||||
|
||||
// Return a pointer to the beginning of the referenced data
|
||||
const char* data() const { return data_; }
|
||||
|
||||
// Return the length (in bytes) of the referenced data
|
||||
size_t size() const { return size_; }
|
||||
|
||||
// Return true iff the length of the referenced data is zero
|
||||
bool empty() const { return size_ == 0; }
|
||||
|
||||
// Return the ith byte in the referenced data.
|
||||
// REQUIRES: n < size()
|
||||
char operator[](size_t n) const {
|
||||
assert(n < size());
|
||||
return data_[n];
|
||||
}
|
||||
|
||||
// Change this slice to refer to an empty array
|
||||
void clear() { data_ = ""; size_ = 0; }
|
||||
|
||||
// Drop the first "n" bytes from this slice.
|
||||
void remove_prefix(size_t n) {
|
||||
assert(n <= size());
|
||||
data_ += n;
|
||||
size_ -= n;
|
||||
}
|
||||
|
||||
// Return a string that contains the copy of the referenced data.
|
||||
std::string ToString(bool hex = false) const {
|
||||
if (hex) {
|
||||
std::string result;
|
||||
char buf[10];
|
||||
for (size_t i = 0; i < size_; i++) {
|
||||
snprintf(buf, 10, "%02X", (unsigned char)data_[i]);
|
||||
result += buf;
|
||||
}
|
||||
return result;
|
||||
} else {
|
||||
return std::string(data_, size_);
|
||||
}
|
||||
}
|
||||
|
||||
// Three-way comparison. Returns value:
|
||||
// < 0 iff "*this" < "b",
|
||||
// == 0 iff "*this" == "b",
|
||||
// > 0 iff "*this" > "b"
|
||||
int compare(const Slice& b) const;
|
||||
|
||||
// Return true iff "x" is a prefix of "*this"
|
||||
bool starts_with(const Slice& x) const {
|
||||
return ((size_ >= x.size_) &&
|
||||
(memcmp(data_, x.data_, x.size_) == 0));
|
||||
}
|
||||
|
||||
// private: make these public for rocksdbjni access
|
||||
const char* data_;
|
||||
size_t size_;
|
||||
|
||||
// Intentionally copyable
|
||||
};
|
||||
|
||||
// A set of Slices that are virtually concatenated together. 'parts' points
|
||||
// to an array of Slices. The number of elements in the array is 'num_parts'.
|
||||
struct SliceParts {
|
||||
SliceParts(const Slice* _parts, int _num_parts) :
|
||||
parts(_parts), num_parts(_num_parts) { }
|
||||
SliceParts() : parts(nullptr), num_parts(0) {}
|
||||
|
||||
const Slice* parts;
|
||||
int num_parts;
|
||||
};
|
||||
|
||||
inline bool operator==(const Slice& x, const Slice& y) {
|
||||
return ((x.size() == y.size()) &&
|
||||
(memcmp(x.data(), y.data(), x.size()) == 0));
|
||||
}
|
||||
|
||||
inline bool operator!=(const Slice& x, const Slice& y) {
|
||||
return !(x == y);
|
||||
}
|
||||
|
||||
inline int Slice::compare(const Slice& b) const {
|
||||
const int min_len = (size_ < b.size_) ? size_ : b.size_;
|
||||
int r = memcmp(data_, b.data_, min_len);
|
||||
if (r == 0) {
|
||||
if (size_ < b.size_) r = -1;
|
||||
else if (size_ > b.size_) r = +1;
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // STORAGE_ROCKSDB_INCLUDE_SLICE_H_
|
||||
47
include/rocksdb/slice_transform.h
Normal file
47
include/rocksdb/slice_transform.h
Normal file
@@ -0,0 +1,47 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// Class for specifying user-defined functions which perform a
|
||||
// transformation on a slice. It is not required that every slice
|
||||
// belong to the domain and/or range of a function. Subclasses should
|
||||
// define InDomain and InRange to determine which slices are in either
|
||||
// of these sets respectively.
|
||||
|
||||
#ifndef STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_
|
||||
#define STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_
|
||||
|
||||
#include <string>
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class Slice;
|
||||
|
||||
class SliceTransform {
|
||||
public:
|
||||
virtual ~SliceTransform() {};
|
||||
|
||||
// Return the name of this transformation.
|
||||
virtual const char* Name() const = 0;
|
||||
|
||||
// transform a src in domain to a dst in the range
|
||||
virtual Slice Transform(const Slice& src) const = 0;
|
||||
|
||||
// determine whether this is a valid src upon the function applies
|
||||
virtual bool InDomain(const Slice& src) const = 0;
|
||||
|
||||
// determine whether dst=Transform(src) for some src
|
||||
virtual bool InRange(const Slice& dst) const = 0;
|
||||
};
|
||||
|
||||
extern const SliceTransform* NewFixedPrefixTransform(size_t prefix_len);
|
||||
|
||||
extern const SliceTransform* NewNoopTransform();
|
||||
|
||||
}
|
||||
|
||||
#endif // STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_
|
||||
281
include/rocksdb/statistics.h
Normal file
281
include/rocksdb/statistics.h
Normal file
@@ -0,0 +1,281 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#ifndef STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_
|
||||
#define STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_
|
||||
|
||||
#include <atomic>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
/**
|
||||
* Keep adding ticker's here.
|
||||
* 1. Any ticker should be added before TICKER_ENUM_MAX.
|
||||
* 2. Add a readable string in TickersNameMap below for the newly added ticker.
|
||||
*/
|
||||
enum Tickers : uint32_t {
|
||||
// total block cache misses
|
||||
// REQUIRES: BLOCK_CACHE_MISS == BLOCK_CACHE_INDEX_MISS +
|
||||
// BLOCK_CACHE_FILTER_MISS +
|
||||
// BLOCK_CACHE_DATA_MISS;
|
||||
BLOCK_CACHE_MISS = 0,
|
||||
// total block cache hit
|
||||
// REQUIRES: BLOCK_CACHE_HIT == BLOCK_CACHE_INDEX_HIT +
|
||||
// BLOCK_CACHE_FILTER_HIT +
|
||||
// BLOCK_CACHE_DATA_HIT;
|
||||
BLOCK_CACHE_HIT,
|
||||
// # of blocks added to block cache.
|
||||
BLOCK_CACHE_ADD,
|
||||
// # of times cache miss when accessing index block from block cache.
|
||||
BLOCK_CACHE_INDEX_MISS,
|
||||
// # of times cache hit when accessing index block from block cache.
|
||||
BLOCK_CACHE_INDEX_HIT,
|
||||
// # of times cache miss when accessing filter block from block cache.
|
||||
BLOCK_CACHE_FILTER_MISS,
|
||||
// # of times cache hit when accessing filter block from block cache.
|
||||
BLOCK_CACHE_FILTER_HIT,
|
||||
// # of times cache miss when accessing data block from block cache.
|
||||
BLOCK_CACHE_DATA_MISS,
|
||||
// # of times cache hit when accessing data block from block cache.
|
||||
BLOCK_CACHE_DATA_HIT,
|
||||
// # of times bloom filter has avoided file reads.
|
||||
BLOOM_FILTER_USEFUL,
|
||||
|
||||
// # of memtable hits.
|
||||
MEMTABLE_HIT,
|
||||
// # of memtable misses.
|
||||
MEMTABLE_MISS,
|
||||
|
||||
/**
|
||||
* COMPACTION_KEY_DROP_* count the reasons for key drop during compaction
|
||||
* There are 3 reasons currently.
|
||||
*/
|
||||
COMPACTION_KEY_DROP_NEWER_ENTRY, // key was written with a newer value.
|
||||
COMPACTION_KEY_DROP_OBSOLETE, // The key is obsolete.
|
||||
COMPACTION_KEY_DROP_USER, // user compaction function has dropped the key.
|
||||
|
||||
// Number of keys written to the database via the Put and Write call's
|
||||
NUMBER_KEYS_WRITTEN,
|
||||
// Number of Keys read,
|
||||
NUMBER_KEYS_READ,
|
||||
// Number keys updated, if inplace update is enabled
|
||||
NUMBER_KEYS_UPDATED,
|
||||
// Bytes written / read
|
||||
BYTES_WRITTEN,
|
||||
BYTES_READ,
|
||||
NO_FILE_CLOSES,
|
||||
NO_FILE_OPENS,
|
||||
NO_FILE_ERRORS,
|
||||
// Time system had to wait to do LO-L1 compactions
|
||||
STALL_L0_SLOWDOWN_MICROS,
|
||||
// Time system had to wait to move memtable to L1.
|
||||
STALL_MEMTABLE_COMPACTION_MICROS,
|
||||
// write throttle because of too many files in L0
|
||||
STALL_L0_NUM_FILES_MICROS,
|
||||
RATE_LIMIT_DELAY_MILLIS,
|
||||
NO_ITERATORS, // number of iterators currently open
|
||||
|
||||
// Number of MultiGet calls, keys read, and bytes read
|
||||
NUMBER_MULTIGET_CALLS,
|
||||
NUMBER_MULTIGET_KEYS_READ,
|
||||
NUMBER_MULTIGET_BYTES_READ,
|
||||
|
||||
// Number of deletes records that were not required to be
|
||||
// written to storage because key does not exist
|
||||
NUMBER_FILTERED_DELETES,
|
||||
NUMBER_MERGE_FAILURES,
|
||||
SEQUENCE_NUMBER,
|
||||
|
||||
// number of times bloom was checked before creating iterator on a
|
||||
// file, and the number of times the check was useful in avoiding
|
||||
// iterator creation (and thus likely IOPs).
|
||||
BLOOM_FILTER_PREFIX_CHECKED,
|
||||
BLOOM_FILTER_PREFIX_USEFUL,
|
||||
|
||||
// Number of times we had to reseek inside an iteration to skip
|
||||
// over large number of keys with same userkey.
|
||||
NUMBER_OF_RESEEKS_IN_ITERATION,
|
||||
|
||||
// Record the number of calls to GetUpadtesSince. Useful to keep track of
|
||||
// transaction log iterator refreshes
|
||||
GET_UPDATES_SINCE_CALLS,
|
||||
BLOCK_CACHE_COMPRESSED_MISS, // miss in the compressed block cache
|
||||
BLOCK_CACHE_COMPRESSED_HIT, // hit in the compressed block cache
|
||||
WAL_FILE_SYNCED, // Number of times WAL sync is done
|
||||
WAL_FILE_BYTES, // Number of bytes written to WAL
|
||||
|
||||
// Writes can be processed by requesting thread or by the thread at the
|
||||
// head of the writers queue.
|
||||
WRITE_DONE_BY_SELF,
|
||||
WRITE_DONE_BY_OTHER,
|
||||
WRITE_TIMEDOUT, // Number of writes ending up with timed-out.
|
||||
WRITE_WITH_WAL, // Number of Write calls that request WAL
|
||||
COMPACT_READ_BYTES, // Bytes read during compaction
|
||||
COMPACT_WRITE_BYTES, // Bytes written during compaction
|
||||
FLUSH_WRITE_BYTES, // Bytes written during flush
|
||||
|
||||
// Number of table's properties loaded directly from file, without creating
|
||||
// table reader object.
|
||||
NUMBER_DIRECT_LOAD_TABLE_PROPERTIES,
|
||||
NUMBER_SUPERVERSION_ACQUIRES,
|
||||
NUMBER_SUPERVERSION_RELEASES,
|
||||
NUMBER_SUPERVERSION_CLEANUPS,
|
||||
NUMBER_BLOCK_NOT_COMPRESSED,
|
||||
TICKER_ENUM_MAX
|
||||
};
|
||||
|
||||
// The order of items listed in Tickers should be the same as
|
||||
// the order listed in TickersNameMap
|
||||
const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
|
||||
{BLOCK_CACHE_MISS, "rocksdb.block.cache.miss"},
|
||||
{BLOCK_CACHE_HIT, "rocksdb.block.cache.hit"},
|
||||
{BLOCK_CACHE_ADD, "rocksdb.block.cache.add"},
|
||||
{BLOCK_CACHE_INDEX_MISS, "rocksdb.block.cache.index.miss"},
|
||||
{BLOCK_CACHE_INDEX_HIT, "rocksdb.block.cache.index.hit"},
|
||||
{BLOCK_CACHE_FILTER_MISS, "rocksdb.block.cache.filter.miss"},
|
||||
{BLOCK_CACHE_FILTER_HIT, "rocksdb.block.cache.filter.hit"},
|
||||
{BLOCK_CACHE_DATA_MISS, "rocksdb.block.cache.data.miss"},
|
||||
{BLOCK_CACHE_DATA_HIT, "rocksdb.block.cache.data.hit"},
|
||||
{BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful"},
|
||||
{MEMTABLE_HIT, "rocksdb.memtable.hit"},
|
||||
{MEMTABLE_MISS, "rocksdb.memtable.miss"},
|
||||
{COMPACTION_KEY_DROP_NEWER_ENTRY, "rocksdb.compaction.key.drop.new"},
|
||||
{COMPACTION_KEY_DROP_OBSOLETE, "rocksdb.compaction.key.drop.obsolete"},
|
||||
{COMPACTION_KEY_DROP_USER, "rocksdb.compaction.key.drop.user"},
|
||||
{NUMBER_KEYS_WRITTEN, "rocksdb.number.keys.written"},
|
||||
{NUMBER_KEYS_READ, "rocksdb.number.keys.read"},
|
||||
{NUMBER_KEYS_UPDATED, "rocksdb.number.keys.updated"},
|
||||
{BYTES_WRITTEN, "rocksdb.bytes.written"},
|
||||
{BYTES_READ, "rocksdb.bytes.read"},
|
||||
{NO_FILE_CLOSES, "rocksdb.no.file.closes"},
|
||||
{NO_FILE_OPENS, "rocksdb.no.file.opens"},
|
||||
{NO_FILE_ERRORS, "rocksdb.no.file.errors"},
|
||||
{STALL_L0_SLOWDOWN_MICROS, "rocksdb.l0.slowdown.micros"},
|
||||
{STALL_MEMTABLE_COMPACTION_MICROS, "rocksdb.memtable.compaction.micros"},
|
||||
{STALL_L0_NUM_FILES_MICROS, "rocksdb.l0.num.files.stall.micros"},
|
||||
{RATE_LIMIT_DELAY_MILLIS, "rocksdb.rate.limit.delay.millis"},
|
||||
{NO_ITERATORS, "rocksdb.num.iterators"},
|
||||
{NUMBER_MULTIGET_CALLS, "rocksdb.number.multiget.get"},
|
||||
{NUMBER_MULTIGET_KEYS_READ, "rocksdb.number.multiget.keys.read"},
|
||||
{NUMBER_MULTIGET_BYTES_READ, "rocksdb.number.multiget.bytes.read"},
|
||||
{NUMBER_FILTERED_DELETES, "rocksdb.number.deletes.filtered"},
|
||||
{NUMBER_MERGE_FAILURES, "rocksdb.number.merge.failures"},
|
||||
{SEQUENCE_NUMBER, "rocksdb.sequence.number"},
|
||||
{BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked"},
|
||||
{BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful"},
|
||||
{NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration"},
|
||||
{GET_UPDATES_SINCE_CALLS, "rocksdb.getupdatessince.calls"},
|
||||
{BLOCK_CACHE_COMPRESSED_MISS, "rocksdb.block.cachecompressed.miss"},
|
||||
{BLOCK_CACHE_COMPRESSED_HIT, "rocksdb.block.cachecompressed.hit"},
|
||||
{WAL_FILE_SYNCED, "rocksdb.wal.synced"},
|
||||
{WAL_FILE_BYTES, "rocksdb.wal.bytes"},
|
||||
{WRITE_DONE_BY_SELF, "rocksdb.write.self"},
|
||||
{WRITE_DONE_BY_OTHER, "rocksdb.write.other"},
|
||||
{WRITE_TIMEDOUT, "rocksdb.write.timedout"},
|
||||
{WRITE_WITH_WAL, "rocksdb.write.wal"},
|
||||
{FLUSH_WRITE_BYTES, "rocksdb.flush.write.bytes"},
|
||||
{COMPACT_READ_BYTES, "rocksdb.compact.read.bytes"},
|
||||
{COMPACT_WRITE_BYTES, "rocksdb.compact.write.bytes"},
|
||||
{NUMBER_DIRECT_LOAD_TABLE_PROPERTIES,
|
||||
"rocksdb.number.direct.load.table.properties"},
|
||||
{NUMBER_SUPERVERSION_ACQUIRES, "rocksdb.number.superversion_acquires"},
|
||||
{NUMBER_SUPERVERSION_RELEASES, "rocksdb.number.superversion_releases"},
|
||||
{NUMBER_SUPERVERSION_CLEANUPS, "rocksdb.number.superversion_cleanups"},
|
||||
{NUMBER_BLOCK_NOT_COMPRESSED, "rocksdb.number.block.not_compressed"},
|
||||
};
|
||||
|
||||
/**
|
||||
* Keep adding histogram's here.
|
||||
* Any histogram whould have value less than HISTOGRAM_ENUM_MAX
|
||||
* Add a new Histogram by assigning it the current value of HISTOGRAM_ENUM_MAX
|
||||
* Add a string representation in HistogramsNameMap below
|
||||
* And increment HISTOGRAM_ENUM_MAX
|
||||
*/
|
||||
enum Histograms : uint32_t {
|
||||
DB_GET = 0,
|
||||
DB_WRITE,
|
||||
COMPACTION_TIME,
|
||||
TABLE_SYNC_MICROS,
|
||||
COMPACTION_OUTFILE_SYNC_MICROS,
|
||||
WAL_FILE_SYNC_MICROS,
|
||||
MANIFEST_FILE_SYNC_MICROS,
|
||||
// TIME SPENT IN IO DURING TABLE OPEN
|
||||
TABLE_OPEN_IO_MICROS,
|
||||
DB_MULTIGET,
|
||||
READ_BLOCK_COMPACTION_MICROS,
|
||||
READ_BLOCK_GET_MICROS,
|
||||
WRITE_RAW_BLOCK_MICROS,
|
||||
|
||||
STALL_L0_SLOWDOWN_COUNT,
|
||||
STALL_MEMTABLE_COMPACTION_COUNT,
|
||||
STALL_L0_NUM_FILES_COUNT,
|
||||
HARD_RATE_LIMIT_DELAY_COUNT,
|
||||
SOFT_RATE_LIMIT_DELAY_COUNT,
|
||||
NUM_FILES_IN_SINGLE_COMPACTION,
|
||||
HISTOGRAM_ENUM_MAX,
|
||||
};
|
||||
|
||||
const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
|
||||
{ DB_GET, "rocksdb.db.get.micros" },
|
||||
{ DB_WRITE, "rocksdb.db.write.micros" },
|
||||
{ COMPACTION_TIME, "rocksdb.compaction.times.micros" },
|
||||
{ TABLE_SYNC_MICROS, "rocksdb.table.sync.micros" },
|
||||
{ COMPACTION_OUTFILE_SYNC_MICROS, "rocksdb.compaction.outfile.sync.micros" },
|
||||
{ WAL_FILE_SYNC_MICROS, "rocksdb.wal.file.sync.micros" },
|
||||
{ MANIFEST_FILE_SYNC_MICROS, "rocksdb.manifest.file.sync.micros" },
|
||||
{ TABLE_OPEN_IO_MICROS, "rocksdb.table.open.io.micros" },
|
||||
{ DB_MULTIGET, "rocksdb.db.multiget.micros" },
|
||||
{ READ_BLOCK_COMPACTION_MICROS, "rocksdb.read.block.compaction.micros" },
|
||||
{ READ_BLOCK_GET_MICROS, "rocksdb.read.block.get.micros" },
|
||||
{ WRITE_RAW_BLOCK_MICROS, "rocksdb.write.raw.block.micros" },
|
||||
{ STALL_L0_SLOWDOWN_COUNT, "rocksdb.l0.slowdown.count"},
|
||||
{ STALL_MEMTABLE_COMPACTION_COUNT, "rocksdb.memtable.compaction.count"},
|
||||
{ STALL_L0_NUM_FILES_COUNT, "rocksdb.num.files.stall.count"},
|
||||
{ HARD_RATE_LIMIT_DELAY_COUNT, "rocksdb.hard.rate.limit.delay.count"},
|
||||
{ SOFT_RATE_LIMIT_DELAY_COUNT, "rocksdb.soft.rate.limit.delay.count"},
|
||||
{ NUM_FILES_IN_SINGLE_COMPACTION, "rocksdb.numfiles.in.singlecompaction" },
|
||||
};
|
||||
|
||||
struct HistogramData {
|
||||
double median;
|
||||
double percentile95;
|
||||
double percentile99;
|
||||
double average;
|
||||
double standard_deviation;
|
||||
};
|
||||
|
||||
// Analyze the performance of a db
|
||||
class Statistics {
|
||||
public:
|
||||
virtual ~Statistics() {}
|
||||
|
||||
virtual uint64_t getTickerCount(uint32_t tickerType) const = 0;
|
||||
virtual void histogramData(uint32_t type,
|
||||
HistogramData* const data) const = 0;
|
||||
|
||||
virtual void recordTick(uint32_t tickerType, uint64_t count = 0) = 0;
|
||||
virtual void setTickerCount(uint32_t tickerType, uint64_t count) = 0;
|
||||
virtual void measureTime(uint32_t histogramType, uint64_t time) = 0;
|
||||
|
||||
// String representation of the statistic object.
|
||||
virtual std::string ToString() const = 0;
|
||||
|
||||
// Override this function to disable particular histogram collection
|
||||
virtual bool HistEnabledForType(uint32_t type) const {
|
||||
return type < HISTOGRAM_ENUM_MAX;
|
||||
}
|
||||
};
|
||||
|
||||
// Create a concrete DBStatistics object
|
||||
std::shared_ptr<Statistics> CreateDBStatistics();
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_
|
||||
154
include/rocksdb/status.h
Normal file
154
include/rocksdb/status.h
Normal file
@@ -0,0 +1,154 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// A Status encapsulates the result of an operation. It may indicate success,
|
||||
// or it may indicate an error with an associated error message.
|
||||
//
|
||||
// Multiple threads can invoke const methods on a Status without
|
||||
// external synchronization, but if any of the threads may call a
|
||||
// non-const method, all threads accessing the same Status must use
|
||||
// external synchronization.
|
||||
|
||||
#ifndef STORAGE_ROCKSDB_INCLUDE_STATUS_H_
|
||||
#define STORAGE_ROCKSDB_INCLUDE_STATUS_H_
|
||||
|
||||
#include <string>
|
||||
#include "rocksdb/slice.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class Status {
|
||||
public:
|
||||
// Create a success status.
|
||||
Status() : code_(kOk), state_(nullptr) { }
|
||||
~Status() { delete[] state_; }
|
||||
|
||||
// Copy the specified status.
|
||||
Status(const Status& s);
|
||||
void operator=(const Status& s);
|
||||
|
||||
// Return a success status.
|
||||
static Status OK() { return Status(); }
|
||||
|
||||
// Return error status of an appropriate type.
|
||||
static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) {
|
||||
return Status(kNotFound, msg, msg2);
|
||||
}
|
||||
// Fast path for not found without malloc;
|
||||
static Status NotFound() {
|
||||
return Status(kNotFound);
|
||||
}
|
||||
static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) {
|
||||
return Status(kCorruption, msg, msg2);
|
||||
}
|
||||
static Status NotSupported(const Slice& msg, const Slice& msg2 = Slice()) {
|
||||
return Status(kNotSupported, msg, msg2);
|
||||
}
|
||||
static Status InvalidArgument(const Slice& msg, const Slice& msg2 = Slice()) {
|
||||
return Status(kInvalidArgument, msg, msg2);
|
||||
}
|
||||
static Status IOError(const Slice& msg, const Slice& msg2 = Slice()) {
|
||||
return Status(kIOError, msg, msg2);
|
||||
}
|
||||
static Status MergeInProgress(const Slice& msg, const Slice& msg2 = Slice()) {
|
||||
return Status(kMergeInProgress, msg, msg2);
|
||||
}
|
||||
static Status Incomplete(const Slice& msg, const Slice& msg2 = Slice()) {
|
||||
return Status(kIncomplete, msg, msg2);
|
||||
}
|
||||
static Status ShutdownInProgress(const Slice& msg,
|
||||
const Slice& msg2 = Slice()) {
|
||||
return Status(kShutdownInProgress, msg, msg2);
|
||||
}
|
||||
static Status TimedOut() {
|
||||
return Status(kTimedOut);
|
||||
}
|
||||
static Status TimedOut(const Slice& msg, const Slice& msg2 = Slice()) {
|
||||
return Status(kTimedOut, msg, msg2);
|
||||
}
|
||||
|
||||
// Returns true iff the status indicates success.
|
||||
bool ok() const { return code() == kOk; }
|
||||
|
||||
// Returns true iff the status indicates a NotFound error.
|
||||
bool IsNotFound() const { return code() == kNotFound; }
|
||||
|
||||
// Returns true iff the status indicates a Corruption error.
|
||||
bool IsCorruption() const { return code() == kCorruption; }
|
||||
|
||||
// Returns true iff the status indicates a NotSupported error.
|
||||
bool IsNotSupported() const { return code() == kNotSupported; }
|
||||
|
||||
// Returns true iff the status indicates an InvalidArgument error.
|
||||
bool IsInvalidArgument() const { return code() == kInvalidArgument; }
|
||||
|
||||
// Returns true iff the status indicates an IOError.
|
||||
bool IsIOError() const { return code() == kIOError; }
|
||||
|
||||
// Returns true iff the status indicates an MergeInProgress.
|
||||
bool IsMergeInProgress() const { return code() == kMergeInProgress; }
|
||||
|
||||
// Returns true iff the status indicates Incomplete
|
||||
bool IsIncomplete() const { return code() == kIncomplete; }
|
||||
|
||||
// Returns true iff the status indicates Incomplete
|
||||
bool IsShutdownInProgress() const { return code() == kShutdownInProgress; }
|
||||
|
||||
bool IsTimedOut() const { return code() == kTimedOut; }
|
||||
|
||||
// Return a string representation of this status suitable for printing.
|
||||
// Returns the string "OK" for success.
|
||||
std::string ToString() const;
|
||||
|
||||
enum Code {
|
||||
kOk = 0,
|
||||
kNotFound = 1,
|
||||
kCorruption = 2,
|
||||
kNotSupported = 3,
|
||||
kInvalidArgument = 4,
|
||||
kIOError = 5,
|
||||
kMergeInProgress = 6,
|
||||
kIncomplete = 7,
|
||||
kShutdownInProgress = 8,
|
||||
kTimedOut = 9
|
||||
};
|
||||
|
||||
Code code() const {
|
||||
return code_;
|
||||
}
|
||||
private:
|
||||
// A nullptr state_ (which is always the case for OK) means the message
|
||||
// is empty.
|
||||
// of the following form:
|
||||
// state_[0..3] == length of message
|
||||
// state_[4..] == message
|
||||
Code code_;
|
||||
const char* state_;
|
||||
|
||||
explicit Status(Code code) : code_(code), state_(nullptr) { }
|
||||
Status(Code code, const Slice& msg, const Slice& msg2);
|
||||
static const char* CopyState(const char* s);
|
||||
};
|
||||
|
||||
inline Status::Status(const Status& s) {
|
||||
code_ = s.code_;
|
||||
state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_);
|
||||
}
|
||||
inline void Status::operator=(const Status& s) {
|
||||
// The following condition catches both aliasing (when this == &s),
|
||||
// and the common case where both s and *this are ok.
|
||||
code_ = s.code_;
|
||||
if (state_ != s.state_) {
|
||||
delete[] state_;
|
||||
state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // STORAGE_ROCKSDB_INCLUDE_STATUS_H_
|
||||
270
include/rocksdb/table.h
Normal file
270
include/rocksdb/table.h
Normal file
@@ -0,0 +1,270 @@
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// Currently we support two types of tables: plain table and block-based table.
|
||||
// 1. Block-based table: this is the default table type that we inherited from
|
||||
// LevelDB, which was designed for storing data in hard disk or flash
|
||||
// device.
|
||||
// 2. Plain table: it is one of RocksDB's SST file format optimized
|
||||
// for low query latency on pure-memory or really low-latency media.
|
||||
//
|
||||
// A tutorial of rocksdb table formats is available here:
|
||||
// https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats
|
||||
//
|
||||
// Example code is also available
|
||||
// https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats#wiki-examples
|
||||
|
||||
#pragma once
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/iterator.h"
|
||||
#include "rocksdb/options.h"
|
||||
#include "rocksdb/status.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// -- Block-based Table
|
||||
class FlushBlockPolicyFactory;
|
||||
class RandomAccessFile;
|
||||
class TableBuilder;
|
||||
class TableReader;
|
||||
class WritableFile;
|
||||
struct EnvOptions;
|
||||
struct Options;
|
||||
|
||||
using std::unique_ptr;
|
||||
|
||||
enum ChecksumType : char {
|
||||
kNoChecksum = 0x0, // not yet supported. Will fail
|
||||
kCRC32c = 0x1,
|
||||
kxxHash = 0x2,
|
||||
};
|
||||
|
||||
// For advanced user only
|
||||
struct BlockBasedTableOptions {
|
||||
// @flush_block_policy_factory creates the instances of flush block policy.
|
||||
// which provides a configurable way to determine when to flush a block in
|
||||
// the block based tables. If not set, table builder will use the default
|
||||
// block flush policy, which cut blocks by block size (please refer to
|
||||
// `FlushBlockBySizePolicy`).
|
||||
std::shared_ptr<FlushBlockPolicyFactory> flush_block_policy_factory;
|
||||
|
||||
// TODO(kailiu) Temporarily disable this feature by making the default value
|
||||
// to be false.
|
||||
//
|
||||
// Indicating if we'd put index/filter blocks to the block cache.
|
||||
// If not specified, each "table reader" object will pre-load index/filter
|
||||
// block during table initialization.
|
||||
bool cache_index_and_filter_blocks = false;
|
||||
|
||||
// The index type that will be used for this table.
|
||||
enum IndexType : char {
|
||||
// A space efficient index block that is optimized for
|
||||
// binary-search-based index.
|
||||
kBinarySearch,
|
||||
|
||||
// The hash index, if enabled, will do the hash lookup when
|
||||
// `Options.prefix_extractor` is provided.
|
||||
kHashSearch,
|
||||
};
|
||||
|
||||
IndexType index_type = kBinarySearch;
|
||||
|
||||
// Influence the behavior when kHashSearch is used.
|
||||
// if false, stores a precise prefix to block range mapping
|
||||
// if true, does not store prefix and allows prefix hash collision
|
||||
// (less memory consumption)
|
||||
bool hash_index_allow_collision = true;
|
||||
|
||||
// Use the specified checksum type. Newly created table files will be
|
||||
// protected with this checksum type. Old table files will still be readable,
|
||||
// even though they have different checksum type.
|
||||
ChecksumType checksum = kCRC32c;
|
||||
};
|
||||
|
||||
// Table Properties that are specific to block-based table properties.
|
||||
struct BlockBasedTablePropertyNames {
|
||||
// value of this propertis is a fixed int32 number.
|
||||
static const std::string kIndexType;
|
||||
};
|
||||
|
||||
// Create default block based table factory.
|
||||
extern TableFactory* NewBlockBasedTableFactory(
|
||||
const BlockBasedTableOptions& table_options = BlockBasedTableOptions());
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
|
||||
enum EncodingType : char {
|
||||
// Always write full keys without any special encoding.
|
||||
kPlain,
|
||||
// Find opportunity to write the same prefix once for multiple rows.
|
||||
// In some cases, when a key follows a previous key with the same prefix,
|
||||
// instead of writing out the full key, it just writes out the size of the
|
||||
// shared prefix, as well as other bytes, to save some bytes.
|
||||
//
|
||||
// When using this option, the user is required to use the same prefix
|
||||
// extractor to make sure the same prefix will be extracted from the same key.
|
||||
// The Name() value of the prefix extractor will be stored in the file. When
|
||||
// reopening the file, the name of the options.prefix_extractor given will be
|
||||
// bitwise compared to the prefix extractors stored in the file. An error
|
||||
// will be returned if the two don't match.
|
||||
kPrefix,
|
||||
};
|
||||
|
||||
// Table Properties that are specific to plain table properties.
|
||||
struct PlainTablePropertyNames {
|
||||
static const std::string kPrefixExtractorName;
|
||||
static const std::string kEncodingType;
|
||||
static const std::string kBloomVersion;
|
||||
static const std::string kNumBloomBlocks;
|
||||
};
|
||||
|
||||
const uint32_t kPlainTableVariableLength = 0;
|
||||
|
||||
struct PlainTableOptions {
|
||||
// @user_key_len: plain table has optimization for fix-sized keys, which can be
|
||||
// specified via user_key_len. Alternatively, you can pass
|
||||
// `kPlainTableVariableLength` if your keys have variable
|
||||
// lengths.
|
||||
uint32_t user_key_len = kPlainTableVariableLength;
|
||||
|
||||
// @bloom_bits_per_key: the number of bits used for bloom filer per prefix. You
|
||||
// may disable it by passing a zero.
|
||||
int bloom_bits_per_key = 10;
|
||||
|
||||
// @hash_table_ratio: the desired utilization of the hash table used for prefix
|
||||
// hashing. hash_table_ratio = number of prefixes / #buckets
|
||||
// in the hash table
|
||||
double hash_table_ratio = 0.75;
|
||||
|
||||
// @index_sparseness: inside each prefix, need to build one index record for how
|
||||
// many keys for binary search inside each hash bucket.
|
||||
// For encoding type kPrefix, the value will be used when
|
||||
// writing to determine an interval to rewrite the full key.
|
||||
// It will also be used as a suggestion and satisfied when
|
||||
// possible.
|
||||
size_t index_sparseness = 16;
|
||||
|
||||
// @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc.
|
||||
// Otherwise from huge page TLB. The user needs to reserve
|
||||
// huge pages for it to be allocated, like:
|
||||
// sysctl -w vm.nr_hugepages=20
|
||||
// See linux doc Documentation/vm/hugetlbpage.txt
|
||||
size_t huge_page_tlb_size = 0;
|
||||
|
||||
// @encoding_type: how to encode the keys. See enum EncodingType above for
|
||||
// the choices. The value will determine how to encode keys
|
||||
// when writing to a new SST file. This value will be stored
|
||||
// inside the SST file which will be used when reading from the
|
||||
// file, which makes it possible for users to choose different
|
||||
// encoding type when reopening a DB. Files with different
|
||||
// encoding types can co-exist in the same DB and can be read.
|
||||
EncodingType encoding_type = kPlain;
|
||||
|
||||
// @full_scan_mode: mode for reading the whole file one record by one without
|
||||
// using the index.
|
||||
bool full_scan_mode = false;
|
||||
|
||||
// @store_index_in_file: compute plain table index and bloom filter during
|
||||
// file building and store it in file. When reading
|
||||
// file, index will be mmaped instead of recomputation.
|
||||
bool store_index_in_file = false;
|
||||
};
|
||||
|
||||
// -- Plain Table with prefix-only seek
|
||||
// For this factory, you need to set Options.prefix_extrator properly to make it
|
||||
// work. Look-up will starts with prefix hash lookup for key prefix. Inside the
|
||||
// hash bucket found, a binary search is executed for hash conflicts. Finally,
|
||||
// a linear search is used.
|
||||
|
||||
extern TableFactory* NewPlainTableFactory(const PlainTableOptions& options =
|
||||
PlainTableOptions());
|
||||
|
||||
struct CuckooTablePropertyNames {
|
||||
static const std::string kEmptyKey;
|
||||
static const std::string kValueLength;
|
||||
static const std::string kNumHashTable;
|
||||
static const std::string kMaxNumBuckets;
|
||||
static const std::string kIsLastLevel;
|
||||
};
|
||||
|
||||
#endif // ROCKSDB_LITE
|
||||
|
||||
// A base class for table factories.
|
||||
class TableFactory {
|
||||
public:
|
||||
virtual ~TableFactory() {}
|
||||
|
||||
// The type of the table.
|
||||
//
|
||||
// The client of this package should switch to a new name whenever
|
||||
// the table format implementation changes.
|
||||
//
|
||||
// Names starting with "rocksdb." are reserved and should not be used
|
||||
// by any clients of this package.
|
||||
virtual const char* Name() const = 0;
|
||||
|
||||
// Returns a Table object table that can fetch data from file specified
|
||||
// in parameter file. It's the caller's responsibility to make sure
|
||||
// file is in the correct format.
|
||||
//
|
||||
// NewTableReader() is called in two places:
|
||||
// (1) TableCache::FindTable() calls the function when table cache miss
|
||||
// and cache the table object returned.
|
||||
// (1) SstFileReader (for SST Dump) opens the table and dump the table
|
||||
// contents using the interator of the table.
|
||||
// options and soptions are options. options is the general options.
|
||||
// Multiple configured can be accessed from there, including and not
|
||||
// limited to block cache and key comparators.
|
||||
// file is a file handler to handle the file for the table
|
||||
// file_size is the physical file size of the file
|
||||
// table_reader is the output table reader
|
||||
virtual Status NewTableReader(
|
||||
const Options& options, const EnvOptions& soptions,
|
||||
const InternalKeyComparator& internal_comparator,
|
||||
unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
|
||||
unique_ptr<TableReader>* table_reader) const = 0;
|
||||
|
||||
// Return a table builder to write to a file for this table type.
|
||||
//
|
||||
// It is called in several places:
|
||||
// (1) When flushing memtable to a level-0 output file, it creates a table
|
||||
// builder (In DBImpl::WriteLevel0Table(), by calling BuildTable())
|
||||
// (2) During compaction, it gets the builder for writing compaction output
|
||||
// files in DBImpl::OpenCompactionOutputFile().
|
||||
// (3) When recovering from transaction logs, it creates a table builder to
|
||||
// write to a level-0 output file (In DBImpl::WriteLevel0TableForRecovery,
|
||||
// by calling BuildTable())
|
||||
// (4) When running Repairer, it creates a table builder to convert logs to
|
||||
// SST files (In Repairer::ConvertLogToTable() by calling BuildTable())
|
||||
//
|
||||
// options is the general options. Multiple configured can be acceseed from
|
||||
// there, including and not limited to compression options.
|
||||
// file is a handle of a writable file. It is the caller's responsibility to
|
||||
// keep the file open and close the file after closing the table builder.
|
||||
// compression_type is the compression type to use in this table.
|
||||
virtual TableBuilder* NewTableBuilder(
|
||||
const Options& options, const InternalKeyComparator& internal_comparator,
|
||||
WritableFile* file, CompressionType compression_type) const = 0;
|
||||
};
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
// Create a special table factory that can open both of block based table format
|
||||
// and plain table, based on setting inside the SST files. It should be used to
|
||||
// convert a DB from one table format to another.
|
||||
// @table_factory_to_write: the table factory used when writing to new files.
|
||||
// @block_based_table_factory: block based table factory to use. If NULL, use
|
||||
// a default one.
|
||||
// @plain_table_factory: plain table factory to use. If NULL, use a default one.
|
||||
extern TableFactory* NewAdaptiveTableFactory(
|
||||
std::shared_ptr<TableFactory> table_factory_to_write = nullptr,
|
||||
std::shared_ptr<TableFactory> block_based_table_factory = nullptr,
|
||||
std::shared_ptr<TableFactory> plain_table_factory = nullptr);
|
||||
|
||||
#endif // ROCKSDB_LITE
|
||||
|
||||
} // namespace rocksdb
|
||||
127
include/rocksdb/table_properties.h
Normal file
127
include/rocksdb/table_properties.h
Normal file
@@ -0,0 +1,127 @@
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include "rocksdb/status.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// -- Table Properties
|
||||
// Other than basic table properties, each table may also have the user
|
||||
// collected properties.
|
||||
// The value of the user-collected properties are encoded as raw bytes --
|
||||
// users have to interprete these values by themselves.
|
||||
// Note: To do prefix seek/scan in `UserCollectedProperties`, you can do
|
||||
// something similar to:
|
||||
//
|
||||
// UserCollectedProperties props = ...;
|
||||
// for (auto pos = props.lower_bound(prefix);
|
||||
// pos != props.end() && pos->first.compare(0, prefix.size(), prefix) == 0;
|
||||
// ++pos) {
|
||||
// ...
|
||||
// }
|
||||
typedef std::map<const std::string, std::string> UserCollectedProperties;
|
||||
|
||||
// TableProperties contains a bunch of read-only properties of its associated
|
||||
// table.
|
||||
struct TableProperties {
|
||||
public:
|
||||
// the total size of all data blocks.
|
||||
uint64_t data_size = 0;
|
||||
// the size of index block.
|
||||
uint64_t index_size = 0;
|
||||
// the size of filter block.
|
||||
uint64_t filter_size = 0;
|
||||
// total raw key size
|
||||
uint64_t raw_key_size = 0;
|
||||
// total raw value size
|
||||
uint64_t raw_value_size = 0;
|
||||
// the number of blocks in this table
|
||||
uint64_t num_data_blocks = 0;
|
||||
// the number of entries in this table
|
||||
uint64_t num_entries = 0;
|
||||
// format version, reserved for backward compatibility
|
||||
uint64_t format_version = 0;
|
||||
// If 0, key is variable length. Otherwise number of bytes for each key.
|
||||
uint64_t fixed_key_len = 0;
|
||||
|
||||
// The name of the filter policy used in this table.
|
||||
// If no filter policy is used, `filter_policy_name` will be an empty string.
|
||||
std::string filter_policy_name;
|
||||
|
||||
// user collected properties
|
||||
UserCollectedProperties user_collected_properties;
|
||||
|
||||
// convert this object to a human readable form
|
||||
// @prop_delim: delimiter for each property.
|
||||
std::string ToString(const std::string& prop_delim = "; ",
|
||||
const std::string& kv_delim = "=") const;
|
||||
};
|
||||
|
||||
// table properties' human-readable names in the property block.
|
||||
struct TablePropertiesNames {
|
||||
static const std::string kDataSize;
|
||||
static const std::string kIndexSize;
|
||||
static const std::string kFilterSize;
|
||||
static const std::string kRawKeySize;
|
||||
static const std::string kRawValueSize;
|
||||
static const std::string kNumDataBlocks;
|
||||
static const std::string kNumEntries;
|
||||
static const std::string kFormatVersion;
|
||||
static const std::string kFixedKeyLen;
|
||||
static const std::string kFilterPolicy;
|
||||
};
|
||||
|
||||
extern const std::string kPropertiesBlock;
|
||||
|
||||
// `TablePropertiesCollector` provides the mechanism for users to collect
|
||||
// their own interested properties. This class is essentially a collection
|
||||
// of callback functions that will be invoked during table building.
|
||||
// It is construced with TablePropertiesCollectorFactory. The methods don't
|
||||
// need to be thread-safe, as we will create exactly one
|
||||
// TablePropertiesCollector object per table and then call it sequentially
|
||||
class TablePropertiesCollector {
|
||||
public:
|
||||
virtual ~TablePropertiesCollector() {}
|
||||
|
||||
// Add() will be called when a new key/value pair is inserted into the table.
|
||||
// @params key the original key that is inserted into the table.
|
||||
// @params value the original value that is inserted into the table.
|
||||
virtual Status Add(const Slice& key, const Slice& value) = 0;
|
||||
|
||||
// Finish() will be called when a table has already been built and is ready
|
||||
// for writing the properties block.
|
||||
// @params properties User will add their collected statistics to
|
||||
// `properties`.
|
||||
virtual Status Finish(UserCollectedProperties* properties) = 0;
|
||||
|
||||
// Return the human-readable properties, where the key is property name and
|
||||
// the value is the human-readable form of value.
|
||||
virtual UserCollectedProperties GetReadableProperties() const = 0;
|
||||
|
||||
// The name of the properties collector can be used for debugging purpose.
|
||||
virtual const char* Name() const = 0;
|
||||
};
|
||||
|
||||
// Constructs TablePropertiesCollector. Internals create a new
|
||||
// TablePropertiesCollector for each new table
|
||||
class TablePropertiesCollectorFactory {
|
||||
public:
|
||||
virtual ~TablePropertiesCollectorFactory() {}
|
||||
// has to be thread-safe
|
||||
virtual TablePropertiesCollector* CreateTablePropertiesCollector() = 0;
|
||||
|
||||
// The name of the properties collector can be used for debugging purpose.
|
||||
virtual const char* Name() const = 0;
|
||||
};
|
||||
|
||||
// Extra properties
|
||||
// Below is a list of non-basic properties that are collected by database
|
||||
// itself. Especially some properties regarding to the internal keys (which
|
||||
// is unknown to `table`).
|
||||
extern uint64_t GetDeletedKeys(const UserCollectedProperties& props);
|
||||
|
||||
} // namespace rocksdb
|
||||
104
include/rocksdb/transaction_log.h
Normal file
104
include/rocksdb/transaction_log.h
Normal file
@@ -0,0 +1,104 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#ifndef STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_
|
||||
#define STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_
|
||||
|
||||
#include "rocksdb/status.h"
|
||||
#include "rocksdb/types.h"
|
||||
#include "rocksdb/write_batch.h"
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class LogFile;
|
||||
typedef std::vector<std::unique_ptr<LogFile>> VectorLogPtr;
|
||||
|
||||
enum WalFileType {
|
||||
/* Indicates that WAL file is in archive directory. WAL files are moved from
|
||||
* the main db directory to archive directory once they are not live and stay
|
||||
* there until cleaned up. Files are cleaned depending on archive size
|
||||
* (Options::WAL_size_limit_MB) and time since last cleaning
|
||||
* (Options::WAL_ttl_seconds).
|
||||
*/
|
||||
kArchivedLogFile = 0,
|
||||
|
||||
/* Indicates that WAL file is live and resides in the main db directory */
|
||||
kAliveLogFile = 1
|
||||
} ;
|
||||
|
||||
class LogFile {
|
||||
public:
|
||||
LogFile() {}
|
||||
virtual ~LogFile() {}
|
||||
|
||||
// Returns log file's pathname relative to the main db dir
|
||||
// Eg. For a live-log-file = /000003.log
|
||||
// For an archived-log-file = /archive/000003.log
|
||||
virtual std::string PathName() const = 0;
|
||||
|
||||
|
||||
// Primary identifier for log file.
|
||||
// This is directly proportional to creation time of the log file
|
||||
virtual uint64_t LogNumber() const = 0;
|
||||
|
||||
// Log file can be either alive or archived
|
||||
virtual WalFileType Type() const = 0;
|
||||
|
||||
// Starting sequence number of writebatch written in this log file
|
||||
virtual SequenceNumber StartSequence() const = 0;
|
||||
|
||||
// Size of log file on disk in Bytes
|
||||
virtual uint64_t SizeFileBytes() const = 0;
|
||||
};
|
||||
|
||||
struct BatchResult {
|
||||
SequenceNumber sequence = 0;
|
||||
std::unique_ptr<WriteBatch> writeBatchPtr;
|
||||
};
|
||||
|
||||
// A TransactionLogIterator is used to iterate over the transactions in a db.
|
||||
// One run of the iterator is continuous, i.e. the iterator will stop at the
|
||||
// beginning of any gap in sequences
|
||||
class TransactionLogIterator {
|
||||
public:
|
||||
TransactionLogIterator() {}
|
||||
virtual ~TransactionLogIterator() {}
|
||||
|
||||
// An iterator is either positioned at a WriteBatch or not valid.
|
||||
// This method returns true if the iterator is valid.
|
||||
// Can read data from a valid iterator.
|
||||
virtual bool Valid() = 0;
|
||||
|
||||
// Moves the iterator to the next WriteBatch.
|
||||
// REQUIRES: Valid() to be true.
|
||||
virtual void Next() = 0;
|
||||
|
||||
// Returns ok if the iterator is valid.
|
||||
// Returns the Error when something has gone wrong.
|
||||
virtual Status status() = 0;
|
||||
|
||||
// If valid return's the current write_batch and the sequence number of the
|
||||
// earliest transaction contained in the batch.
|
||||
// ONLY use if Valid() is true and status() is OK.
|
||||
virtual BatchResult GetBatch() = 0;
|
||||
|
||||
// The read options for TransactionLogIterator.
|
||||
struct ReadOptions {
|
||||
// If true, all data read from underlying storage will be
|
||||
// verified against corresponding checksums.
|
||||
// Default: true
|
||||
bool verify_checksums_;
|
||||
|
||||
ReadOptions() : verify_checksums_(true) {}
|
||||
|
||||
explicit ReadOptions(bool verify_checksums)
|
||||
: verify_checksums_(verify_checksums) {}
|
||||
};
|
||||
};
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_
|
||||
20
include/rocksdb/types.h
Normal file
20
include/rocksdb/types.h
Normal file
@@ -0,0 +1,20 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#ifndef STORAGE_ROCKSDB_INCLUDE_TYPES_H_
|
||||
#define STORAGE_ROCKSDB_INCLUDE_TYPES_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// Define all public custom types here.
|
||||
|
||||
// Represents a sequence number in a WAL file.
|
||||
typedef uint64_t SequenceNumber;
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // STORAGE_ROCKSDB_INCLUDE_TYPES_H_
|
||||
84
include/rocksdb/universal_compaction.h
Normal file
84
include/rocksdb/universal_compaction.h
Normal file
@@ -0,0 +1,84 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#ifndef STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H
|
||||
#define STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <climits>
|
||||
#include <vector>
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
//
|
||||
// Algorithm used to make a compaction request stop picking new files
|
||||
// into a single compaction run
|
||||
//
|
||||
enum CompactionStopStyle {
|
||||
kCompactionStopStyleSimilarSize, // pick files of similar size
|
||||
kCompactionStopStyleTotalSize // total size of picked files > next file
|
||||
};
|
||||
|
||||
class CompactionOptionsUniversal {
|
||||
public:
|
||||
|
||||
// Percentage flexibilty while comparing file size. If the candidate file(s)
|
||||
// size is 1% smaller than the next file's size, then include next file into
|
||||
// this candidate set. // Default: 1
|
||||
unsigned int size_ratio;
|
||||
|
||||
// The minimum number of files in a single compaction run. Default: 2
|
||||
unsigned int min_merge_width;
|
||||
|
||||
// The maximum number of files in a single compaction run. Default: UINT_MAX
|
||||
unsigned int max_merge_width;
|
||||
|
||||
// The size amplification is defined as the amount (in percentage) of
|
||||
// additional storage needed to store a single byte of data in the database.
|
||||
// For example, a size amplification of 2% means that a database that
|
||||
// contains 100 bytes of user-data may occupy upto 102 bytes of
|
||||
// physical storage. By this definition, a fully compacted database has
|
||||
// a size amplification of 0%. Rocksdb uses the following heuristic
|
||||
// to calculate size amplification: it assumes that all files excluding
|
||||
// the earliest file contribute to the size amplification.
|
||||
// Default: 200, which means that a 100 byte database could require upto
|
||||
// 300 bytes of storage.
|
||||
unsigned int max_size_amplification_percent;
|
||||
|
||||
// If this option is set to be -1 (the default value), all the output files
|
||||
// will follow compression type specified.
|
||||
//
|
||||
// If this option is not negative, we will try to make sure compressed
|
||||
// size is just above this value. In normal cases, at least this percentage
|
||||
// of data will be compressed.
|
||||
// When we are compacting to a new file, here is the criteria whether
|
||||
// it needs to be compressed: assuming here are the list of files sorted
|
||||
// by generation time:
|
||||
// A1...An B1...Bm C1...Ct
|
||||
// where A1 is the newest and Ct is the oldest, and we are going to compact
|
||||
// B1...Bm, we calculate the total size of all the files as total_size, as
|
||||
// well as the total size of C1...Ct as total_C, the compaction output file
|
||||
// will be compressed iff
|
||||
// total_C / total_size < this percentage
|
||||
// Default: -1
|
||||
int compression_size_percent;
|
||||
|
||||
// The algorithm used to stop picking files into a single compaction run
|
||||
// Default: kCompactionStopStyleTotalSize
|
||||
CompactionStopStyle stop_style;
|
||||
|
||||
// Default set of parameters
|
||||
CompactionOptionsUniversal()
|
||||
: size_ratio(1),
|
||||
min_merge_width(2),
|
||||
max_merge_width(UINT_MAX),
|
||||
max_size_amplification_percent(200),
|
||||
compression_size_percent(-1),
|
||||
stop_style(kCompactionStopStyleTotalSize) {}
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H
|
||||
252
include/rocksdb/utilities/backupable_db.h
Normal file
252
include/rocksdb/utilities/backupable_db.h
Normal file
@@ -0,0 +1,252 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#pragma once
|
||||
#ifndef ROCKSDB_LITE
|
||||
|
||||
#define __STDC_FORMAT_MACROS
|
||||
#include <inttypes.h>
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <vector>
|
||||
|
||||
#include "rocksdb/utilities/stackable_db.h"
|
||||
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/status.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
struct BackupableDBOptions {
|
||||
// Where to keep the backup files. Has to be different than dbname_
|
||||
// Best to set this to dbname_ + "/backups"
|
||||
// Required
|
||||
std::string backup_dir;
|
||||
|
||||
// Backup Env object. It will be used for backup file I/O. If it's
|
||||
// nullptr, backups will be written out using DBs Env. If it's
|
||||
// non-nullptr, backup's I/O will be performed using this object.
|
||||
// If you want to have backups on HDFS, use HDFS Env here!
|
||||
// Default: nullptr
|
||||
Env* backup_env;
|
||||
|
||||
// If share_table_files == true, backup will assume that table files with
|
||||
// same name have the same contents. This enables incremental backups and
|
||||
// avoids unnecessary data copies.
|
||||
// If share_table_files == false, each backup will be on its own and will
|
||||
// not share any data with other backups.
|
||||
// default: true
|
||||
bool share_table_files;
|
||||
|
||||
// Backup info and error messages will be written to info_log
|
||||
// if non-nullptr.
|
||||
// Default: nullptr
|
||||
Logger* info_log;
|
||||
|
||||
// If sync == true, we can guarantee you'll get consistent backup even
|
||||
// on a machine crash/reboot. Backup process is slower with sync enabled.
|
||||
// If sync == false, we don't guarantee anything on machine reboot. However,
|
||||
// chances are some of the backups are consistent.
|
||||
// Default: true
|
||||
bool sync;
|
||||
|
||||
// If true, it will delete whatever backups there are already
|
||||
// Default: false
|
||||
bool destroy_old_data;
|
||||
|
||||
// If false, we won't backup log files. This option can be useful for backing
|
||||
// up in-memory databases where log file are persisted, but table files are in
|
||||
// memory.
|
||||
// Default: true
|
||||
bool backup_log_files;
|
||||
|
||||
// Max bytes that can be transferred in a second during backup.
|
||||
// If 0, go as fast as you can
|
||||
// Default: 0
|
||||
uint64_t backup_rate_limit;
|
||||
|
||||
// Max bytes that can be transferred in a second during restore.
|
||||
// If 0, go as fast as you can
|
||||
// Default: 0
|
||||
uint64_t restore_rate_limit;
|
||||
|
||||
// Only used if share_table_files is set to true. If true, will consider that
|
||||
// backups can come from different databases, hence a sst is not uniquely
|
||||
// identifed by its name, but by the triple (file name, crc32, file length)
|
||||
// Default: false
|
||||
// Note: this is an experimental option, and you'll need to set it manually
|
||||
// *turn it on only if you know what you're doing*
|
||||
bool share_files_with_checksum;
|
||||
|
||||
void Dump(Logger* logger) const;
|
||||
|
||||
explicit BackupableDBOptions(const std::string& _backup_dir,
|
||||
Env* _backup_env = nullptr,
|
||||
bool _share_table_files = true,
|
||||
Logger* _info_log = nullptr, bool _sync = true,
|
||||
bool _destroy_old_data = false,
|
||||
bool _backup_log_files = true,
|
||||
uint64_t _backup_rate_limit = 0,
|
||||
uint64_t _restore_rate_limit = 0)
|
||||
: backup_dir(_backup_dir),
|
||||
backup_env(_backup_env),
|
||||
share_table_files(_share_table_files),
|
||||
info_log(_info_log),
|
||||
sync(_sync),
|
||||
destroy_old_data(_destroy_old_data),
|
||||
backup_log_files(_backup_log_files),
|
||||
backup_rate_limit(_backup_rate_limit),
|
||||
restore_rate_limit(_restore_rate_limit),
|
||||
share_files_with_checksum(false) {
|
||||
assert(share_table_files || !share_files_with_checksum);
|
||||
}
|
||||
};
|
||||
|
||||
struct RestoreOptions {
|
||||
// If true, restore won't overwrite the existing log files in wal_dir. It will
|
||||
// also move all log files from archive directory to wal_dir. Use this option
|
||||
// in combination with BackupableDBOptions::backup_log_files = false for
|
||||
// persisting in-memory databases.
|
||||
// Default: false
|
||||
bool keep_log_files;
|
||||
|
||||
explicit RestoreOptions(bool _keep_log_files = false)
|
||||
: keep_log_files(_keep_log_files) {}
|
||||
};
|
||||
|
||||
typedef uint32_t BackupID;
|
||||
|
||||
struct BackupInfo {
|
||||
BackupID backup_id;
|
||||
int64_t timestamp;
|
||||
uint64_t size;
|
||||
|
||||
BackupInfo() {}
|
||||
BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size)
|
||||
: backup_id(_backup_id), timestamp(_timestamp), size(_size) {}
|
||||
};
|
||||
|
||||
class BackupEngineReadOnly {
|
||||
public:
|
||||
virtual ~BackupEngineReadOnly() {}
|
||||
|
||||
static BackupEngineReadOnly* NewReadOnlyBackupEngine(
|
||||
Env* db_env, const BackupableDBOptions& options);
|
||||
|
||||
// You can GetBackupInfo safely, even with other BackupEngine performing
|
||||
// backups on the same directory
|
||||
virtual void GetBackupInfo(std::vector<BackupInfo>* backup_info) = 0;
|
||||
|
||||
// Restoring DB from backup is NOT safe when there is another BackupEngine
|
||||
// running that might call DeleteBackup() or PurgeOldBackups(). It is caller's
|
||||
// responsibility to synchronize the operation, i.e. don't delete the backup
|
||||
// when you're restoring from it
|
||||
virtual Status RestoreDBFromBackup(
|
||||
BackupID backup_id, const std::string& db_dir, const std::string& wal_dir,
|
||||
const RestoreOptions& restore_options = RestoreOptions()) = 0;
|
||||
virtual Status RestoreDBFromLatestBackup(
|
||||
const std::string& db_dir, const std::string& wal_dir,
|
||||
const RestoreOptions& restore_options = RestoreOptions()) = 0;
|
||||
};
|
||||
|
||||
// Please see the documentation in BackupableDB and RestoreBackupableDB
|
||||
class BackupEngine {
|
||||
public:
|
||||
virtual ~BackupEngine() {}
|
||||
|
||||
static BackupEngine* NewBackupEngine(Env* db_env,
|
||||
const BackupableDBOptions& options);
|
||||
|
||||
virtual Status CreateNewBackup(DB* db, bool flush_before_backup = false) = 0;
|
||||
virtual Status PurgeOldBackups(uint32_t num_backups_to_keep) = 0;
|
||||
virtual Status DeleteBackup(BackupID backup_id) = 0;
|
||||
virtual void StopBackup() = 0;
|
||||
|
||||
virtual void GetBackupInfo(std::vector<BackupInfo>* backup_info) = 0;
|
||||
virtual Status RestoreDBFromBackup(
|
||||
BackupID backup_id, const std::string& db_dir, const std::string& wal_dir,
|
||||
const RestoreOptions& restore_options = RestoreOptions()) = 0;
|
||||
virtual Status RestoreDBFromLatestBackup(
|
||||
const std::string& db_dir, const std::string& wal_dir,
|
||||
const RestoreOptions& restore_options = RestoreOptions()) = 0;
|
||||
};
|
||||
|
||||
// Stack your DB with BackupableDB to be able to backup the DB
|
||||
class BackupableDB : public StackableDB {
|
||||
public:
|
||||
// BackupableDBOptions have to be the same as the ones used in a previous
|
||||
// incarnation of the DB
|
||||
//
|
||||
// BackupableDB ownes the pointer `DB* db` now. You should not delete it or
|
||||
// use it after the invocation of BackupableDB
|
||||
BackupableDB(DB* db, const BackupableDBOptions& options);
|
||||
virtual ~BackupableDB();
|
||||
|
||||
// Captures the state of the database in the latest backup
|
||||
// NOT a thread safe call
|
||||
Status CreateNewBackup(bool flush_before_backup = false);
|
||||
// Returns info about backups in backup_info
|
||||
void GetBackupInfo(std::vector<BackupInfo>* backup_info);
|
||||
// deletes old backups, keeping latest num_backups_to_keep alive
|
||||
Status PurgeOldBackups(uint32_t num_backups_to_keep);
|
||||
// deletes a specific backup
|
||||
Status DeleteBackup(BackupID backup_id);
|
||||
// Call this from another thread if you want to stop the backup
|
||||
// that is currently happening. It will return immediatelly, will
|
||||
// not wait for the backup to stop.
|
||||
// The backup will stop ASAP and the call to CreateNewBackup will
|
||||
// return Status::Incomplete(). It will not clean up after itself, but
|
||||
// the state will remain consistent. The state will be cleaned up
|
||||
// next time you create BackupableDB or RestoreBackupableDB.
|
||||
void StopBackup();
|
||||
|
||||
private:
|
||||
BackupEngine* backup_engine_;
|
||||
};
|
||||
|
||||
// Use this class to access information about backups and restore from them
|
||||
class RestoreBackupableDB {
|
||||
public:
|
||||
RestoreBackupableDB(Env* db_env, const BackupableDBOptions& options);
|
||||
~RestoreBackupableDB();
|
||||
|
||||
// Returns info about backups in backup_info
|
||||
void GetBackupInfo(std::vector<BackupInfo>* backup_info);
|
||||
|
||||
// restore from backup with backup_id
|
||||
// IMPORTANT -- if options_.share_table_files == true and you restore DB
|
||||
// from some backup that is not the latest, and you start creating new
|
||||
// backups from the new DB, they will probably fail
|
||||
//
|
||||
// Example: Let's say you have backups 1, 2, 3, 4, 5 and you restore 3.
|
||||
// If you add new data to the DB and try creating a new backup now, the
|
||||
// database will diverge from backups 4 and 5 and the new backup will fail.
|
||||
// If you want to create new backup, you will first have to delete backups 4
|
||||
// and 5.
|
||||
Status RestoreDBFromBackup(BackupID backup_id, const std::string& db_dir,
|
||||
const std::string& wal_dir,
|
||||
const RestoreOptions& restore_options =
|
||||
RestoreOptions());
|
||||
|
||||
// restore from the latest backup
|
||||
Status RestoreDBFromLatestBackup(const std::string& db_dir,
|
||||
const std::string& wal_dir,
|
||||
const RestoreOptions& restore_options =
|
||||
RestoreOptions());
|
||||
// deletes old backups, keeping latest num_backups_to_keep alive
|
||||
Status PurgeOldBackups(uint32_t num_backups_to_keep);
|
||||
// deletes a specific backup
|
||||
Status DeleteBackup(BackupID backup_id);
|
||||
|
||||
private:
|
||||
BackupEngine* backup_engine_;
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
||||
#endif // ROCKSDB_LITE
|
||||
68
include/rocksdb/utilities/db_ttl.h
Normal file
68
include/rocksdb/utilities/db_ttl.h
Normal file
@@ -0,0 +1,68 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#pragma once
|
||||
#ifndef ROCKSDB_LITE
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "rocksdb/utilities/stackable_db.h"
|
||||
#include "rocksdb/db.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// Database with TTL support.
|
||||
//
|
||||
// USE-CASES:
|
||||
// This API should be used to open the db when key-values inserted are
|
||||
// meant to be removed from the db in a non-strict 'ttl' amount of time
|
||||
// Therefore, this guarantees that key-values inserted will remain in the
|
||||
// db for >= ttl amount of time and the db will make efforts to remove the
|
||||
// key-values as soon as possible after ttl seconds of their insertion.
|
||||
//
|
||||
// BEHAVIOUR:
|
||||
// TTL is accepted in seconds
|
||||
// (int32_t)Timestamp(creation) is suffixed to values in Put internally
|
||||
// Expired TTL values deleted in compaction only:(Timestamp+ttl<time_now)
|
||||
// Get/Iterator may return expired entries(compaction not run on them yet)
|
||||
// Different TTL may be used during different Opens
|
||||
// Example: Open1 at t=0 with ttl=4 and insert k1,k2, close at t=2
|
||||
// Open2 at t=3 with ttl=5. Now k1,k2 should be deleted at t>=5
|
||||
// read_only=true opens in the usual read-only mode. Compactions will not be
|
||||
// triggered(neither manual nor automatic), so no expired entries removed
|
||||
//
|
||||
// CONSTRAINTS:
|
||||
// Not specifying/passing or non-positive TTL behaves like TTL = infinity
|
||||
//
|
||||
// !!!WARNING!!!:
|
||||
// Calling DB::Open directly to re-open a db created by this API will get
|
||||
// corrupt values(timestamp suffixed) and no ttl effect will be there
|
||||
// during the second Open, so use this API consistently to open the db
|
||||
// Be careful when passing ttl with a small positive value because the
|
||||
// whole database may be deleted in a small amount of time
|
||||
|
||||
class DBWithTTL : public StackableDB {
|
||||
public:
|
||||
virtual Status CreateColumnFamilyWithTtl(
|
||||
const ColumnFamilyOptions& options, const std::string& column_family_name,
|
||||
ColumnFamilyHandle** handle, int ttl) = 0;
|
||||
|
||||
static Status Open(const Options& options, const std::string& dbname,
|
||||
DBWithTTL** dbptr, int32_t ttl = 0,
|
||||
bool read_only = false);
|
||||
|
||||
static Status Open(const DBOptions& db_options, const std::string& dbname,
|
||||
const std::vector<ColumnFamilyDescriptor>& column_families,
|
||||
std::vector<ColumnFamilyHandle*>* handles,
|
||||
DBWithTTL** dbptr, std::vector<int32_t> ttls,
|
||||
bool read_only = false);
|
||||
|
||||
protected:
|
||||
explicit DBWithTTL(DB* db) : StackableDB(db) {}
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
||||
#endif // ROCKSDB_LITE
|
||||
149
include/rocksdb/utilities/document_db.h
Normal file
149
include/rocksdb/utilities/document_db.h
Normal file
@@ -0,0 +1,149 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#pragma once
|
||||
#ifndef ROCKSDB_LITE
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "rocksdb/utilities/stackable_db.h"
|
||||
#include "rocksdb/utilities/json_document.h"
|
||||
#include "rocksdb/db.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// IMPORTANT: DocumentDB is a work in progress. It is unstable and we might
|
||||
// change the API without warning. Talk to RocksDB team before using this in
|
||||
// production ;)
|
||||
|
||||
// DocumentDB is a layer on top of RocksDB that provides a very simple JSON API.
|
||||
// When creating a DB, you specify a list of indexes you want to keep on your
|
||||
// data. You can insert a JSON document to the DB, which is automatically
|
||||
// indexed. Every document added to the DB needs to have "_id" field which is
|
||||
// automatically indexed and is an unique primary key. All other indexes are
|
||||
// non-unique.
|
||||
|
||||
// NOTE: field names in the JSON are NOT allowed to start with '$' or
|
||||
// contain '.'. We don't currently enforce that rule, but will start behaving
|
||||
// badly.
|
||||
|
||||
// Cursor is what you get as a result of executing query. To get all
|
||||
// results from a query, call Next() on a Cursor while Valid() returns true
|
||||
class Cursor {
|
||||
public:
|
||||
Cursor() = default;
|
||||
virtual ~Cursor() {}
|
||||
|
||||
virtual bool Valid() const = 0;
|
||||
virtual void Next() = 0;
|
||||
// Lifecycle of the returned JSONDocument is until the next Next() call
|
||||
virtual const JSONDocument& document() const = 0;
|
||||
virtual Status status() const = 0;
|
||||
|
||||
private:
|
||||
// No copying allowed
|
||||
Cursor(const Cursor&);
|
||||
void operator=(const Cursor&);
|
||||
};
|
||||
|
||||
struct DocumentDBOptions {
|
||||
int background_threads = 4;
|
||||
uint64_t memtable_size = 128 * 1024 * 1024; // 128 MB
|
||||
uint64_t cache_size = 1 * 1024 * 1024 * 1024; // 1 GB
|
||||
};
|
||||
|
||||
// TODO(icanadi) Add `JSONDocument* info` parameter to all calls that can be
|
||||
// used by the caller to get more information about the call execution (number
|
||||
// of dropped records, number of updated records, etc.)
|
||||
class DocumentDB : public StackableDB {
|
||||
public:
|
||||
struct IndexDescriptor {
|
||||
// Currently, you can only define an index on a single field. To specify an
|
||||
// index on a field X, set index description to JSON "{X: 1}"
|
||||
// Currently the value needs to be 1, which means ascending.
|
||||
// In the future, we plan to also support indexes on multiple keys, where
|
||||
// you could mix ascending sorting (1) with descending sorting indexes (-1)
|
||||
JSONDocument* description;
|
||||
std::string name;
|
||||
};
|
||||
|
||||
// Open DocumentDB with specified indexes. The list of indexes has to be
|
||||
// complete, i.e. include all indexes present in the DB, except the primary
|
||||
// key index.
|
||||
// Otherwise, Open() will return an error
|
||||
static Status Open(const DocumentDBOptions& options, const std::string& name,
|
||||
const std::vector<IndexDescriptor>& indexes,
|
||||
DocumentDB** db, bool read_only = false);
|
||||
|
||||
explicit DocumentDB(DB* db) : StackableDB(db) {}
|
||||
|
||||
// Create a new index. It will stop all writes for the duration of the call.
|
||||
// All current documents in the DB are scanned and corresponding index entries
|
||||
// are created
|
||||
virtual Status CreateIndex(const WriteOptions& write_options,
|
||||
const IndexDescriptor& index) = 0;
|
||||
|
||||
// Drop an index. Client is responsible to make sure that index is not being
|
||||
// used by currently executing queries
|
||||
virtual Status DropIndex(const std::string& name) = 0;
|
||||
|
||||
// Insert a document to the DB. The document needs to have a primary key "_id"
|
||||
// which can either be a string or an integer. Otherwise the write will fail
|
||||
// with InvalidArgument.
|
||||
virtual Status Insert(const WriteOptions& options,
|
||||
const JSONDocument& document) = 0;
|
||||
|
||||
// Deletes all documents matching a filter atomically
|
||||
virtual Status Remove(const ReadOptions& read_options,
|
||||
const WriteOptions& write_options,
|
||||
const JSONDocument& query) = 0;
|
||||
|
||||
// Does this sequence of operations:
|
||||
// 1. Find all documents matching a filter
|
||||
// 2. For all documents, atomically:
|
||||
// 2.1. apply the update operators
|
||||
// 2.2. update the secondary indexes
|
||||
//
|
||||
// Currently only $set update operator is supported.
|
||||
// Syntax is: {$set: {key1: value1, key2: value2, etc...}}
|
||||
// This operator will change a document's key1 field to value1, key2 to
|
||||
// value2, etc. New values will be set even if a document didn't have an entry
|
||||
// for the specified key.
|
||||
//
|
||||
// You can not change a primary key of a document.
|
||||
//
|
||||
// Update example: Update({id: {$gt: 5}, $index: id}, {$set: {enabled: true}})
|
||||
virtual Status Update(const ReadOptions& read_options,
|
||||
const WriteOptions& write_options,
|
||||
const JSONDocument& filter,
|
||||
const JSONDocument& updates) = 0;
|
||||
|
||||
// query has to be an array in which every element is an operator. Currently
|
||||
// only $filter operator is supported. Syntax of $filter operator is:
|
||||
// {$filter: {key1: condition1, key2: condition2, etc.}} where conditions can
|
||||
// be either:
|
||||
// 1) a single value in which case the condition is equality condition, or
|
||||
// 2) a defined operators, like {$gt: 4}, which will match all documents that
|
||||
// have key greater than 4.
|
||||
//
|
||||
// Supported operators are:
|
||||
// 1) $gt -- greater than
|
||||
// 2) $gte -- greater than or equal
|
||||
// 3) $lt -- less than
|
||||
// 4) $lte -- less than or equal
|
||||
// If you want the filter to use an index, you need to specify it like this:
|
||||
// {$filter: {...(conditions)..., $index: index_name}}
|
||||
//
|
||||
// Example query:
|
||||
// * [{$filter: {name: John, age: {$gte: 18}, $index: age}}]
|
||||
// will return all Johns whose age is greater or equal to 18 and it will use
|
||||
// index "age" to satisfy the query.
|
||||
virtual Cursor* Query(const ReadOptions& read_options,
|
||||
const JSONDocument& query) = 0;
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
||||
#endif // ROCKSDB_LITE
|
||||
105
include/rocksdb/utilities/geo_db.h
Normal file
105
include/rocksdb/utilities/geo_db.h
Normal file
@@ -0,0 +1,105 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
#pragma once
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "rocksdb/utilities/stackable_db.h"
|
||||
#include "rocksdb/status.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
//
|
||||
// Configurable options needed for setting up a Geo database
|
||||
//
|
||||
struct GeoDBOptions {
|
||||
// Backup info and error messages will be written to info_log
|
||||
// if non-nullptr.
|
||||
// Default: nullptr
|
||||
Logger* info_log;
|
||||
|
||||
explicit GeoDBOptions(Logger* _info_log = nullptr):info_log(_info_log) { }
|
||||
};
|
||||
|
||||
//
|
||||
// A position in the earth's geoid
|
||||
//
|
||||
class GeoPosition {
|
||||
public:
|
||||
double latitude;
|
||||
double longitude;
|
||||
|
||||
explicit GeoPosition(double la = 0, double lo = 0) :
|
||||
latitude(la), longitude(lo) {
|
||||
}
|
||||
};
|
||||
|
||||
//
|
||||
// Description of an object on the Geoid. It is located by a GPS location,
|
||||
// and is identified by the id. The value associated with this object is
|
||||
// an opaque string 'value'. Different objects identified by unique id's
|
||||
// can have the same gps-location associated with them.
|
||||
//
|
||||
class GeoObject {
|
||||
public:
|
||||
GeoPosition position;
|
||||
std::string id;
|
||||
std::string value;
|
||||
|
||||
GeoObject() {}
|
||||
|
||||
GeoObject(const GeoPosition& pos, const std::string& i,
|
||||
const std::string& val) :
|
||||
position(pos), id(i), value(val) {
|
||||
}
|
||||
};
|
||||
|
||||
//
|
||||
// Stack your DB with GeoDB to be able to get geo-spatial support
|
||||
//
|
||||
class GeoDB : public StackableDB {
|
||||
public:
|
||||
// GeoDBOptions have to be the same as the ones used in a previous
|
||||
// incarnation of the DB
|
||||
//
|
||||
// GeoDB owns the pointer `DB* db` now. You should not delete it or
|
||||
// use it after the invocation of GeoDB
|
||||
// GeoDB(DB* db, const GeoDBOptions& options) : StackableDB(db) {}
|
||||
GeoDB(DB* db, const GeoDBOptions& options) : StackableDB(db) {}
|
||||
virtual ~GeoDB() {}
|
||||
|
||||
// Insert a new object into the location database. The object is
|
||||
// uniquely identified by the id. If an object with the same id already
|
||||
// exists in the db, then the old one is overwritten by the new
|
||||
// object being inserted here.
|
||||
virtual Status Insert(const GeoObject& object) = 0;
|
||||
|
||||
// Retrieve the value of the object located at the specified GPS
|
||||
// location and is identified by the 'id'.
|
||||
virtual Status GetByPosition(const GeoPosition& pos,
|
||||
const Slice& id, std::string* value) = 0;
|
||||
|
||||
// Retrieve the value of the object identified by the 'id'. This method
|
||||
// could be potentially slower than GetByPosition
|
||||
virtual Status GetById(const Slice& id, GeoObject* object) = 0;
|
||||
|
||||
// Delete the specified object
|
||||
virtual Status Remove(const Slice& id) = 0;
|
||||
|
||||
// Returns a list of all items within a circular radius from the
|
||||
// specified gps location. If 'number_of_values' is specified,
|
||||
// then this call returns at most that many number of objects.
|
||||
// The radius is specified in 'meters'.
|
||||
virtual Status SearchRadial(const GeoPosition& pos,
|
||||
double radius,
|
||||
std::vector<GeoObject>* values,
|
||||
int number_of_values = INT_MAX) = 0;
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
||||
#endif // ROCKSDB_LITE
|
||||
174
include/rocksdb/utilities/json_document.h
Normal file
174
include/rocksdb/utilities/json_document.h
Normal file
@@ -0,0 +1,174 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
#pragma once
|
||||
#ifndef ROCKSDB_LITE
|
||||
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include "rocksdb/slice.h"
|
||||
|
||||
// We use JSONDocument for DocumentDB API
|
||||
// Implementation inspired by folly::dynamic and rapidjson
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// NOTE: none of this is thread-safe
|
||||
class JSONDocument {
|
||||
public:
|
||||
// return nullptr on parse failure
|
||||
static JSONDocument* ParseJSON(const char* json);
|
||||
|
||||
enum Type {
|
||||
kNull,
|
||||
kArray,
|
||||
kBool,
|
||||
kDouble,
|
||||
kInt64,
|
||||
kObject,
|
||||
kString,
|
||||
};
|
||||
|
||||
JSONDocument(); // null
|
||||
/* implicit */ JSONDocument(bool b);
|
||||
/* implicit */ JSONDocument(double d);
|
||||
/* implicit */ JSONDocument(int64_t i);
|
||||
/* implicit */ JSONDocument(const std::string& s);
|
||||
/* implicit */ JSONDocument(const char* s);
|
||||
// constructs JSONDocument of specific type with default value
|
||||
explicit JSONDocument(Type type);
|
||||
|
||||
// copy constructor
|
||||
JSONDocument(const JSONDocument& json_document);
|
||||
|
||||
~JSONDocument();
|
||||
|
||||
Type type() const;
|
||||
|
||||
// REQUIRES: IsObject()
|
||||
bool Contains(const std::string& key) const;
|
||||
// Returns nullptr if !Contains()
|
||||
// don't delete the returned pointer
|
||||
// REQUIRES: IsObject()
|
||||
const JSONDocument* Get(const std::string& key) const;
|
||||
// REQUIRES: IsObject()
|
||||
JSONDocument& operator[](const std::string& key);
|
||||
// REQUIRES: IsObject()
|
||||
const JSONDocument& operator[](const std::string& key) const;
|
||||
// returns `this`, so you can chain operations.
|
||||
// Copies value
|
||||
// REQUIRES: IsObject()
|
||||
JSONDocument* Set(const std::string& key, const JSONDocument& value);
|
||||
|
||||
// REQUIRES: IsArray() == true || IsObject() == true
|
||||
size_t Count() const;
|
||||
|
||||
// REQUIRES: IsArray()
|
||||
const JSONDocument* GetFromArray(size_t i) const;
|
||||
// REQUIRES: IsArray()
|
||||
JSONDocument& operator[](size_t i);
|
||||
// REQUIRES: IsArray()
|
||||
const JSONDocument& operator[](size_t i) const;
|
||||
// returns `this`, so you can chain operations.
|
||||
// Copies the value
|
||||
// REQUIRES: IsArray() && i < Count()
|
||||
JSONDocument* SetInArray(size_t i, const JSONDocument& value);
|
||||
// REQUIRES: IsArray()
|
||||
JSONDocument* PushBack(const JSONDocument& value);
|
||||
|
||||
bool IsNull() const;
|
||||
bool IsArray() const;
|
||||
bool IsBool() const;
|
||||
bool IsDouble() const;
|
||||
bool IsInt64() const;
|
||||
bool IsObject() const;
|
||||
bool IsString() const;
|
||||
|
||||
// REQUIRES: IsBool() == true
|
||||
bool GetBool() const;
|
||||
// REQUIRES: IsDouble() == true
|
||||
double GetDouble() const;
|
||||
// REQUIRES: IsInt64() == true
|
||||
int64_t GetInt64() const;
|
||||
// REQUIRES: IsString() == true
|
||||
const std::string& GetString() const;
|
||||
|
||||
bool operator==(const JSONDocument& rhs) const;
|
||||
|
||||
std::string DebugString() const;
|
||||
|
||||
private:
|
||||
class ItemsIteratorGenerator;
|
||||
|
||||
public:
|
||||
// REQUIRES: IsObject()
|
||||
ItemsIteratorGenerator Items() const;
|
||||
|
||||
// appends serialized object to dst
|
||||
void Serialize(std::string* dst) const;
|
||||
// returns nullptr if Slice doesn't represent valid serialized JSONDocument
|
||||
static JSONDocument* Deserialize(const Slice& src);
|
||||
|
||||
private:
|
||||
void SerializeInternal(std::string* dst, bool type_prefix) const;
|
||||
// returns false if Slice doesn't represent valid serialized JSONDocument.
|
||||
// Otherwise, true
|
||||
bool DeserializeInternal(Slice* input);
|
||||
|
||||
typedef std::vector<JSONDocument*> Array;
|
||||
typedef std::unordered_map<std::string, JSONDocument*> Object;
|
||||
|
||||
// iteration on objects
|
||||
class const_item_iterator {
|
||||
public:
|
||||
typedef Object::const_iterator It;
|
||||
typedef Object::value_type value_type;
|
||||
/* implicit */ const_item_iterator(It it) : it_(it) {}
|
||||
It& operator++() { return ++it_; }
|
||||
bool operator!=(const const_item_iterator& other) {
|
||||
return it_ != other.it_;
|
||||
}
|
||||
value_type operator*() { return *it_; }
|
||||
|
||||
private:
|
||||
It it_;
|
||||
};
|
||||
class ItemsIteratorGenerator {
|
||||
public:
|
||||
/* implicit */ ItemsIteratorGenerator(const Object& object)
|
||||
: object_(object) {}
|
||||
const_item_iterator begin() { return object_.begin(); }
|
||||
const_item_iterator end() { return object_.end(); }
|
||||
|
||||
private:
|
||||
const Object& object_;
|
||||
};
|
||||
|
||||
union Data {
|
||||
Data() : n(nullptr) {}
|
||||
~Data() {}
|
||||
|
||||
void* n;
|
||||
Array a;
|
||||
bool b;
|
||||
double d;
|
||||
int64_t i;
|
||||
std::string s;
|
||||
Object o;
|
||||
} data_;
|
||||
const Type type_;
|
||||
|
||||
// Our serialization format's first byte specifies the encoding version. That
|
||||
// way, we can easily change our format while providing backwards
|
||||
// compatibility. This constant specifies the current version of the
|
||||
// serialization format
|
||||
static const char kSerializationFormatVersion;
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // ROCKSDB_LITE
|
||||
236
include/rocksdb/utilities/spatial_db.h
Normal file
236
include/rocksdb/utilities/spatial_db.h
Normal file
@@ -0,0 +1,236 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#pragma once
|
||||
#ifndef ROCKSDB_LITE
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/slice.h"
|
||||
#include "rocksdb/utilities/stackable_db.h"
|
||||
|
||||
namespace rocksdb {
|
||||
namespace spatial {
|
||||
|
||||
// NOTE: SpatialDB is experimental and we might change its API without warning.
|
||||
// Please talk to us before developing against SpatialDB API.
|
||||
//
|
||||
// SpatialDB is a support for spatial indexes built on top of RocksDB.
|
||||
// When creating a new SpatialDB, clients specifies a list of spatial indexes to
|
||||
// build on their data. Each spatial index is defined by the area and
|
||||
// granularity. If you're storing map data, different spatial index
|
||||
// granularities can be used for different zoom levels.
|
||||
//
|
||||
// Each element inserted into SpatialDB has:
|
||||
// * a bounding box, which determines how will the element be indexed
|
||||
// * string blob, which will usually be WKB representation of the polygon
|
||||
// (http://en.wikipedia.org/wiki/Well-known_text)
|
||||
// * feature set, which is a map of key-value pairs, where value can be null,
|
||||
// int, double, bool, string
|
||||
// * a list of indexes to insert the element in
|
||||
//
|
||||
// Each query is executed on a single spatial index. Query guarantees that it
|
||||
// will return all elements intersecting the specified bounding box, but it
|
||||
// might also return some extra non-intersecting elements.
|
||||
|
||||
// Variant is a class that can be many things: null, bool, int, double or string
|
||||
// It is used to store different value types in FeatureSet (see below)
|
||||
struct Variant {
|
||||
// Don't change the values here, they are persisted on disk
|
||||
enum Type {
|
||||
kNull = 0x0,
|
||||
kBool = 0x1,
|
||||
kInt = 0x2,
|
||||
kDouble = 0x3,
|
||||
kString = 0x4,
|
||||
};
|
||||
|
||||
Variant() : type_(kNull) {}
|
||||
/* implicit */ Variant(bool b) : type_(kBool) { data_.b = b; }
|
||||
/* implicit */ Variant(uint64_t i) : type_(kInt) { data_.i = i; }
|
||||
/* implicit */ Variant(double d) : type_(kDouble) { data_.d = d; }
|
||||
/* implicit */ Variant(const std::string& s) : type_(kString) {
|
||||
new (&data_.s) std::string(s);
|
||||
}
|
||||
|
||||
Variant(const Variant& v);
|
||||
|
||||
~Variant() {
|
||||
if (type_ == kString) {
|
||||
using std::string;
|
||||
(&data_.s)->~string();
|
||||
}
|
||||
}
|
||||
|
||||
Type type() const { return type_; }
|
||||
bool get_bool() const { return data_.b; }
|
||||
uint64_t get_int() const { return data_.i; }
|
||||
double get_double() const { return data_.d; }
|
||||
const std::string& get_string() const { return data_.s; }
|
||||
|
||||
bool operator==(const Variant& other);
|
||||
bool operator!=(const Variant& other);
|
||||
|
||||
private:
|
||||
Type type_;
|
||||
union Data {
|
||||
Data() {}
|
||||
~Data() {}
|
||||
bool b;
|
||||
uint64_t i;
|
||||
double d;
|
||||
std::string s;
|
||||
} data_;
|
||||
};
|
||||
|
||||
// FeatureSet is a map of key-value pairs. One feature set is associated with
|
||||
// each element in SpatialDB. It can be used to add rich data about the element.
|
||||
class FeatureSet {
|
||||
private:
|
||||
typedef std::unordered_map<std::string, Variant> map;
|
||||
|
||||
public:
|
||||
class iterator {
|
||||
public:
|
||||
/* implicit */ iterator(const map::const_iterator itr) : itr_(itr) {}
|
||||
iterator& operator++() {
|
||||
++itr_;
|
||||
return *this;
|
||||
}
|
||||
bool operator!=(const iterator& other) { return itr_ != other.itr_; }
|
||||
bool operator==(const iterator& other) { return itr_ == other.itr_; }
|
||||
map::value_type operator*() { return *itr_; }
|
||||
|
||||
private:
|
||||
map::const_iterator itr_;
|
||||
};
|
||||
FeatureSet() = default;
|
||||
|
||||
FeatureSet* Set(const std::string& key, const Variant& value);
|
||||
bool Contains(const std::string& key) const;
|
||||
// REQUIRES: Contains(key)
|
||||
const Variant& Get(const std::string& key) const;
|
||||
iterator Find(const std::string& key) const;
|
||||
|
||||
iterator begin() const { return map_.begin(); }
|
||||
iterator end() const { return map_.end(); }
|
||||
|
||||
void Clear();
|
||||
size_t Size() const { return map_.size(); }
|
||||
|
||||
void Serialize(std::string* output) const;
|
||||
// REQUIRED: empty FeatureSet
|
||||
bool Deserialize(const Slice& input);
|
||||
|
||||
std::string DebugString() const;
|
||||
|
||||
private:
|
||||
map map_;
|
||||
};
|
||||
|
||||
// BoundingBox is a helper structure for defining rectangles representing
|
||||
// bounding boxes of spatial elements.
|
||||
template <typename T>
|
||||
struct BoundingBox {
|
||||
T min_x, min_y, max_x, max_y;
|
||||
BoundingBox() = default;
|
||||
BoundingBox(T _min_x, T _min_y, T _max_x, T _max_y)
|
||||
: min_x(_min_x), min_y(_min_y), max_x(_max_x), max_y(_max_y) {}
|
||||
|
||||
bool Intersects(const BoundingBox<T>& a) const {
|
||||
return !(min_x > a.max_x || min_y > a.max_y || a.min_x > max_x ||
|
||||
a.min_y > max_y);
|
||||
}
|
||||
};
|
||||
|
||||
struct SpatialDBOptions {
|
||||
uint64_t cache_size = 1 * 1024 * 1024 * 1024LL; // 1GB
|
||||
int num_threads = 16;
|
||||
bool bulk_load = true;
|
||||
};
|
||||
|
||||
// Cursor is used to return data from the query to the client. To get all the
|
||||
// data from the query, just call Next() while Valid() is true
|
||||
class Cursor {
|
||||
public:
|
||||
Cursor() = default;
|
||||
virtual ~Cursor() {}
|
||||
|
||||
virtual bool Valid() const = 0;
|
||||
// REQUIRES: Valid()
|
||||
virtual void Next() = 0;
|
||||
|
||||
// Lifetime of the underlying storage until the next call to Next()
|
||||
// REQUIRES: Valid()
|
||||
virtual const Slice blob() = 0;
|
||||
// Lifetime of the underlying storage until the next call to Next()
|
||||
// REQUIRES: Valid()
|
||||
virtual const FeatureSet& feature_set() = 0;
|
||||
|
||||
virtual Status status() const = 0;
|
||||
|
||||
private:
|
||||
// No copying allowed
|
||||
Cursor(const Cursor&);
|
||||
void operator=(const Cursor&);
|
||||
};
|
||||
|
||||
// SpatialIndexOptions defines a spatial index that will be built on the data
|
||||
struct SpatialIndexOptions {
|
||||
// Spatial indexes are referenced by names
|
||||
std::string name;
|
||||
// An area that is indexed. If the element is not intersecting with spatial
|
||||
// index's bbox, it will not be inserted into the index
|
||||
BoundingBox<double> bbox;
|
||||
// tile_bits control the granularity of the spatial index. Each dimension of
|
||||
// the bbox will be split into (1 << tile_bits) tiles, so there will be a
|
||||
// total of (1 << tile_bits)^2 tiles. It is recommended to configure a size of
|
||||
// each tile to be approximately the size of the query on that spatial index
|
||||
uint32_t tile_bits;
|
||||
SpatialIndexOptions() {}
|
||||
SpatialIndexOptions(const std::string& _name,
|
||||
const BoundingBox<double>& _bbox, uint32_t _tile_bits)
|
||||
: name(_name), bbox(_bbox), tile_bits(_tile_bits) {}
|
||||
};
|
||||
|
||||
class SpatialDB : public StackableDB {
|
||||
public:
|
||||
// Creates the SpatialDB with specified list of indexes.
|
||||
// REQUIRED: db doesn't exist
|
||||
static Status Create(const SpatialDBOptions& options, const std::string& name,
|
||||
const std::vector<SpatialIndexOptions>& spatial_indexes);
|
||||
|
||||
// Open the existing SpatialDB. The resulting db object will be returned
|
||||
// through db parameter.
|
||||
// REQUIRED: db was created using SpatialDB::Create
|
||||
static Status Open(const SpatialDBOptions& options, const std::string& name,
|
||||
SpatialDB** db, bool read_only = false);
|
||||
|
||||
explicit SpatialDB(DB* db) : StackableDB(db) {}
|
||||
|
||||
// Insert the element into the DB. Element will be inserted into specified
|
||||
// spatial_indexes, based on specified bbox.
|
||||
// REQUIRES: spatial_indexes.size() > 0
|
||||
virtual Status Insert(const WriteOptions& write_options,
|
||||
const BoundingBox<double>& bbox, const Slice& blob,
|
||||
const FeatureSet& feature_set,
|
||||
const std::vector<std::string>& spatial_indexes) = 0;
|
||||
|
||||
// Calling Compact() after inserting a bunch of elements should speed up
|
||||
// reading. This is especially useful if you use SpatialDBOptions::bulk_load
|
||||
virtual Status Compact() = 0;
|
||||
|
||||
// Query the specified spatial_index. Query will return all elements that
|
||||
// intersect bbox, but it may also return some extra elements.
|
||||
virtual Cursor* Query(const ReadOptions& read_options,
|
||||
const BoundingBox<double>& bbox,
|
||||
const std::string& spatial_index) = 0;
|
||||
};
|
||||
|
||||
} // namespace spatial
|
||||
} // namespace rocksdb
|
||||
#endif // ROCKSDB_LITE
|
||||
226
include/rocksdb/utilities/stackable_db.h
Normal file
226
include/rocksdb/utilities/stackable_db.h
Normal file
@@ -0,0 +1,226 @@
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#pragma once
|
||||
#include "rocksdb/db.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// This class contains APIs to stack rocksdb wrappers.Eg. Stack TTL over base d
|
||||
class StackableDB : public DB {
|
||||
public:
|
||||
// StackableDB is the owner of db now!
|
||||
explicit StackableDB(DB* db) : db_(db) {}
|
||||
|
||||
~StackableDB() {
|
||||
delete db_;
|
||||
}
|
||||
|
||||
virtual DB* GetBaseDB() {
|
||||
return db_;
|
||||
}
|
||||
|
||||
virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
|
||||
const std::string& column_family_name,
|
||||
ColumnFamilyHandle** handle) {
|
||||
return db_->CreateColumnFamily(options, column_family_name, handle);
|
||||
}
|
||||
|
||||
virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) {
|
||||
return db_->DropColumnFamily(column_family);
|
||||
}
|
||||
|
||||
using DB::Put;
|
||||
virtual Status Put(const WriteOptions& options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
const Slice& val) override {
|
||||
return db_->Put(options, column_family, key, val);
|
||||
}
|
||||
|
||||
using DB::Get;
|
||||
virtual Status Get(const ReadOptions& options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
std::string* value) override {
|
||||
return db_->Get(options, column_family, key, value);
|
||||
}
|
||||
|
||||
using DB::MultiGet;
|
||||
virtual std::vector<Status> MultiGet(
|
||||
const ReadOptions& options,
|
||||
const std::vector<ColumnFamilyHandle*>& column_family,
|
||||
const std::vector<Slice>& keys,
|
||||
std::vector<std::string>* values) override {
|
||||
return db_->MultiGet(options, column_family, keys, values);
|
||||
}
|
||||
|
||||
using DB::KeyMayExist;
|
||||
virtual bool KeyMayExist(const ReadOptions& options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
std::string* value,
|
||||
bool* value_found = nullptr) override {
|
||||
return db_->KeyMayExist(options, column_family, key, value, value_found);
|
||||
}
|
||||
|
||||
using DB::Delete;
|
||||
virtual Status Delete(const WriteOptions& wopts,
|
||||
ColumnFamilyHandle* column_family,
|
||||
const Slice& key) override {
|
||||
return db_->Delete(wopts, column_family, key);
|
||||
}
|
||||
|
||||
using DB::Merge;
|
||||
virtual Status Merge(const WriteOptions& options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
const Slice& value) override {
|
||||
return db_->Merge(options, column_family, key, value);
|
||||
}
|
||||
|
||||
|
||||
virtual Status Write(const WriteOptions& opts, WriteBatch* updates)
|
||||
override {
|
||||
return db_->Write(opts, updates);
|
||||
}
|
||||
|
||||
using DB::NewIterator;
|
||||
virtual Iterator* NewIterator(const ReadOptions& opts,
|
||||
ColumnFamilyHandle* column_family) override {
|
||||
return db_->NewIterator(opts, column_family);
|
||||
}
|
||||
|
||||
virtual Status NewIterators(
|
||||
const ReadOptions& options,
|
||||
const std::vector<ColumnFamilyHandle*>& column_families,
|
||||
std::vector<Iterator*>* iterators) {
|
||||
return db_->NewIterators(options, column_families, iterators);
|
||||
}
|
||||
|
||||
|
||||
virtual const Snapshot* GetSnapshot() override {
|
||||
return db_->GetSnapshot();
|
||||
}
|
||||
|
||||
virtual void ReleaseSnapshot(const Snapshot* snapshot) override {
|
||||
return db_->ReleaseSnapshot(snapshot);
|
||||
}
|
||||
|
||||
using DB::GetProperty;
|
||||
virtual bool GetProperty(ColumnFamilyHandle* column_family,
|
||||
const Slice& property, std::string* value) override {
|
||||
return db_->GetProperty(column_family, property, value);
|
||||
}
|
||||
|
||||
using DB::GetIntProperty;
|
||||
virtual bool GetIntProperty(ColumnFamilyHandle* column_family,
|
||||
const Slice& property, uint64_t* value) override {
|
||||
return db_->GetIntProperty(column_family, property, value);
|
||||
}
|
||||
|
||||
using DB::GetApproximateSizes;
|
||||
virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
|
||||
const Range* r, int n,
|
||||
uint64_t* sizes) override {
|
||||
return db_->GetApproximateSizes(column_family, r, n, sizes);
|
||||
}
|
||||
|
||||
using DB::CompactRange;
|
||||
virtual Status CompactRange(ColumnFamilyHandle* column_family,
|
||||
const Slice* begin, const Slice* end,
|
||||
bool reduce_level = false, int target_level = -1,
|
||||
uint32_t target_path_id = 0) override {
|
||||
return db_->CompactRange(column_family, begin, end, reduce_level,
|
||||
target_level, target_path_id);
|
||||
}
|
||||
|
||||
using DB::NumberLevels;
|
||||
virtual int NumberLevels(ColumnFamilyHandle* column_family) override {
|
||||
return db_->NumberLevels(column_family);
|
||||
}
|
||||
|
||||
using DB::MaxMemCompactionLevel;
|
||||
virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family)
|
||||
override {
|
||||
return db_->MaxMemCompactionLevel(column_family);
|
||||
}
|
||||
|
||||
using DB::Level0StopWriteTrigger;
|
||||
virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family)
|
||||
override {
|
||||
return db_->Level0StopWriteTrigger(column_family);
|
||||
}
|
||||
|
||||
virtual const std::string& GetName() const override {
|
||||
return db_->GetName();
|
||||
}
|
||||
|
||||
virtual Env* GetEnv() const override {
|
||||
return db_->GetEnv();
|
||||
}
|
||||
|
||||
using DB::GetOptions;
|
||||
virtual const Options& GetOptions(ColumnFamilyHandle* column_family) const
|
||||
override {
|
||||
return db_->GetOptions(column_family);
|
||||
}
|
||||
|
||||
using DB::Flush;
|
||||
virtual Status Flush(const FlushOptions& fopts,
|
||||
ColumnFamilyHandle* column_family) override {
|
||||
return db_->Flush(fopts, column_family);
|
||||
}
|
||||
|
||||
virtual Status DisableFileDeletions() override {
|
||||
return db_->DisableFileDeletions();
|
||||
}
|
||||
|
||||
virtual Status EnableFileDeletions(bool force) override {
|
||||
return db_->EnableFileDeletions(force);
|
||||
}
|
||||
|
||||
virtual void GetLiveFilesMetaData(
|
||||
std::vector<LiveFileMetaData>* metadata) override {
|
||||
db_->GetLiveFilesMetaData(metadata);
|
||||
}
|
||||
|
||||
virtual Status GetLiveFiles(std::vector<std::string>& vec, uint64_t* mfs,
|
||||
bool flush_memtable = true) override {
|
||||
return db_->GetLiveFiles(vec, mfs, flush_memtable);
|
||||
}
|
||||
|
||||
virtual SequenceNumber GetLatestSequenceNumber() const override {
|
||||
return db_->GetLatestSequenceNumber();
|
||||
}
|
||||
|
||||
virtual Status GetSortedWalFiles(VectorLogPtr& files) override {
|
||||
return db_->GetSortedWalFiles(files);
|
||||
}
|
||||
|
||||
virtual Status DeleteFile(std::string name) override {
|
||||
return db_->DeleteFile(name);
|
||||
}
|
||||
|
||||
virtual Status GetDbIdentity(std::string& identity) {
|
||||
return db_->GetDbIdentity(identity);
|
||||
}
|
||||
|
||||
using DB::GetPropertiesOfAllTables;
|
||||
virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
|
||||
TablePropertiesCollection* props) {
|
||||
return db_->GetPropertiesOfAllTables(column_family, props);
|
||||
}
|
||||
|
||||
virtual Status GetUpdatesSince(
|
||||
SequenceNumber seq_number, unique_ptr<TransactionLogIterator>* iter,
|
||||
const TransactionLogIterator::ReadOptions& read_options) override {
|
||||
return db_->GetUpdatesSince(seq_number, iter, read_options);
|
||||
}
|
||||
|
||||
virtual ColumnFamilyHandle* DefaultColumnFamily() const override {
|
||||
return db_->DefaultColumnFamily();
|
||||
}
|
||||
|
||||
protected:
|
||||
DB* db_;
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
||||
30
include/rocksdb/utilities/utility_db.h
Normal file
30
include/rocksdb/utilities/utility_db.h
Normal file
@@ -0,0 +1,30 @@
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#pragma once
|
||||
#ifndef ROCKSDB_LITE
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#include "rocksdb/utilities/stackable_db.h"
|
||||
#include "rocksdb/utilities/db_ttl.h"
|
||||
#include "rocksdb/db.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// Please don't use this class. It's deprecated
|
||||
class UtilityDB {
|
||||
public:
|
||||
// This function is here only for backwards compatibility. Please use the
|
||||
// functions defined in DBWithTTl (rocksdb/utilities/db_ttl.h)
|
||||
// (deprecated)
|
||||
__attribute__((deprecated)) static Status OpenTtlDB(const Options& options,
|
||||
const std::string& name,
|
||||
StackableDB** dbptr,
|
||||
int32_t ttl = 0,
|
||||
bool read_only = false);
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
||||
#endif // ROCKSDB_LITE
|
||||
17
include/rocksdb/version.h
Normal file
17
include/rocksdb/version.h
Normal file
@@ -0,0 +1,17 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
#pragma once
|
||||
|
||||
// Library version numbers (major.minor.patch).
// Also update Makefile if you change these
#define ROCKSDB_MAJOR 3
#define ROCKSDB_MINOR 4
#define ROCKSDB_PATCH 0

// Do not use these. We made the mistake of declaring macros starting with
// double underscore. Now we have to live with our choice. We'll deprecate these
// at some point
// (Identifiers beginning with a double underscore are reserved for the
// implementation by the C/C++ standards; prefer the ROCKSDB_* forms above.)
#define __ROCKSDB_MAJOR__ ROCKSDB_MAJOR
#define __ROCKSDB_MINOR__ ROCKSDB_MINOR
#define __ROCKSDB_PATCH__ ROCKSDB_PATCH
|
||||
162
include/rocksdb/write_batch.h
Normal file
162
include/rocksdb/write_batch.h
Normal file
@@ -0,0 +1,162 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// WriteBatch holds a collection of updates to apply atomically to a DB.
|
||||
//
|
||||
// The updates are applied in the order in which they are added
|
||||
// to the WriteBatch. For example, the value of "key" will be "v3"
|
||||
// after the following batch is written:
|
||||
//
|
||||
// batch.Put("key", "v1");
|
||||
// batch.Delete("key");
|
||||
// batch.Put("key", "v2");
|
||||
// batch.Put("key", "v3");
|
||||
//
|
||||
// Multiple threads can invoke const methods on a WriteBatch without
|
||||
// external synchronization, but if any of the threads may call a
|
||||
// non-const method, all threads accessing the same WriteBatch must use
|
||||
// external synchronization.
|
||||
|
||||
#ifndef STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_
|
||||
#define STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_
|
||||
|
||||
#include <string>
|
||||
#include "rocksdb/status.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class Slice;
|
||||
class ColumnFamilyHandle;
|
||||
struct SliceParts;
|
||||
|
||||
// Holds a collection of updates to apply atomically to a DB; see the file
// header comment above for ordering and thread-safety guarantees.
class WriteBatch {
 public:
  // `reserved_bytes` pre-reserves capacity in the internal representation.
  explicit WriteBatch(size_t reserved_bytes = 0);
  ~WriteBatch();

  // Store the mapping "key->value" in the database.
  void Put(ColumnFamilyHandle* column_family, const Slice& key,
           const Slice& value);
  // Convenience overload targeting the default column family.
  void Put(const Slice& key, const Slice& value) {
    Put(nullptr, key, value);
  }

  // Variant of Put() that gathers output like writev(2). The key and value
  // that will be written to the database are concatenations of arrays of
  // slices.
  void Put(ColumnFamilyHandle* column_family, const SliceParts& key,
           const SliceParts& value);
  // Convenience overload targeting the default column family.
  void Put(const SliceParts& key, const SliceParts& value) {
    Put(nullptr, key, value);
  }

  // Merge "value" with the existing value of "key" in the database.
  // "key->merge(existing, value)"
  void Merge(ColumnFamilyHandle* column_family, const Slice& key,
             const Slice& value);
  // Convenience overload targeting the default column family.
  void Merge(const Slice& key, const Slice& value) {
    Merge(nullptr, key, value);
  }

  // If the database contains a mapping for "key", erase it. Else do nothing.
  void Delete(ColumnFamilyHandle* column_family, const Slice& key);
  void Delete(const Slice& key) { Delete(nullptr, key); }

  // Variant that takes SliceParts.
  void Delete(ColumnFamilyHandle* column_family, const SliceParts& key);
  void Delete(const SliceParts& key) { Delete(nullptr, key); }

  // Append a blob of arbitrary size to the records in this batch. The blob will
  // be stored in the transaction log but not in any other file. In particular,
  // it will not be persisted to the SST files. When iterating over this
  // WriteBatch, WriteBatch::Handler::LogData will be called with the contents
  // of the blob as it is encountered. Blobs, puts, deletes, and merges will be
  // encountered in the same order in which they were inserted. The blob will
  // NOT consume sequence number(s) and will NOT increase the count of the batch
  //
  // Example application: add timestamps to the transaction log for use in
  // replication.
  void PutLogData(const Slice& blob);

  // Clear all updates buffered in this batch.
  void Clear();

  // Support for iterating over the contents of a batch.
  class Handler {
   public:
    virtual ~Handler();
    // Default implementation calls Put without a column family for backwards
    // compatibility. For a non-default column family it returns
    // Status::InvalidArgument.
    virtual Status PutCF(uint32_t column_family_id, const Slice& key,
                         const Slice& value) {
      if (column_family_id == 0) {
        // Put() historically doesn't return status. We didn't want to be
        // backwards incompatible so we didn't change the return status
        // (this is a public API). We do an ordinary Put and return Status::OK()
        Put(key, value);
        return Status::OK();
      }
      return Status::InvalidArgument(
          "non-default column family and PutCF not implemented");
    }
    virtual void Put(const Slice& key, const Slice& value);
    // Merge and LogData are not pure virtual. Otherwise, we would break
    // existing clients of Handler on a source code level. The default
    // implementation of Merge simply throws a runtime exception.
    virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
                           const Slice& value) {
      if (column_family_id == 0) {
        Merge(key, value);
        return Status::OK();
      }
      return Status::InvalidArgument(
          "non-default column family and MergeCF not implemented");
    }
    virtual void Merge(const Slice& key, const Slice& value);
    // The default implementation of LogData does nothing.
    virtual void LogData(const Slice& blob);
    // Same default-column-family fallback pattern as PutCF/MergeCF.
    virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) {
      if (column_family_id == 0) {
        Delete(key);
        return Status::OK();
      }
      return Status::InvalidArgument(
          "non-default column family and DeleteCF not implemented");
    }
    virtual void Delete(const Slice& key);
    // Continue is called by WriteBatch::Iterate. If it returns false,
    // iteration is halted. Otherwise, it continues iterating. The default
    // implementation always returns true.
    virtual bool Continue();
  };
  // Invokes the handler's callbacks for each record in this batch, in order.
  Status Iterate(Handler* handler) const;

  // Retrieve the serialized version of this batch.
  const std::string& Data() const { return rep_; }

  // Retrieve data size of the batch.
  size_t GetDataSize() const { return rep_.size(); }

  // Returns the number of updates in the batch
  int Count() const;

  // Constructor with a serialized string object
  explicit WriteBatch(std::string rep): rep_(rep) {}

 private:
  friend class WriteBatchInternal;

  std::string rep_;  // See comment in write_batch.cc for the format of rep_

  // Intentionally copyable
};
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_
|
||||
12
include/utilities/backupable_db.h
Normal file
12
include/utilities/backupable_db.h
Normal file
@@ -0,0 +1,12 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#pragma once
// Deprecation shim: warns and forwards to the header's new location.
// (#warning is a GCC/Clang extension.)
#warning This file was moved to rocksdb/utilities/backupable_db.h
#include "rocksdb/utilities/backupable_db.h"
|
||||
8
include/utilities/db_ttl.h
Normal file
8
include/utilities/db_ttl.h
Normal file
@@ -0,0 +1,8 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#pragma once
// Deprecation shim: warns and forwards to the header's new location.
// (#warning is a GCC/Clang extension.)
#warning This file was moved to rocksdb/utilities/db_ttl.h
#include "rocksdb/utilities/db_ttl.h"
|
||||
8
include/utilities/document_db.h
Normal file
8
include/utilities/document_db.h
Normal file
@@ -0,0 +1,8 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#pragma once
// Deprecation shim: warns and forwards to the header's new location.
// (#warning is a GCC/Clang extension.)
#warning This file was moved to rocksdb/utilities/document_db.h
#include "rocksdb/utilities/document_db.h"
|
||||
8
include/utilities/geo_db.h
Normal file
8
include/utilities/geo_db.h
Normal file
@@ -0,0 +1,8 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#pragma once
// Deprecation shim: warns and forwards to the header's new location.
// (#warning is a GCC/Clang extension.)
#warning This file was moved to rocksdb/utilities/geo_db.h
#include "rocksdb/utilities/geo_db.h"
|
||||
7
include/utilities/json_document.h
Normal file
7
include/utilities/json_document.h
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
#pragma once
// Deprecation shim: warns and forwards to the header's new location.
// (#warning is a GCC/Clang extension.)
#warning This file was moved to rocksdb/utilities/json_document.h
#include "rocksdb/utilities/json_document.h"
|
||||
7
include/utilities/stackable_db.h
Normal file
7
include/utilities/stackable_db.h
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#pragma once
// Deprecation shim: warns and forwards to the header's new location.
// (#warning is a GCC/Clang extension.)
#warning This file was moved to rocksdb/utilities/stackable_db.h
#include "rocksdb/utilities/stackable_db.h"
|
||||
7
include/utilities/utility_db.h
Normal file
7
include/utilities/utility_db.h
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#pragma once
// Deprecation shim: warns and forwards to the header's new location.
// (#warning is a GCC/Clang extension.)
#warning This file was moved to rocksdb/utilities/utility_db.h
#include "rocksdb/utilities/utility_db.h"
|
||||
Reference in New Issue
Block a user